Goal: Explore different clustering methods on the Eisen yeast expression profiling data set
In addition, make various random permutations of the original data matrix to:
Load our tools from previous sessions
from cdt_reader import parse_cdt
Load 3rd party clustering and array tools
import Bio.Cluster as Pycluster
import numpy as np
Load the data
(colnames,genes,annotations,data) = parse_cdt("supp2data.cdt")
Functions for handling missing data
def toFloat(x):
"""Coerce x to a floating point value by substituting 0.0 for None"""
if(x == None):
return 0.0
assert(type(x) == float)
return x
def toMask(x):
"""Return 1 (unmasked) for floating point x or 0 (masked) for missing values"""
if(x == None):
return 0
return 1
def toEmpty(x):
"""Return floating point x or empty string for missing values"""
if(x == None):
return ""
assert(type(x) == float)
return x
Factor the data as an "all floats" array, d, for clustering + a "mask" array, m, to indicate the locations of missing values
rows = []
for i in data:
row = []
for j in i:
row.append(toFloat(j))
rows.append(row)
d = np.array(rows)
d.shape, d.dtype
rows = []
for i in data:
row = []
for j in i:
row.append(toMask(j))
rows.append(row)
m = np.array(rows)
m.shape, m.dtype
Additionally, make another copy of the data as array D for writing the CDT files, this time coercing missing values to empty strings.
rows = []
for i in data:
row = []
for j in i:
row.append(toEmpty(j))
rows.append(row)
D = np.array(rows)
D.shape, D.dtype
Set up a single Pycluster.Record for writing the output files
record = Pycluster.Record()
record.data = D
record.geneid = genes
record.genename = annotations
record.gweight = None
record.gorder = None
record.expid = colnames
record.eweight = [1.]*len(colnames)
record.eorder = None
record.uniqid = "UNIQID"
Do hierarchical clustering for all 9 combinations of three distance functions and three linkage methods.
%%time
for distance in "cue":
for method in "msa":
dist = Pycluster.distancematrix(d, dist = distance)
tree = Pycluster.treecluster(distancematrix = dist, method = method)
record.save("supp2data."+distance+method, geneclusters = tree)
from random import seed, shuffle
seed(42)
L = ["eggs","spam","green eggs","ham"]
shuffle(L)
L
L = ["eggs","spam","green eggs","ham"]
s = set(L)
L2 = list(s)
L2
L = ["eggs","spam","green eggs","ham"]
M = [70,12,13,14]
indices = list(range(len(L)))
indices
shuffle(indices)
indices
L2 = []
M2 = []
for i in indices:
L2.append(L[i])
M2.append(M[i])
print(L2)
print(M2)
gene_indices = list(range(len(genes)))
shuffle(gene_indices)
genes2 = []
annotations2 = []
for i in gene_indices:
genes2.append(genes[i])
annotations2.append(annotations[i])
gene_indices[:10]
d2 = d[gene_indices,:]
d2.shape
m2 = m[gene_indices,:]
D2 = D[gene_indices,:]
record = Pycluster.Record()
record.data = D2
record.geneid = genes2
record.genename = annotations2
record.gweight = None
record.gorder = None
record.expid = colnames
record.eweight = [1.]*len(colnames)
record.eorder = None
record.uniqid = "UNIQID"
record.save("supp2data.row_shuffled")
%%time
dist = Pycluster.distancematrix(d2, dist = "u")
tree = Pycluster.treecluster(distancematrix = dist, method = "m")
record.save("supp2data.row_shuffled.um", geneclusters = tree)
mydict = {"hi":4,"seventy":"eggs",3:42}
mydict["hi"]
mydict["seventy"]
!ls *.cdt
corr = open("correlation.cdt").readlines()
len(corr)
corr[1][:100]
uid2corr = {}
for line in corr[1:]:
uid2corr[line.split("\t")[0]] = line
len(uid2corr)
uid2corr["YBR166C"][:100]
fp = open("supp2data.um.cdt")
print(fp.readline()[:100])
print(fp.readline()[:100])
print(fp.readline()[:100])
fp.close()
fp = open("correlation.reordered.cdt","w")
fp.write(corr[0])
fp2 = open("supp2data.um.cdt")
header = fp2.readline()
eweights = fp2.readline()
for line in fp2:
uid = line.split("\t")[1]
fp.write(uid2corr[uid])
fp.close()
fp2.close()
try:
uid2corr["hydrolase"]
except KeyError:
print(None)