Goal: Explore different clustering methods on the Eisen yeast expression profiling data set

In addition, make various random permutations of the original data matrix to:

As a pre-processing step, remove the patterns due to Mike Eisen's original clustering of this data
As a negative control, remove some or all of the signal in the data

Load our tools from previous sessions

from cdt_reader import parse_cdt

Load 3rd party clustering and array tools

import Bio.Cluster as Pycluster
import numpy as np

Load the data

(colnames,genes,annotations,data) = parse_cdt("supp2data.cdt")

Functions for handling missing data

def toFloat(x):
    """Coerce x to a floating point value by substituting 0.0 for None"""
    if(x == None):
        return 0.0
    assert(type(x) == float)
    return x

def toMask(x):
    """Return 1 (unmasked) for floating point x or 0 (masked) for missing values"""
    if(x == None):
        return 0
    return 1

def toEmpty(x):
    """Return floating point x or empty string for missing values"""
    if(x == None):
        return ""
    assert(type(x) == float)
    return x

Factor the data as an "all floats" array, d, for clustering + a "mask" array, m, to indicate the locations of missing values

rows = []
for i in data:
    row = []
    for j in i:
        row.append(toFloat(j))
    rows.append(row)
d = np.array(rows)
d.shape, d.dtype

((2467, 79), dtype('float64'))

rows = []
for i in data:
    row = []
    for j in i:
        row.append(toMask(j))
    rows.append(row)
m = np.array(rows)
m.shape, m.dtype

((2467, 79), dtype('int64'))

Additionally, make another copy of the data as array D for writing the CDT files, this time coercing missing values to empty strings.

rows = []
for i in data:
    row = []
    for j in i:
        row.append(toEmpty(j))
    rows.append(row)
D = np.array(rows)
D.shape, D.dtype

((2467, 79), dtype('<U32'))

Set up a single Pycluster.Record for writing the output files

record = Pycluster.Record()
record.data = D
record.geneid = genes
record.genename = annotations
record.gweight = None
record.gorder = None
record.expid = colnames
record.eweight = [1.]*len(colnames)
record.eorder = None
record.uniqid = "UNIQID"

Do hierarchical clustering for all 9 combinations of three distance functions and three linkage methods.

%%time
for distance in "cue":
    for method in "msa":
        dist = Pycluster.distancematrix(d, dist = distance)
        tree = Pycluster.treecluster(distancematrix = dist, method = method)
        record.save("supp2data."+distance+method, geneclusters = tree)

CPU times: user 1min 9s, sys: 40 ms, total: 1min 9s
Wall time: 1min 9s

from random import seed, shuffle

seed(42)

L = ["eggs","spam","green eggs","ham"]

shuffle(L)

L

['green eggs', 'spam', 'ham', 'eggs']

L = ["eggs","spam","green eggs","ham"]
s = set(L)
L2 = list(s)
L2

['spam', 'green eggs', 'ham', 'eggs']

L = ["eggs","spam","green eggs","ham"]

M = [70,12,13,14]

indices = list(range(len(L)))
indices

[0, 1, 2, 3]

shuffle(indices)

indices

[3, 2, 0, 1]

L2 = []
M2 = []
for i in indices:
    L2.append(L[i])
    M2.append(M[i])
print(L2)
print(M2)

['ham', 'green eggs', 'eggs', 'spam']
[14, 13, 70, 12]

gene_indices = list(range(len(genes)))
shuffle(gene_indices)

genes2 = []
annotations2 = []
for i in gene_indices:
    genes2.append(genes[i])
    annotations2.append(annotations[i])

gene_indices[:10]

[1314, 476, 889, 2410, 1612, 2043, 2439, 2194, 1618, 2025]

d2 = d[gene_indices,:]

d2.shape

(2467, 79)

m2 = m[gene_indices,:]
D2 = D[gene_indices,:]

record = Pycluster.Record()
record.data = D2
record.geneid = genes2
record.genename = annotations2
record.gweight = None
record.gorder = None
record.expid = colnames
record.eweight = [1.]*len(colnames)
record.eorder = None
record.uniqid = "UNIQID"

record.save("supp2data.row_shuffled")

%%time
dist = Pycluster.distancematrix(d2, dist = "u")
tree = Pycluster.treecluster(distancematrix = dist, method = "m")
record.save("supp2data.row_shuffled.um", geneclusters = tree)

CPU times: user 6.66 s, sys: 12 ms, total: 6.68 s
Wall time: 6.67 s

mydict = {"hi":4,"seventy":"eggs",3:42}

mydict["hi"]

4

mydict["seventy"]

'eggs'

!ls *.cdt

clustered1_um.cdt	      supp2data.ea.cdt
corr.cdt		      supp2data.em.cdt
correlation.cdt		      supp2data.es.cdt
EuclideanCluster_average.cdt  supp2data.row_shuffled.cdt
EuclideanCluster_max.cdt      supp2data.row_shuffled.cdt.cdt
EuclideanCluster_single.cdt   supp2data.row_shuffled.um.cdt
PearsonCluster_average.cdt    supp2data.ua.cdt
PearsonCluster_max.cdt	      supp2data.um.cdt
PearsonCluster_single.cdt     supp2data.us.cdt
supp2data.ca.cdt	      UncenteredPearsonCluster_average.cdt
supp2data.cdt		      UncenteredPearsonCluster_max.cdt
supp2data.cm.cdt	      UncenteredPearsonCluster_single.cdt
supp2data.cs.cdt

corr = open("correlation.cdt").readlines()
len(corr)

2468

corr[1][:100]

'YBR166C\tTYR1   TYROSINE BIOSYNTHESIS    PREPHENATE DEHYDROGENASE (NADP+\t1.0\t0.12164363213592637\t0.35'

uid2corr = {}
for line in corr[1:]:
    uid2corr[line.split("\t")[0]] = line
len(uid2corr)

2467

uid2corr["YBR166C"][:100]

'YBR166C\tTYR1   TYROSINE BIOSYNTHESIS    PREPHENATE DEHYDROGENASE (NADP+\t1.0\t0.12164363213592637\t0.35'

fp = open("supp2data.um.cdt")
print(fp.readline()[:100])
print(fp.readline()[:100])
print(fp.readline()[:100])
fp.close()

GID	UNIQID	NAME	GWEIGHT	alpha 0	alpha 7	alpha 14	alpha 21	alpha 28	alpha 35	alpha 42	alpha 49	alpha 
EWEIGHT				1.000000	1.000000	1.000000	1.000000	1.000000	1.000000	1.000000	1.000000	1.000000	1.000000
GENE17X	YDR311W	TFB1   TRANSCRIPTION       TFIIH 75 KD SUBUNIT	1.000000	0.14	-0.23	-0.04	-0.07	0.28

fp = open("correlation.reordered.cdt","w")
fp.write(corr[0])
fp2 = open("supp2data.um.cdt")
header = fp2.readline()
eweights = fp2.readline()
for line in fp2:
    uid = line.split("\t")[1]
    fp.write(uid2corr[uid])
fp.close()
fp2.close()

try:
    uid2corr["hydrolase"]
except KeyError:
    print(None)

None