from urllib.request import urlopen
# Peek at the remote CDT file: read (and discard) its first two lines.
# The ``with`` block closes the HTTP response even if a read raises, which the
# previous explicit ``fp.close()`` did not guarantee.
with urlopen("http://histo.ucsf.edu/BMS270/BMS270_2018/data/supp2data.cdt") as fp:
    fp.readline()
    fp.readline()
from cdt_reader import parse_cdt
from stats import pearson
# Load the local copy of the CDT file into four parallel structures.
# NOTE(review): the second argument (0.0) is presumably the value substituted
# for missing measurements -- confirm against cdt_reader.parse_cdt.
colnames, genes, annotations, data = parse_cdt("supp2data.cdt", 0.0)
def dist_matrix(data):
    """Return the all-against-all ``pearson`` matrix for the rows of *data*.

    Entry ``[a][b]`` of the result is ``pearson(data[a], data[b])``; every
    pair is evaluated, including the diagonal and both orderings of each
    pair. The values are stored exactly as ``pearson`` returns them, so
    despite the function name no correlation-to-distance conversion happens
    here.

    Args:
        data: Sequence of rows, each acceptable as an argument to
            ``pearson``. It is iterated once per output row.

    Returns:
        A list of lists with one inner list per row of *data*.
    """
    return [[pearson(row_a, row_b) for row_b in data] for row_a in data]
# Quick check on the first three rows only; the result is not assigned
# (in a notebook cell it would simply be echoed).
dist_matrix(data[:3])
def dist_matrix2(data):
    """Return the all-against-all ``pearson`` matrix for the rows of *data*.

    Explicit-loop variant: entry ``[a][b]`` of the result is
    ``pearson(data[a], data[b])``.

    Args:
        data: Sequence of rows, each acceptable as an argument to
            ``pearson``. It is iterated once per output row.

    Returns:
        A list of independent lists, one per row of *data*, each holding
        ``len(data)`` values.
    """
    matrix = []
    for i in data:
        # A fresh list is required for every outer iteration. Creating
        # ``row`` once, before this loop, would make every element of
        # ``matrix`` a reference to the same ever-growing list.
        row = []
        for j in data:
            row.append(pearson(i, j))
        matrix.append(row)
    return matrix
# Run dist_matrix2 on the first three rows; this first result is not assigned.
dist_matrix2(data[:3])
# Run it again, keep the result as d, and print it one row per line so the
# structure of the returned matrix is easy to inspect.
d = dist_matrix2(data[:3])
for i in d:
    print(i)
# Demonstration of list aliasing: appending a list stores a reference to it,
# not a copy.
rows = []
a = [1,2,3]
rows.append(a)  # rows[0] is the very same object as a
a += [4,5,6]  # in-place extend: a (and therefore rows[0]) now has six items
rows.append(a)  # rows[1] is that same object again
print(rows)  # both entries print as [1, 2, 3, 4, 5, 6]
a[3] = "spam"  # mutating a is visible through both entries of rows
rows
# Same sequence as the aliasing demonstration, but a[:] appends a shallow copy
# (a snapshot) each time, so later changes to a do not affect earlier entries.
rows = []
a = [1,2,3]
rows.append(a[:])  # snapshot of [1, 2, 3]
a += [4,5,6]  # extends a only; the snapshot already stored is untouched
rows.append(a[:])  # snapshot of [1, 2, 3, 4, 5, 6]
print(rows)  # [[1, 2, 3], [1, 2, 3, 4, 5, 6]]
def write_correlation_cdt(genes, annotations, matrix, fname):
    """Write a gene-by-gene matrix to *fname* as tab-delimited text.

    The first line is a header made of ``ORF``, ``NAME`` and then every entry
    of *genes*. Each following line holds one gene id, its annotation and the
    ``str()`` of every value in the corresponding row of *matrix*.

    Args:
        genes: Gene identifiers (strings); used both as column headers and
            as the first field of each data line.
        annotations: One annotation string per gene.
        matrix: One row of values per gene.
        fname: Path of the file to create or overwrite.

    Returns:
        None.

    Note:
        Rows are paired with ``zip``, so output stops at the shortest of
        *genes*, *annotations* and *matrix*.
    """
    # The context manager closes the file even if a write or str() raises.
    with open(fname, "w") as out:
        out.write("\t".join(["ORF", "NAME", *genes]) + "\n")
        for gene, anno, row in zip(genes, annotations, matrix):
            out.write("\t".join([gene, anno, *map(str, row)]) + "\n")
# Build the all-against-all matrix for the first 100 rows only, then write it
# to corr.cdt together with the matching 100 gene ids and annotations.
matrix = dist_matrix(data[:100])
write_correlation_cdt(genes[:100],annotations[:100],matrix,"corr.cdt")
import Bio.Cluster as Pycluster
# IPython line magic (not valid in plain Python): lists the names currently
# defined in the interactive namespace.
%who
import numpy as np
# Convert the parsed rows to a NumPy array for Bio.Cluster. This rebinds d,
# which earlier held the output of dist_matrix2.
d = np.array(data)
# Bare expressions below are notebook-style inspection of the array, its
# dimensions and its element type; they have no effect in a plain script.
d
d.shape
d.dtype
# %%time is an IPython cell magic that reports how long the cell takes; it is
# not valid in a plain Python script.
%%time
# Pairwise distances between the rows of d. Per the Bio.Cluster documentation,
# dist="u" selects the uncentered-correlation distance. Note that the result
# is bound to a variable that shares its name with the keyword argument.
dist = Pycluster.distancematrix(d, dist = "u")
%%time
# Hierarchical clustering from the precomputed distance matrix. Per the
# Bio.Cluster documentation, method="m" selects pairwise maximum (complete)
# linkage.
# NOTE(review): some Biopython releases appear to require the data argument
# positionally (pass None when supplying distancematrix) -- confirm against
# the installed version.
tree = Pycluster.treecluster(distancematrix = dist, method = "m")
# Assemble a Bio.Cluster Record by hand so the clustering result can be saved.
record = Pycluster.Record()
# NOTE(review): the original comment here read 'Restore "None" values', but
# nothing below restores missing values or sets a mask; d is stored as-is.
# Confirm whether a mask for missing data was intended.
record.data = d
record.geneid = genes
record.genename = annotations
record.gweight = None
record.gorder = None
record.expid = colnames
# One weight of 1.0 per column name.
record.eweight = [1.]*len(colnames)
record.eorder = None
record.uniqid = "UNIQID"
# Save under the job name "clustered1_um" with the gene tree attached; the
# "um" suffix matches the dist="u" / method="m" settings used to build tree.
record.save("clustered1_um", geneclusters = tree)
# IPython shell escape (not valid in plain Python): show the most recently
# modified files in the working directory to check what was written.
!ls -lhrt | tail