import numpy as np
%matplotlib nbagg
import matplotlib.pyplot as plt
from mpl_toolkits.mplot3d import Axes3D
from matplotlib.colors import LinearSegmentedColormap
def col_center(d):
mean = np.mean(d, axis = 0)
return d - mean
def col_scale(d):
norm = np.linalg.norm(d, axis = 0)
return d/norm
def correlation_matrix(d):
scaled = col_scale(col_center(d))
return np.dot(scaled.T,scaled)
%cd ../data
!ls *.txt
!head Mus_musculus.GRCm38.79.ENSEMBL_names.txt
!wget 'http://histo.ucsf.edu/BMS270/BMS270_2019/data/Mus_musculus.GRCm38.79.ENSEMBL_names.txt'
from csv import reader, excel_tab
gene2name = {}
fp = reader(open('Mus_musculus.GRCm38.79.ENSEMBL_names.txt'),dialect=excel_tab)
header = next(fp)
for row in fp:
gene2name[row[0]] = row[1]
len(gene2name)
list(gene2name.items())[:10]
from csv import reader, excel_tab
orfs = []
names = []
data = []
fin = reader(open("GSE88801_kallisto_TPMs_thresh10.cdt"),dialect=excel_tab)
header = next(fin)[2:]
for row in fin:
orfs.append(row[0])
names.append(gene2name[row[0]])
data.append([float(i) for i in row[2:]])
names[:10]
D = np.array(data)
D.shape
A = col_center(D)
A.shape
u,s,v = np.linalg.svd(A.T, full_matrices = False)
u.shape
v = s**2
fig = plt.figure()
plt.plot(v/sum(v),"bo")
# Simple blue->yellow non-overlapping gradient
cdict = {"red":((0.,0.,0.),(.5,0.,0.),(1.,1.,1.)),
"green":((0.,0.,0.),(.5,0.,0.),(1.,1.,1.)),
"blue":((0.,1.,1.),(.5,0.,0.),(1.,0.,0.))}
# map gradient to 256 actual RGBA values
cmap_yb = LinearSegmentedColormap("yb",cdict,256)
fig = plt.figure()
plt.imshow(u, interpolation="none", aspect="auto",cmap=cmap_yb)
fig = plt.figure()
plt.plot(u[:,0],label="PC1")
plt.plot(u[:,1],label="PC2")
plt.plot(u[:,2],label="PC3")
plt.legend()
P = np.dot(A,u)
P.shape
P.shape
fig = plt.figure()
plt.plot(P[:,0],P[:,1],"k,")
A = col_center(col_center(D.T).T)
u,s,v = np.linalg.svd(A.T, full_matrices = False)
u.shape
v = s**2
fig = plt.figure()
plt.plot(v/sum(v),"bo")
fig = plt.figure()
plt.imshow(u, interpolation="none", aspect="auto",cmap=cmap_yb)
fig = plt.figure()
plt.plot(u[:,0],label="PC1")
plt.plot(u[:,1],label="PC2")
plt.plot(u[:,2],label="PC3")
plt.legend()
P = np.dot(A,u)
P.shape
fig = plt.figure()
plt.plot(P[:,0],P[:,1],"k,")
cut2x = np.max(A,axis=1) >= 1
Acut2x = A[cut2x]
Acut2x.shape
Pcut2x = P[cut2x]
fig = plt.figure()
plt.plot(Pcut2x[:,0],Pcut2x[:,1],"k,")
import Bio.Cluster as Pycluster
%%time
tree = Pycluster.treecluster(Acut2x, dist="u", method="m")
Acut2x.shape,Pcut2x.shape,np.hstack((Acut2x,Pcut2x)).shape
record = Pycluster.Record()
record.data = np.hstack((Acut2x,Pcut2x))
record.geneid = orfs[:]
record.genename = names[:]
record.gweight = None
record.gorder = None
record.expid = header[:]+["PC%02d" % i for i in range(len(header))]
record.eweight = None
record.eorder = None
record.uniqid = "UNIQID"
record.save("PCA_clustering_example1.um", geneclusters = tree)
fp = open("PCA_clustering_example1.um.cdt")
header = next(fp).split("\t")
header[:10]
out = open("PCA_clustering_example1.um.annotated.cdt","w")
out.write("\t".join(header[:3]+["rank"]+header[3:]))
eweights = next(fp).split("\t")
eweights[:10]
out.write("\t".join(header[:3]+[""]+header[3:]))
for (n,line) in enumerate(fp):
row = line.split("\t")
out.write("\t".join(row[:3]+[str(n)]+row[3:]))
out.close()