import numpy as np
%matplotlib nbagg
import matplotlib.pyplot as plt
from mpl_toolkits.mplot3d import Axes3D
def col_center(d):
mean = np.mean(d, axis = 0)
return d - mean
%cd ../data
from csv import reader, excel_tab
orfs = []
names = []
data = []
fin = reader(open("GSE88801_kallisto_TPMs_thresh10.cdt"),dialect=excel_tab)
header = next(fin)[2:]
for row in fin:
orfs.append(row[0])
names.append(row[1])
data.append([float(i) for i in row[2:]])
D = np.array(data)
D.shape
A = col_center(D)
u,s,v = np.linalg.svd(A.T, full_matrices = False)
u.shape
v = s**2
fig = plt.figure()
plt.plot(v/sum(v),"bo")
P = np.dot(A,u)
P.shape
fig = plt.figure()
plt.plot(P[:,0],P[:,1],"k,")
from matplotlib.colors import LinearSegmentedColormap
# Simple blue->yellow non-overlapping gradient
cdict = {"red":((0.,0.,0.),(.5,0.,0.),(1.,1.,1.)),
"green":((0.,0.,0.),(.5,0.,0.),(1.,1.,1.)),
"blue":((0.,1.,1.),(.5,0.,0.),(1.,0.,0.))}
# map gradient to 256 actual RGBA values
cmap_yb = LinearSegmentedColormap("yb",cdict,256)
fig = plt.figure()
plt.imshow(u, interpolation="none", aspect="auto",cmap=cmap_yb,vmin=-.2,vmax=.2)
A = col_center(col_center(D.T).T)
u,s,v = np.linalg.svd(A.T, full_matrices = False)
u.shape
P = np.dot(A,u)
v = s**2
fig = plt.figure()
plt.plot(v/sum(v),"bo")
fig = plt.figure()
plt.imshow(u, interpolation="none", aspect="auto",cmap=cmap_yb,vmin=-.2,vmax=.2)
fig = plt.figure()
plt.plot(P[:,0],P[:,1],"k,")
fig = plt.figure()
plt.plot(u[:,0],label="PC1")
plt.plot(u[:,1],label="PC2")
plt.plot(u[:,2],label="PC3")
plt.legend()
cut2x = np.max(A,axis=1) >= 1
Acut2x = A[cut2x]
Acut2x.shape
Pcut2x = P[cut2x]
fig = plt.figure()
plt.plot(Pcut2x[:,0],Pcut2x[:,1],"k,")
import Bio.Cluster as Pycluster
%%time
tree = Pycluster.treecluster(Acut2x, dist="u", method="m")
record = Pycluster.Record()
record.data = np.hstack((Acut2x,Pcut2x))
record.geneid = orfs[:]
record.genename = names[:]
record.gweight = None
record.gorder = None
record.expid = header[:]+["PC%02d" % i for i in range(len(header))]
record.eweight = None
record.eorder = None
record.uniqid = "UNIQID"
record.save("PCA_clustering_example1.um", geneclusters = tree)
!ls *exammple*
!ls *example*
fp = open("PCA_clustering_example1.um.cdt")
header = next(fp).split("\t")
header[:10]
out = open("PCA_clustering_example1.um.annotated.cdt","w")
out.write("\t".join(header[:3]+["rank"]+header[3:]))
eweights = next(fp).split("\t")
eweights[:10]
out.write("\t".join(header[:3]+[""]+header[3:]))
for (n,line) in enumerate(fp):
row = line.split("\t")
out.write("\t".join(row[:3]+[str(n)]+row[3:]))
out.close()
x = [[0,2],[2,3],[-1,5]]
x
def mykey(i):
return i[1]
y = sorted(x, key = mykey)
y
def mykey(i):
return i[0]
y = sorted(x, key = mykey)
y