import numpy as np

%matplotlib nbagg
import matplotlib.pyplot as plt
from mpl_toolkits.mplot3d import Axes3D
from matplotlib.colors import LinearSegmentedColormap

def col_center(d):
    """Return ``d`` with the mean along axis 0 subtracted from every entry.

    For a 2-D array this removes each column's mean, so every column of
    the result averages to zero. The input is not modified.
    """
    column_means = np.mean(d, axis=0)
    return np.subtract(d, column_means)

def col_scale(d):
    """Return ``d`` divided by its norm taken along axis 0.

    For a 2-D array every column of the result has unit Euclidean length.
    A column whose norm is zero is divided by zero, so its entries come
    back as NaN/inf rather than being special-cased.
    """
    column_norms = np.linalg.norm(d, axis=0)
    return np.divide(d, column_norms)

def correlation_matrix(d):
    """Return the matrix of inner products between standardized columns of ``d``.

    Each column is first centered with ``col_center`` and then scaled to
    unit norm with ``col_scale``; entry (i, j) of the result is the dot
    product of standardized columns i and j.
    """
    centered = col_center(d)
    unit_columns = col_scale(centered)
    return unit_columns.T @ unit_columns

%cd ../data

/home/explorer/BMS270/data

!ls *.txt

GSE88801_kallisto_est_counts_thresh10.txt
GSE88801_kallisto_est_counts_thresh1.txt
Mus_musculus.GRCm38.79.ENSEMBL_names.txt

!head Mus_musculus.GRCm38.79.ENSEMBL_names.txt

!wget 'http://histo.ucsf.edu/BMS270/BMS270_2019/data/Mus_musculus.GRCm38.79.ENSEMBL_names.txt'

from csv import reader, excel_tab

# Build a lookup from the first column of the annotation table (gene ID) to
# the second column (gene name). The first line of the file is a header.
# The context manager guarantees the file is closed once parsing finishes;
# newline="" is what the csv module expects for files it reads.
with open('Mus_musculus.GRCm38.79.ENSEMBL_names.txt', newline='') as handle:
    fp = reader(handle, dialect=excel_tab)
    header = next(fp)
    # If an ID appears more than once, the last row wins.
    gene2name = {row[0]: row[1] for row in fp}

len(gene2name)

43629

list(gene2name.items())[:10]

[('ENSMUSG00000090897', 'Esp6Esp5'),
 ('ENSMUSG00000081711', 'Gm11785'),
 ('ENSMUSG00000021709', 'Erbb2ip'),
 ('ENSMUSG00000031224', 'Magee2'),
 ('ENSMUSG00000087203', 'Gm13986'),
 ('ENSMUSG00000097737', 'Gm26530'),
 ('ENSMUSG00000005148', 'Klf5'),
 ('ENSMUSG00000029439', 'Sfswap'),
 ('ENSMUSG00000077202', 'Gm25612'),
 ('ENSMUSG00000026797', 'Stxbp1')]

from csv import reader, excel_tab

# Parse the expression table: column 0 is the gene ID, column 1 is skipped,
# and every remaining column is a numeric value for one sample.
orfs = []   # gene IDs, one per data row
names = []  # gene names looked up through gene2name (KeyError if an ID is unknown)
data = []   # per-row lists of floats
# The context manager closes the file even if a row fails to parse.
with open("GSE88801_kallisto_TPMs_thresh10.cdt", newline="") as handle:
    fin = reader(handle, dialect=excel_tab)
    # Keep only the sample labels from the header line.
    header = next(fin)[2:]
    for row in fin:
        orfs.append(row[0])
        names.append(gene2name[row[0]])
        data.append([float(i) for i in row[2:]])

names[:10]

['Zranb2',
 'Lphn2',
 'Rpf1',
 'Ctbs',
 'Sap30bp',
 'Recql5',
 'Ahnak',
 'Nlrp1a',
 'Atp6v0d2',
 'Cdk10']

# Stack the parsed rows into a 2-D array: one row per gene, one column per sample.
D = np.asarray(data)
D.shape

(9939, 36)

# Subtract each column's mean from D; the shape is unchanged.
A = col_center(D)

A.shape

(9939, 36)

# Reduced SVD of the transposed centered matrix. Because A.T has one row per
# column of A, u is square with one row per column of A; s holds the
# singular values.
u,s,v = np.linalg.svd(A.T, full_matrices = False)
u.shape

(36, 36)

# Plot each squared singular value as a fraction of their total.
# Note: this rebinds v, discarding the matrix returned by np.linalg.svd.
v = np.square(s)
fig = plt.figure()
plt.plot(v / sum(v), "bo")

[<matplotlib.lines.Line2D at 0x7f8dc8a950f0>]

# Blue -> black -> yellow gradient: blue fades to zero over the lower half,
# then red and green rise together (yellow) over the upper half, so the
# two colors never overlap.
cdict = dict(
    red=((0., 0., 0.), (.5, 0., 0.), (1., 1., 1.)),
    green=((0., 0., 0.), (.5, 0., 0.), (1., 1., 1.)),
    blue=((0., 1., 1.), (.5, 0., 0.), (1., 0., 0.)),
)
# Sample the gradient into a 256-entry lookup table.
cmap_yb = LinearSegmentedColormap("yb", cdict, N=256)

# Heat map of the u matrix from the SVD, drawn without interpolation using
# the custom cmap_yb colormap.
fig = plt.figure()
plt.imshow(u, interpolation="none", aspect="auto",cmap=cmap_yb)

<matplotlib.image.AxesImage at 0x7f8dc89fca58>

# Overlay the first three columns of u as line plots, one legend entry each.
fig = plt.figure()
for column, tag in enumerate(("PC1", "PC2", "PC3")):
    plt.plot(u[:, column], label=tag)
plt.legend()

<matplotlib.legend.Legend at 0x7f8dc89d8d68>

# Project every row of A onto the columns of u; column k of P holds the
# rows' coordinates along the k-th column of u.
P = A @ u
P.shape

(9939, 36)

P.shape

(9939, 36)

# Scatter the rows of P by their first two coordinates ("k," = black pixel markers).
fig = plt.figure()
plt.plot(P[:,0],P[:,1],"k,")

[<matplotlib.lines.Line2D at 0x7f8dc8a19cc0>]

# Double centering: the inner call centers the columns of D.T (i.e. the rows
# of D), and the outer call then centers the columns of that result.
A = col_center(col_center(D.T).T)
# Reduced SVD of the transposed double-centered matrix; this overwrites the
# u, s and v computed for the column-centered data.
u,s,v = np.linalg.svd(A.T, full_matrices = False)
u.shape

(36, 36)

# Plot each squared singular value as a fraction of their total.
# Note: this rebinds v, discarding the matrix returned by np.linalg.svd.
v = np.square(s)
fig = plt.figure()
plt.plot(v / sum(v), "bo")

[<matplotlib.lines.Line2D at 0x7f8dc8d9bdd8>]

# Heat map of the u matrix from the SVD, drawn without interpolation using
# the custom cmap_yb colormap.
fig = plt.figure()
plt.imshow(u, interpolation="none", aspect="auto",cmap=cmap_yb)

<matplotlib.image.AxesImage at 0x7f8dc8d6c320>

# Overlay the first three columns of u as line plots, one legend entry each.
fig = plt.figure()
for column, tag in enumerate(("PC1", "PC2", "PC3")):
    plt.plot(u[:, column], label=tag)
plt.legend()

<matplotlib.legend.Legend at 0x7f8dc8cd0588>

# Project every row of A onto the columns of u; column k of P holds the
# rows' coordinates along the k-th column of u.
P = A @ u
P.shape

(9939, 36)

# Scatter the rows of P by their first two coordinates ("k," = black pixel markers).
fig = plt.figure()
plt.plot(P[:,0],P[:,1],"k,")

[<matplotlib.lines.Line2D at 0x7f8dc8c39438>]

# Boolean mask over the rows of A: True where the row's largest value is at
# least 1.
# NOTE(review): only positive deviations pass this cut; if large negative
# values should also qualify, the test would need abs(A) -- confirm intent.
cut2x = A.max(axis=1) >= 1

# Keep only the rows selected by the mask.
Acut2x = A[cut2x]
Acut2x.shape

(4872, 36)

# Apply the same row mask to the projections so Pcut2x stays aligned with Acut2x.
Pcut2x = P[cut2x]

# Scatter the retained rows by their first two projected coordinates.
fig = plt.figure()
plt.plot(Pcut2x[:,0],Pcut2x[:,1],"k,")

[<matplotlib.lines.Line2D at 0x7f8dc8c00cc0>]

import Bio.Cluster as Pycluster

%%time
# Hierarchically cluster the rows of Acut2x; the resulting tree is handed to
# record.save() further down.
# NOTE(review): per the Bio.Cluster docs, dist="u" should be uncentered
# correlation and method="m" maximum (complete) linkage -- confirm.
tree = Pycluster.treecluster(Acut2x, dist="u", method="m")

CPU times: user 21.2 s, sys: 168 ms, total: 21.3 s
Wall time: 21.8 s

Acut2x.shape,Pcut2x.shape,np.hstack((Acut2x,Pcut2x)).shape

((4872, 36), (4872, 36), (4872, 72))

# Bundle the filtered values and their projections side by side into a
# Bio.Cluster record and save it together with the gene tree.
record = Pycluster.Record()
record.data = np.hstack((Acut2x,Pcut2x))
# orfs and names have one entry per row of the unfiltered matrix, while
# record.data only holds the rows selected by cut2x. Apply the same mask to
# the labels so every saved row carries its own gene ID and name.
record.geneid = [orf for orf, keep in zip(orfs, cut2x) if keep]
record.genename = [name for name, keep in zip(names, cut2x) if keep]
record.gweight = None
record.gorder = None
# Column labels: the original sample labels followed by one "PCnn" label
# (numbered from 00) for each projection column.
record.expid = header[:]+["PC%02d" % i for i in range(len(header))]
record.eweight = None
record.eorder = None
record.uniqid = "UNIQID"
record.save("PCA_clustering_example1.um", geneclusters = tree)

# Re-open the clustered CDT written by record.save(). fp is deliberately left
# open because the following cells keep reading from it line by line.
fp = open("PCA_clustering_example1.um.cdt")
# First line holds the column names. This rebinds header, which previously
# held the sample labels; the last element keeps its trailing newline.
header = next(fp).split("\t")
header[:10]

['GID',
 'UNIQID',
 'NAME',
 'GWEIGHT',
 'BMDM_Live_1_4h',
 'BMDM_Live_1_24h',
 'BMDM_Live_2_4h',
 'BMDM_Live_2_24h',
 'BMDM_Live_3_4h',
 'BMDM_Live_3_24h']

# Start the annotated copy: same header with a new "rank" column inserted
# after the first three columns. The line's newline comes from the last
# element of header.
out = open("PCA_clustering_example1.um.annotated.cdt","w")
out.write("\t".join(header[:3]+["rank"]+header[3:]))

839

# Second line of the CDT: the EWEIGHT row, with one weight per data column.
eweights = next(fp).split("\t")
eweights[:10]

['EWEIGHT',
 '',
 '',
 '',
 '1.000000',
 '1.000000',
 '1.000000',
 '1.000000',
 '1.000000',
 '1.000000']

# Write the EWEIGHT row (not the header a second time), leaving an empty
# cell in the position of the inserted "rank" column.
out.write("\t".join(eweights[:3]+[""]+eweights[3:]))

835

# Copy every remaining line, inserting its zero-based position in the file
# order as a new fourth field. The trailing newline rides along on the last
# field of each row.
for n, line in enumerate(fp):
    row = line.split("\t")
    out.write("\t".join([*row[:3], str(n), *row[3:]]))

# Flush and close the annotated output, and also release the input handle
# that was opened for reading the clustered CDT.
out.close()
fp.close()