Goal: PCA of gene abundances in bone marrow derived macrophages (BMDMs) or an analogous cell line (J774s) at two time points after infection with either live Mycobacterium tuberculosis (Mtb), dead Mtb, or mock, based on kallisto estimates from the raw reads of Sci Rep 7:42225 (GSE88801)
import numpy as np
Set up plotting, including 3D plotting
%matplotlib nbagg
import matplotlib.pyplot as plt
from mpl_toolkits.mplot3d import Axes3D
Array transforms from yesterday
def col_center(d):
mean = np.mean(d, axis = 0)
return d - mean
def col_scale(d):
norm = np.linalg.norm(d, axis = 0)
return d/norm
def correlation_matrix(d):
scaled = col_scale(col_center(d))
return np.dot(scaled.T,scaled)
Load gene by sample matrix of $\log_2(TPM)$ values
%cd ../data/
from csv import reader, excel_tab
orfs = []
names = []
data = []
fin = reader(open("GSE88801_kallisto_TPMs_thresh10.cdt"),dialect=excel_tab)
header = next(fin)[2:]
for row in fin:
orfs.append(row[0])
names.append(row[1])
data.append([float(i) for i in row[2:]])
D = np.array(data)
D.shape
fig = plt.figure()
plt.imshow(D, interpolation = "none", aspect = "auto")
plt.colorbar()
print(header)
P = D[:,(0,1)].T
P.shape
fig = plt.figure()
plt.plot(P[0],P[1],"k,")
Remove points exactly at the origin to avoid divide by zero
z = (np.linalg.norm(P, axis = 0) != 0.)
z.shape,z[0]
P = P[:,z]
P.shape
fig = plt.figure()
plt.plot(P[0],P[1],"k,")
C = correlation_matrix(P.T)
C.shape
fig = plt.figure()
plt.imshow(C, aspect = "auto", vmin = -1, vmax = 1)
plt.colorbar()
s2 = 1/np.sqrt(2)
R = np.array([[s2,s2],[-s2,s2]])
PR = np.dot(R,P)
PR.shape
fig = plt.figure()
plt.plot(PR[0],PR[1],"k,")
a = (P[0]+P[1])/2. # average (geometric mean)
m = (P[1]-P[0]) # difference
fig = plt.figure()
plt.plot(a,m,"k,")
AM = np.vstack((a,m))
AM.shape
C = correlation_matrix(AM.T)
fig = plt.figure()
plt.imshow(C, aspect = "auto", vmin = -1, vmax = 1)
plt.colorbar()
(eigenvalues, eigenvectors) = np.linalg.eigh(correlation_matrix(P.T))
print(eigenvectors)
print(eigenvalues)
np.sqrt(1./2)
P2 = np.dot(eigenvectors,P)
P2.shape
fig = plt.figure()
plt.plot(P2[0],P2[1],"k,")
fig = plt.figure()
plt.plot(-P2[1],P2[0],"k,")
u,s,v = np.linalg.svd(col_center(P.T).T, full_matrices=False)
print(u)
print(s)
v = s**2
v/sum(v)
eigenvalues/sum(eigenvalues)
A = col_center(D.T)
u,s,v = np.linalg.svd(A.T, full_matrices = False)
u.shape
v = s**2
fig = plt.figure()
plt.plot(v/sum(v),"bo")
projected = np.dot(A,u)
projected.shape
fig = plt.figure()
ax = fig.add_subplot(111, projection = "3d")
ax.plot(projected[:,0],projected[:,1],projected[:,2],"bo")
ax.set_xlabel("PC1")
ax.set_ylabel("PC2")
ax.set_zlabel("PC3")
d={"a":3,5:"b","hello":[1,2,3]}
d["hello"]
d["hello"] = 5
d["hello"]
d[3]=7
colors = {"BMDM":{"Live":"#ff0000", # red
"Dead":"#00ff00", # green
"uninfected":"#0000ff"}, # blue
"J774":{"Live":"#ffaaaa",
"Dead":"#aaffaa",
"uninfected":"#aaaaff"}}
markers = {"4h":"^","24h":"o"}
colors = {"BMDM":{"Live":"red",
"Dead":"green",
"uninfected":"blue"},
"J774":{"Live":"magenta",
"Dead":"orange",
"uninfected":"cyan"}}
markers = {"4h":"^","24h":"o"}
list(enumerate(header))
fig = plt.figure()
ax = fig.add_subplot(111, projection = "3d")
ax.plot(projected[:,0],projected[:,1],projected[:,2],"k,")
for (n,name) in enumerate(header):
(strain,infection,rep,time) = name.split("_")
marker = markers[time]
color = colors[strain][infection]
ax.plot([projected[n,0]],[projected[n,1]],[projected[n,2]],
marker = marker, color = color, linestyle = "none")
ax.set_xlabel("PC1")
ax.set_ylabel("PC2")
ax.set_zlabel("PC3")