%matplotlib notebook
import matplotlib.pyplot as plt
from mpl_toolkits.mplot3d import Axes3D
import numpy as np
Let's factor our previous correlation matrix code into convenient functions, adding code for the covariance (unscaled) matrix:
def center(d):
mean = np.mean(d, axis = 1)
mean = mean.reshape(mean.shape[0], 1)
return d - mean
def scale(d):
norm = np.linalg.norm(d, axis = 1)
norm = norm.reshape(norm.shape[0], 1)
return d/norm
def correlation_matrix(d):
scaled = scale(center(d))
return np.dot(scaled,scaled.T)
def covariance_matrix(d):
centered = center(d)
return np.dot(centered,centered.T)
Same thing, but cutting the other way
def center2(d):
mean = np.mean(d)
return d - mean
def scale2(d):
norm = np.linalg.norm(d)
return d/norm
def correlation_matrix2(d):
scaled = scale2(center2(d))
return np.dot(scaled,scaled.T)
def covariance_matrix2(d):
centered = center2(d)
return np.dot(centered,centered.T)
We'll start with our previous cut for genes observed in at least two samples.
import gzip
from csv import reader, excel_tab
fp = reader(gzip.open("GSE86922_Brodsky_GEO_processed.txt.gz"), dialect = excel_tab)
header = fp.next()
data = []
annotations = []
for row in fp:
annotations.append(row[:4])
data.append([float(i) for i in row[4:]])
# This is new -- we deallocate the reader object to close the file when we're done reading it
del fp
d = np.array(data)
anno = np.array(annotations)
thresh = np.log(10)/np.log(2)
x = (np.sum(d >= thresh, axis = 1) >= 2)
D = d[x,:]
Da = anno[x,:]
D.shape, Da.shape
fig = plt.figure()
plt.imshow(D,interpolation = "none", aspect = "auto")
plt.colorbar()
print list(enumerate(header[4:]))
P = D[:,(6,7)]
P.shape
fig = plt.figure()
plt.plot(P[:,0],P[:,1],"bo")
C = correlation_matrix(P.T)
fig = plt.figure()
plt.imshow(C, interpolation = "none", aspect = "auto", vmin=-1, vmax = 1)
plt.colorbar()
a = (P[:,0]+P[:,1])/2. # average
m = (P[:,1]-P[:,0]) # fold change
fig = plt.figure()
plt.plot(a,m,"bo")
What happens to the correlation matrix for the new coordinates?
AM = np.vstack((a,m)).T
AM.shape
C = correlation_matrix(AM.T)
fig = plt.figure()
plt.imshow(C, interpolation = "none", aspect = "auto",vmin=-1, vmax = 1)
plt.colorbar()
So, we've factored the data into a correlated component, $a$, and an uncorrelated component, $m$.
Let's think about the transform we just applied:
$P \left( \begin{array}{cc} \frac{1}{2} & -1 \\ \frac{1}{2} & 1 \end{array} \right) = \left( \begin{array}{cc} a & m \end{array} \right)$
If we think of the columns as vectors, we can normalize them to give unit vectors (giving a scaled result):
$P \left( \begin{array}{cc} \frac{1}{\sqrt{2}} & -\frac{1}{\sqrt{2}} \\ \frac{1}{\sqrt{2}} & \frac{1}{\sqrt{2}} \end{array} \right) = \left( \begin{array}{cc} a \sqrt{2} & \frac{m}{\sqrt{2}} \end{array} \right) = \left( \begin{array}{cc} a' & m' \end{array} \right)$
This has the form of a clockwise rotation about the origin by $\theta = \frac{\pi}{4}$:
$P \left( \begin{array}{cc} \cos(\theta) & -\sin(\theta) \\ \sin(\theta) & \cos(\theta) \end{array} \right) = \left( \begin{array}{cc} a' & m' \end{array} \right)$
Note that clockwise rotation of our data is equivalent to counterclockwise rotation of the axes. In general, matrices giving such rigid body transforms are orthogonal matrices.
In fact, we can factor out the correlations of any square symetric matrix (e.g., a correlation or covariance matrix) by a rotation about the origin, given by an orthogonal matrix. The columns of the orthogonal matrix give the axes of the new coordinate system and are called eigenvectors.
Here's one way to calculate them using numpy:
(eigenvalues, eigenvectors) = np.linalg.eigh(correlation_matrix(P.T))
print eigenvectors
print eigenvalues
np.sqrt(1./2)
A cheaper method, for the special case of matrices formed by the product of a rectangular matrix with its transpose, is singular value decomposition:
u,s,v = np.linalg.svd(scale(P).T, full_matrices=False)
print u
print s
Here is the relationship between the singular values and the eigenvalues
v = s**2
v/sum(v)
eigenvalues/sum(eigenvalues)
Note that, in this case, we get the same eigenvectors for diagonalization of the covariance matrix:
u,s,v = np.linalg.svd(center(P).T, full_matrices=False)
print u
print s
This is convenient, because it conserves euclidean distances among our data
T = D[:,(6,8,10)]
T.shape
fig = plt.figure()
ax = fig.add_subplot(111, projection="3d")
ax.plot(T[:,0],T[:,1],T[:,2],"bo")
ax.set_xlabel("WT")
ax.set_ylabel("ripk1")
ax.set_zlabel("ripk1/casp8")
C = center2(T)
fig = plt.figure()
ax = fig.add_subplot(111, projection="3d")
ax.plot(C[:,0],C[:,1],C[:,2],"bo")
ax.set_xlabel("WT")
ax.set_ylabel("ripk1")
ax.set_zlabel("ripk1/casp8")
u,s,v = np.linalg.svd(center2(T).T, full_matrices=False)
print u
print s
v = s**2
v/sum(v)
projected = np.dot(center2(T),u)
projected.shape
fig = plt.figure()
ax = fig.add_subplot(111, projection="3d")
ax.plot(projected[:,0],projected[:,1],projected[:,2],"bo")
ax.set_xlabel("PC1")
ax.set_ylabel("PC2")
ax.set_zlabel("PC3")
L = 0
for i in (ax.get_xlim3d, ax.get_ylim3d, ax.get_zlim3d):
(a,b) = i()
L = max(L, -a, b)
ax.set_xlim3d(-L,L)
ax.set_ylim3d(-L,L)
ax.set_zlim3d(-L,L)
fig = plt.figure()
plt.plot(projected[:,0],projected[:,2],"bo")
u,s,v = np.linalg.svd(center2(D).T, full_matrices=False)
print u
print s
v = s**2
v/sum(v)
fig = plt.figure()
plt.plot(v/sum(v))
fig = plt.figure()
plt.imshow(u, interpolation = "none", aspect = "auto")
plt.colorbar()
projected = np.dot(center2(D),u)
projected.shape
fig = plt.figure()
ax = fig.add_subplot(111, projection="3d")
ax.plot(projected[:,0],projected[:,1],projected[:,2],"bo")
ax.set_xlabel("PC1")
ax.set_ylabel("PC2")
ax.set_zlabel("PC3")
L = 0
for i in (ax.get_xlim3d, ax.get_ylim3d, ax.get_zlim3d):
(a,b) = i()
L = max(L, -a, b)
ax.set_xlim3d(-L,L)
ax.set_ylim3d(-L,L)
ax.set_zlim3d(-L,L)
fig = plt.figure()
plt.plot(projected[:,0],projected[:,1],"bo")
u,s,v = np.linalg.svd(center2(center(D)).T, full_matrices=False)
print u
print s
v = s**2
v/sum(v)
fig = plt.figure()
plt.plot(v/sum(v))
fig = plt.figure()
plt.imshow(u, interpolation = "none", aspect = "auto")
plt.colorbar()
projected = np.dot(center2(center(D)),u)
projected.shape
fig = plt.figure()
ax = fig.add_subplot(111, projection="3d")
ax.plot(projected[:,0],projected[:,1],projected[:,2],"bo")
ax.set_xlabel("PC1")
ax.set_ylabel("PC2")
ax.set_zlabel("PC3")
L = 0
for i in (ax.get_xlim3d, ax.get_ylim3d, ax.get_zlim3d):
(a,b) = i()
L = max(L, -a, b)
ax.set_xlim3d(-L,L)
ax.set_ylim3d(-L,L)
ax.set_zlim3d(-L,L)
fig = plt.figure()
plt.plot(projected[:,0],projected[:,1],"bo")
IL6 = projected[Da[:,0] == "ENSMUSG00000025746"]
IL6
fig = plt.figure()
ax = fig.add_subplot(111, projection="3d")
ax.plot(projected[:,0],projected[:,1],projected[:,2],"bo")
ax.set_xlabel("PC1")
ax.set_ylabel("PC2")
ax.set_zlabel("PC3")
L = 0
for i in (ax.get_xlim3d, ax.get_ylim3d, ax.get_zlim3d):
(a,b) = i()
L = max(L, -a, b)
ax.set_xlim3d(-L,L)
ax.set_ylim3d(-L,L)
ax.set_zlim3d(-L,L)
ax.plot(IL6[:,0],IL6[:,1],IL6[:,2],"ro")
print Da[np.logical_and(projected[:,0] > 8,projected[:,1] > .5),:]
u,s,v = np.linalg.svd(center(D), full_matrices=False)
print u
print s
v = s**2
v/sum(v)
fig = plt.figure()
plt.plot(v/sum(v))
projected = np.dot(center(D).T,u)
projected.shape
fig = plt.figure()
ax = fig.add_subplot(111, projection="3d")
ax.plot(projected[:,0],projected[:,1],projected[:,2],"bo")
ax.set_xlabel("PC1")
ax.set_ylabel("PC2")
ax.set_zlabel("PC3")
L = 0
for i in (ax.get_xlim3d, ax.get_ylim3d, ax.get_zlim3d):
(a,b) = i()
L = max(L, -a, b)
ax.set_xlim3d(-L,L)
ax.set_ylim3d(-L,L)
ax.set_zlim3d(-L,L)
highlights = {}
for i in header[4:]:
if(i.startswith("WT")):
color = "black"
elif(i.startswith("Ripk3Casp8")):
color = "orange"
else:
color = "green"
if(i.find("LPS") > -1):
marker = "o"
else:
marker = "^"
highlights[i] = (color,marker)
highlights
fig = plt.figure()
ax = fig.add_subplot(111, projection="3d")
ax.plot(projected[:,0],projected[:,1],projected[:,2],marker = ".", color = "gray", linestyle="none")
ax.set_xlabel("PC1")
ax.set_ylabel("PC2")
ax.set_zlabel("PC3")
L = 0
for i in (ax.get_xlim3d, ax.get_ylim3d, ax.get_zlim3d):
(a,b) = i()
L = max(L, -a, b)
ax.set_xlim3d(-L,L)
ax.set_ylim3d(-L,L)
ax.set_zlim3d(-L,L)
for (n, name) in enumerate(header[4:]):
(color, marker) = highlights[name]
ax.plot([projected[n,0]],[projected[n,1]],[projected[n,2]],marker = marker, color = color, linestyle = "none")