Goal: use matplotlib to plot heatmaps for comparing BLOSUM matrices
What do they look like?
!cat blosum45.txt
def load_blastmatrix(fname):
"""Parse a scoring matrix in NCBI format and return
it as a two-dimensional dictionary"""
fp = open(fname)
# kill file header
for line in fp:
if(not line.startswith("#")):
break
# index column headers
i2a = dict((i,a.strip()) for (i,a) in enumerate(line.split()))
# parse scores
b = {}
for line in fp:
w = line.rstrip("\r\n").split()
# row header
a = w[0]
# row values
d = {}
for (i, j) in enumerate(w):
if(i == 0):
continue
d[i2a[i]] = int(j)
b[a] = d
return b
blosum = [load_blastmatrix("blosum%s.txt" % i) for i in (45,62,80)]
alpha_aa = "ACDEFGHIKLMNPQRSTVWY"
array??
array??
array??
array??
array??
array??
def load_blastmatrix(fname):
"""Parse a scoring matrix in NCBI format and return
it as a two-dimensional dictionary"""
fp = open(fname)
# kill file header
for line in fp:
if(not line.startswith("#")):
break
# index column headers
i2a = dict((i,a.strip()) for (i,a) in enumerate(line.split()))
# parse scores
b = {}
for line in fp:
w = line.rstrip("\r\n").split()
# row header
a = w[0]
# row values
d = {}
for (i, j) in enumerate(w):
if(i == 0):
continue
d[i2a[i]] = int(j)
b[a] = d
return b
scores = [array([[b[i][j] for j in alpha_aa] for i in alpha_aa]) for b in blosum]
scores[0]
a = scores[0].reshape((400,))
fig = figure()
h = hist(a)
display(fig)
fig = figure()
plot(sorted(a),"bo")
display(fig)
# Simple green->red non-overlapping gradient
cdict = {"red":((0.,0.,0.),(.5,0.,0.),(1.,1.,1.)),
"green":((0.,1.,1.),(.5,0.,0.),(1.,0.,0.)),
"blue":((0.,0.,0.),(.5,0.,0.),(1.,0.,0.))}
# map gradient to 256 actual RGBA values
myc = matplotlib.colors.LinearSegmentedColormap("myc",cdict,256)
for i in scores:
fig = figure()
# expand color gradient from (0,1) to (-4,4)
imshow(i, cmap = myc, clim = (-4,4), interpolation = "nearest")
colorbar()
display(*(getfigs()[-3:]))
We'll use Pycluster to pass a distance matrix directly to Cluster3 and NetworkX to extract the leaves of the clustered tree in depth-first-search order.
(NetworkX and Pycluster can both be installed via Canopy's package manager. Pycluster can also be installed as part of the larger BioPython package).
import networkx as nx
try:
import Pycluster
except ImportError:
import Bio.Cluster as Pycluster
class ScoreCluster:
def __init__(self, S, alpha_aa = "ACDEFGHIKLMNPQRSTVWY"):
"""Initialize from numpy array of scaled log odds scores."""
(x,y) = S.shape
assert(x == y == len(alpha_aa))
# Interpret the largest score as a distance of zero
D = max(S.reshape(x**2))-S
# Maximum-linkage clustering, with a user-supplied distance matrix
tree = Pycluster.treecluster(distancematrix = D, method = "m")
# Use NetworkX to read out the amino-acids in clustered order
G = nx.DiGraph()
for (n,i) in enumerate(tree):
for j in (i.left, i.right):
G.add_edge(-(n+1),j)
self.ordering = [i for i in nx.dfs_preorder(G, -len(tree)) if(i >= 0)]
self.names = "".join(alpha_aa[i] for i in self.ordering)
self.C = self.permute(S)
def permute(self, S):
"""Given square matrix S in alphabetical order, return rows and columns
of S permuted to match the clustered order."""
return array([[S[i][j] for j in self.ordering] for i in self.ordering])
clustered = [ScoreCluster(i) for i in scores]
For comparison, choose a single clustering to apply to all three scoring matrices
clust45 = [clustered[0].permute(i) for i in scores]
# Simple green->red non-overlapping gradient
cdict = {"red":((0.,0.,0.),(.5,0.,0.),(1.,1.,1.)),
"green":((0.,1.,1.),(.5,0.,0.),(1.,0.,0.)),
"blue":((0.,0.,0.),(.5,0.,0.),(1.,0.,0.))}
# map gradient to 256 actual RGBA values
myc = matplotlib.colors.LinearSegmentedColormap("myc",cdict,256)
for i in clust45:
fig = figure()
# expand color gradient from (0,1) to (-4,4)
imshow(i, cmap = myc, clim = (-4,4), interpolation = "nearest")
xticks(range(len(clustered[0].names)),clustered[0].names)
yticks(range(len(clustered[0].names)),clustered[0].names)
colorbar()
display(*(getfigs()[-3:]))
clust62 = [clustered[1].permute(i) for i in scores]
# Simple green->red non-overlapping gradient
cdict = {"red":((0.,0.,0.),(.5,0.,0.),(1.,1.,1.)),
"green":((0.,1.,1.),(.5,0.,0.),(1.,0.,0.)),
"blue":((0.,0.,0.),(.5,0.,0.),(1.,0.,0.))}
# map gradient to 256 actual RGBA values
myc = matplotlib.colors.LinearSegmentedColormap("myc",cdict,256)
for i in clust62:
fig = figure()
# expand color gradient from (0,1) to (-4,4)
imshow(i, cmap = myc, clim = (-4,4), interpolation = "nearest")
xticks(range(len(clustered[1].names)),clustered[1].names)
yticks(range(len(clustered[1].names)),clustered[1].names)
colorbar()
display(*(getfigs()[-3:]))