Goal: Fit contrasts from tables S1 and S2 of the paper. Choose abundance and fold-change filters liberal enough for direct comparison with these supplements.
%cd ~/pdf/papers/SciRep_7_42225/
from CdtFile import CdtFile, CdtRow
from MsvUtil import Table, revdict, multirevdict
from SafeMath import safelog, safesub, safeadd, safesum
import gzip
from math import log
import os
import numpy as np
# Uncomment this line for newer Jupyter stacks (e.g., Debian 9 (stretch), current Enthought Canopy)
#%load_ext rpy2.ipython
# Uncomment this line for older IPython stacks (e.g., Debian 8 (jessie), Ubuntu 16.04 (xenial))
%load_ext rmagic
%%R
library(limma)
library(edgeR)
Hand compiled spreadsheet based on paper supplement, mapped to GEO IDs via read counts
samples = Table.fromCsv("sample_table_v2.csv")
print len(samples)
print samples[0]
ival = {"Live":0,"Dead":1,"uninfected":2}
samples = Table(header = samples.header[:],
rows = [list(j)
for j in sorted(samples, key = lambda i: (i["strain"],ival[i["infection"]],int(i["replicate"]),int(i["time"])))])
name2sample = dict((i["name"],i) for i in samples)
import re
gtf_re = re.compile(r'(?P<key>[\S]+)[\s]+"(?P<val>[^"]+)";')
transcript2gene = {}
gene2name = {}
for line in gzip.open("/home/bms270/BMS270_2017/Mus_musculus.GRCm38.79.gtf.gz"):
if(line.startswith("#")):
continue
fields = line.split("\t")
if(fields[2] == "transcript"):
transcript = None
gene = None
for anno in gtf_re.finditer(fields[8]):
if(anno.group("key") == "gene_id"):
gene = anno.group("val")
elif(anno.group("key") == "transcript_id"):
transcript = anno.group("val")
assert(None not in (gene,transcript))
transcript2gene[transcript] = gene
elif(fields[2] == "gene"):
gene = None
name = None
for anno in gtf_re.finditer(fields[8]):
if(anno.group("key") == "gene_id"):
gene = anno.group("val")
elif(anno.group("key") == "gene_name"):
name = anno.group("val")
assert(gene is not None)
if(name is None):
name = gene
gene2name[gene] = name
len(transcript2gene),len(gene2name)
gene2transcript = multirevdict(transcript2gene)
len(gene2transcript)
genes = None
cols = []
for i in samples["name"]:
table = Table.fromTdt(open(os.path.join(i,"abundance.tsv")))
if(genes is None):
genes = table["target_id"]
else:
assert(genes == table["target_id"])
# N.B.: Not log transforming yet so that we can sum over transcript TPMs for each gene
cols.append([float(i) for i in table["tpm"]])
trans_tpm = CdtFile(probes = [CdtRow(gid = i[0], uniqid = i[0], name = transcript2gene[i[0]],
ratios = i[1:])
for i in zip(*([genes]+cols))],
fieldnames = samples["name"],
eweights = [1]*len(samples["name"]))
len(trans_tpm)
genes = set(i.Name() for i in trans_tpm)
len(genes)
def gene_row(gene, cdt, gene2transcript):
rows = [cdt.GetUid(i) for i in gene2transcript[gene]]
return CdtRow(gid = gene, uniqid = gene, name = gene,
ratios = [safelog(sum(j)) for j in zip(*rows)])
# N.B.: At this point, we switch to log space
genes_tpm = CdtFile.fromPrototype(trans_tpm, probes = [gene_row(i, trans_tpm, gene2transcript) for i in genes])
len(genes_tpm)
Following the same method as for TPMs
genes2 = None
cols = []
for i in samples["name"]:
table = Table.fromTdt(open(os.path.join(i,"abundance.tsv")))
if(genes2 is None):
genes2 = table["target_id"]
else:
assert(genes2 == table["target_id"])
# N.B.: Not log transforming yet so that we can sum over transcript TPMs for each gene
cols.append([float(i) for i in table["est_counts"]])
trans_count = CdtFile(probes = [CdtRow(gid = i[0], uniqid = i[0], name = transcript2gene[i[0]],
ratios = i[1:])
for i in zip(*([genes2]+cols))],
fieldnames = samples["name"],
eweights = [1]*len(samples["name"]))
len(trans_count)
genes2 = set(i.Name() for i in trans_count)
genes == genes2
def gene_row(gene, cdt, gene2transcript):
rows = [cdt.GetUid(i) for i in gene2transcript[gene]]
return CdtRow(gid = gene, uniqid = gene, name = gene,
ratios = [sum(j) for j in zip(*rows)])
# Note that we *do not* log transform counts.
genes_count = CdtFile.fromPrototype(trans_tpm, probes = [gene_row(i, trans_count, gene2transcript) for i in genes])
len(genes_count)
def clip(x):
if(x <= 0.):
return 0.
return x
def clip_cdt(cdt):
return CdtFile.fromPrototype(
cdt,
probes = [CdtRow.fromPrototype(i, ratios = [clip(j) for j in i])
for i in cdt])
def threshold_cdt(cdt, thresh):
return CdtFile.fromPrototype(
cdt,
probes = [i for i in cdt if(max(i) >= thresh)])
Filter for genes with $TPM \geq 1$ in at least one sample (Normally, I would be more conservative here, but the paper reports a lot of low abundance differential genes, and I'd like to see how limma handles them. If we were being really careful, we would filter based on count rather than on TPM)
ct_genes = threshold_cdt(clip_cdt(genes_tpm), 0)
len(ct_genes), len(genes_tpm)
Note that this filter does not remove any genes
ct_count = CdtFile.fromPrototype(genes_count, probes = [genes_count.GetUid(i.Uniqid()) for i in ct_genes])
len(ct_count)
C = np.array([[clip(j) for j in i] for i in ct_count])
C.shape
name2row = dict((i.Name(),n+1) for (n,i) in enumerate(ct_count))
row2name = revdict(name2row)
map(len,(name2row,row2name))
state = []
cols = []
infection = []
strain = []
time = []
replicate = []
for i in samples:
cols.append(genes_count.fieldnames.index(i["name"]))
time.append(int(i["time"]))
infection.append(i["infection"])
strain.append(i["strain"])
replicate.append(i["replicate"])
state.append("%s.%s.%d" % (strain[-1],infection[-1], time[-1]))
%%R -i C
dge <- DGEList(counts=C)
dge <- calcNormFactors(dge)
%%R -i state,strain,infection,time -o d,cpm,fc,cn
state <- as.factor(state)
strain <- as.factor(strain)
infection <- as.factor(infection)
time <- as.factor(time)
d <- model.matrix(~0+state)
colnames(d) <- gsub("state","",colnames(d))
print(colnames(d))
v <- voom(dge, d, plot = TRUE)
fit <- lmFit(v, d)
contrast.matrix <- makeContrasts(# S1 contrast
BMDM.uninfected.24-BMDM.uninfected.4,
# S2 contrasts
BMDM.Live.4-BMDM.uninfected.4,BMDM.Dead.4-BMDM.uninfected.4,
BMDM.Live.24-BMDM.uninfected.24,BMDM.Dead.24-BMDM.uninfected.24,
J774.Live.4-J774.uninfected.4,J774.Dead.4-J774.uninfected.4,
J774.Live.24-J774.uninfected.24,J774.Dead.24-J774.uninfected.24,
levels=d)
fit2 <- contrasts.fit(fit, contrast.matrix)
fit2 <- eBayes(fit2)
print(summary(decideTests(fit2)))
fc <- fit$coefficients
cn <- colnames(fit$coefficients)
cpm <- v$E
Fold-change filter chosen to match minimum log2 fold change in the supplements. Normally, I would use something higher.
%%R
print(summary(decideTests(fit2, lfc=.27)))
%%R -o cnames
cnames <- colnames(fit2$contrasts)
%%R -o t0,t1,t2,t3,t4,t5,t6,t7,t8,t9
t0 <- topTable(fit2, n = 40000, p.value = 1.)
t0 <- cbind(Row.Names = rownames(t0), t0)
t1 <- topTable(fit2, coef="BMDM.uninfected.24 - BMDM.uninfected.4", n = 20000, lfc=.27, p.value=.05)
t1 <- cbind(Row.Names = rownames(t1), t1)
t2 <- topTable(fit2, coef="BMDM.Live.4 - BMDM.uninfected.4", n = 20000, lfc=.27, p.value=.05)
t2 <- cbind(Row.Names = rownames(t2), t2)
t3 <- topTable(fit2, coef="BMDM.Dead.4 - BMDM.uninfected.4", n = 20000, lfc=.27, p.value=.05)
t3 <- cbind(Row.Names = rownames(t3), t3)
t4 <- topTable(fit2, coef="BMDM.Live.24 - BMDM.uninfected.24", n = 20000, lfc=.27, p.value=.05)
t4 <- cbind(Row.Names = rownames(t4), t4)
t5 <- topTable(fit2, coef="BMDM.Dead.24 - BMDM.uninfected.24", n = 20000, lfc=.27, p.value=.05)
t5 <- cbind(Row.Names = rownames(t5), t5)
t6 <- topTable(fit2, coef="J774.Live.4 - J774.uninfected.4", n = 20000, lfc=.27, p.value=.05)
t6 <- cbind(Row.Names = rownames(t6), t6)
t7 <- topTable(fit2, coef="J774.Dead.4 - J774.uninfected.4", n = 20000, lfc=.27, p.value=.05)
t7 <- cbind(Row.Names = rownames(t7), t7)
t8 <- topTable(fit2, coef="J774.Live.24 - J774.uninfected.24", n = 20000, lfc=.27, p.value=.05)
t8 <- cbind(Row.Names = rownames(t8), t8)
t9 <- topTable(fit2, coef="J774.Dead.24 - J774.uninfected.24", n = 20000, lfc=.27, p.value=.05)
t9 <- cbind(Row.Names = rownames(t9), t9)
uid2t0 = dict((row2name[int(i[0])], list(i)) for i in t0)
t = [t1,t2,t3,t4,t5,t6,t7,t8,t9]
def contrast_name(s):
"""Return simplified contrast name, factoring common elements."""
terms = s.split(" - ")
a = terms[0].split(".")
b = terms[1].split(".")
A = []
B = []
C = []
for (i,j) in zip(a,b):
if(i == j):
C.append(i)
else:
A.append(i)
B.append(j)
return "%s/%s|%s" % (",".join(A),",".join(B),",".join(C))
dnames = map(contrast_name, cnames)
dnames
differential = {}
for (name, table) in zip(dnames,t):
differential[name] = {"up": set(row2name[int(i[0])] for i in table if(i[1] > 0)),
"down": set(row2name[int(i[0])] for i in table if(i[1] < 0))}
for (key,val) in sorted(differential.items(), key = lambda x: len(x[1]["up"])+len(x[1]["down"])):
print key, len(val["up"]), len(val["down"])
all_diff = set()
for group in differential.values():
for uids in group.values():
all_diff.update(uids)
len(all_diff)
def fit_ratios(uid, order = None):
a = uid2t0[uid][1:len(cnames)+1]
if(order is None):
return a
return [a[i] for i in order]
def diffsets_to_ratios(uid, diffs):
ratios = []
for contrast in diffs:
if(uid in contrast["up"]):
ratios.append(4.)
elif(uid in contrast["down"]):
ratios.append(-4.)
else:
ratios.append(0.)
return ratios
common = sorted(set(i.Uniqid() for i in ct_genes))
set(common) == set(uid2t0)
diffs = [differential[i] for i in dnames]
fieldnames = dnames+[i+" sig" for i in dnames]
probes = []
for i in common:
probes.append(CdtRow(gid = i, uniqid = i, name = gene2name[i],
ratios = (fit_ratios(i)+
diffsets_to_ratios(i,diffs))))
cdt_contrast = CdtFile(probes = probes,
fieldnames = fieldnames,
eweights = [1.]*len(fieldnames))
cdt_contrast.write(open("cdt_contrast.cdt","w"))
len(cdt_contrast)
cdt_contrast_sig = CdtFile.fromPrototype(
cdt_contrast,
probes = [i for i in cdt_contrast if(i.Uniqid() in all_diff)])
len(cdt_contrast_sig)
%%time
tree = cdt_contrast_sig.cluster(dist = "u", method = "m", cols=range(len(cdt_contrast_sig.fieldnames)//2))
cdt_contrast_sig.writeCdtGtr("cdt_contrast_sig.um",tree)