Goal: Fit contrasts from tables S1 and S2 of the paper. Choose abundance and fold-change filters liberal enough for direct comparison with these supplements.

In [1]:
%cd ~/pdf/papers/SciRep_7_42225/
/home/mvoorhie/pdf/papers/SciRep_7_42225

In [2]:
from CdtFile import CdtFile, CdtRow
from MsvUtil import Table, revdict, multirevdict
from SafeMath import safelog, safesub, safeadd, safesum
In [3]:
import gzip
from math import log
import os
import numpy as np
In [4]:
# Uncomment this line for newer Jupyter stacks (e.g., Debian 9 (stretch), current Enthought Canopy)
#%load_ext rpy2.ipython 
# Uncomment this line for older IPython stacks (e.g., Debian 8 (jessie), Ubuntu 16.04 (xenial))
%load_ext rmagic 
In [5]:
%%R
library(limma)
library(edgeR)

Load Annotations

Sample annotations

Hand compiled spreadsheet based on paper supplement, mapped to GEO IDs via read counts

In [6]:
samples = Table.fromCsv("sample_table_v2.csv")
print len(samples)
print samples[0]
36
name:		GSM2348248
infection:		uninfected
strain:		J774
time:		4
replicate:		1

In [7]:
ival = {"Live":0,"Dead":1,"uninfected":2}
samples = Table(header = samples.header[:],
                rows = [list(j) 
                        for j in sorted(samples, key = lambda i: (i["strain"],ival[i["infection"]],int(i["replicate"]),int(i["time"])))])
In [8]:
name2sample = dict((i["name"],i) for i in samples)

ENSEMBL Mouse Transcript ID -> Gene ID mapping

In [9]:
import re
gtf_re = re.compile(r'(?P<key>[\S]+)[\s]+"(?P<val>[^"]+)";')
In [10]:
transcript2gene = {}
gene2name = {}
for line in gzip.open("/home/bms270/BMS270_2017/Mus_musculus.GRCm38.79.gtf.gz"):
    if(line.startswith("#")):
        continue
    fields = line.split("\t")
    if(fields[2] == "transcript"):
        transcript = None
        gene = None
        for anno in gtf_re.finditer(fields[8]):
            if(anno.group("key") == "gene_id"):
                gene = anno.group("val")
            elif(anno.group("key") == "transcript_id"):
                transcript = anno.group("val")
        assert(None not in (gene,transcript))
        transcript2gene[transcript] = gene
    elif(fields[2] == "gene"):
        gene = None
        name = None
        for anno in gtf_re.finditer(fields[8]):
            if(anno.group("key") == "gene_id"):
                gene = anno.group("val")
            elif(anno.group("key") == "gene_name"):
                name = anno.group("val")
        assert(gene is not None)
        if(name is None):
            name = gene
        gene2name[gene] = name
len(transcript2gene),len(gene2name)
Out[10]:
(104129, 43629)
In [11]:
gene2transcript = multirevdict(transcript2gene)
len(gene2transcript)
Out[11]:
43629

Merge count and TPM values from kallisto

Step 1: TPMs

In [12]:
genes = None
cols = []
for i in samples["name"]:
    table = Table.fromTdt(open(os.path.join(i,"abundance.tsv")))
    if(genes is None):
        genes = table["target_id"]
    else:
        assert(genes == table["target_id"])
    # N.B.: Not log transforming yet so that we can sum over transcript TPMs for each gene
    cols.append([float(i) for i in table["tpm"]])

trans_tpm = CdtFile(probes = [CdtRow(gid = i[0], uniqid = i[0], name = transcript2gene[i[0]],
                                        ratios = i[1:])
                                 for i in zip(*([genes]+cols))],
                       fieldnames = samples["name"],
                       eweights = [1]*len(samples["name"]))
len(trans_tpm)
Out[12]:
88198
In [13]:
genes = set(i.Name() for i in trans_tpm)
len(genes)
Out[13]:
30735
In [14]:
def gene_row(gene, cdt, gene2transcript):
    rows = [cdt.GetUid(i) for i in gene2transcript[gene]]
    return CdtRow(gid = gene, uniqid = gene, name = gene,
                  ratios = [safelog(sum(j)) for j in zip(*rows)])

# N.B.: At this point, we switch to log space
genes_tpm = CdtFile.fromPrototype(trans_tpm, probes = [gene_row(i, trans_tpm, gene2transcript) for i in genes])
len(genes_tpm)
Out[14]:
30735

Step 2: estimated counts

Following the same method as for TPMs

In [15]:
genes2 = None
cols = []
for i in samples["name"]:
    table = Table.fromTdt(open(os.path.join(i,"abundance.tsv")))
    if(genes2 is None):
        genes2 = table["target_id"]
    else:
        assert(genes2 == table["target_id"])
    # N.B.: Not log transforming yet so that we can sum over transcript TPMs for each gene
    cols.append([float(i) for i in table["est_counts"]])

trans_count = CdtFile(probes = [CdtRow(gid = i[0], uniqid = i[0], name = transcript2gene[i[0]],
                                        ratios = i[1:])
                                 for i in zip(*([genes2]+cols))],
                       fieldnames = samples["name"],
                       eweights = [1]*len(samples["name"]))
len(trans_count)
Out[15]:
88198
In [16]:
genes2 = set(i.Name() for i in trans_count)
genes == genes2
Out[16]:
True
In [17]:
def gene_row(gene, cdt, gene2transcript):
    rows = [cdt.GetUid(i) for i in gene2transcript[gene]]
    return CdtRow(gid = gene, uniqid = gene, name = gene,
                  ratios = [sum(j) for j in zip(*rows)])

# Note that we *do not* log transform counts.
genes_count = CdtFile.fromPrototype(trans_tpm, probes = [gene_row(i, trans_count, gene2transcript) for i in genes])
len(genes_count)
Out[17]:
30735

TPM Filter

In [18]:
def clip(x):
    if(x <= 0.):
        return 0.
    return x

def clip_cdt(cdt):
    return CdtFile.fromPrototype(
           cdt,
           probes = [CdtRow.fromPrototype(i, ratios = [clip(j) for j in i])
                     for i in cdt])

def threshold_cdt(cdt, thresh):
    return CdtFile.fromPrototype(
           cdt,
           probes = [i for i in cdt if(max(i) >= thresh)])

Filter for genes with $TPM \geq 1$ in at least one sample (Normally, I would be more conservative here, but the paper reports a lot of low abundance differential genes, and I'd like to see how limma handles them. If we were being really careful, we would filter based on count rather than on TPM)

In [19]:
ct_genes = threshold_cdt(clip_cdt(genes_tpm), 0)
len(ct_genes), len(genes_tpm)
Out[19]:
(30735, 30735)

Note that this filter does not remove any genes

In [20]:
ct_count = CdtFile.fromPrototype(genes_count, probes = [genes_count.GetUid(i.Uniqid()) for i in ct_genes])
len(ct_count)
Out[20]:
30735

Stage for import into R

In [21]:
C = np.array([[clip(j) for j in i] for i in ct_count])
C.shape
Out[21]:
(30735, 36)
In [22]:
name2row = dict((i.Name(),n+1) for (n,i) in enumerate(ct_count))
row2name = revdict(name2row)
map(len,(name2row,row2name))
Out[22]:
[30735, 30735]
In [23]:
state = []
cols = []
infection = []
strain = []
time = []
replicate = []
for i in samples:
    cols.append(genes_count.fieldnames.index(i["name"]))
    time.append(int(i["time"]))
    infection.append(i["infection"])
    strain.append(i["strain"])
    replicate.append(i["replicate"])
    state.append("%s.%s.%d" % (strain[-1],infection[-1], time[-1]))

Limma fit

In [24]:
%%R -i C
dge <- DGEList(counts=C)
dge <- calcNormFactors(dge)
In [25]:
%%R -i state,strain,infection,time -o d,cpm,fc,cn
state <- as.factor(state)
strain <- as.factor(strain)
infection <- as.factor(infection)
time <- as.factor(time)
d <- model.matrix(~0+state)
colnames(d) <- gsub("state","",colnames(d))
print(colnames(d))

v <- voom(dge, d, plot = TRUE)
fit <- lmFit(v, d)

contrast.matrix <- makeContrasts(# S1 contrast
                                 BMDM.uninfected.24-BMDM.uninfected.4,
                                 # S2 contrasts
                                 BMDM.Live.4-BMDM.uninfected.4,BMDM.Dead.4-BMDM.uninfected.4,
                                 BMDM.Live.24-BMDM.uninfected.24,BMDM.Dead.24-BMDM.uninfected.24,
                                 J774.Live.4-J774.uninfected.4,J774.Dead.4-J774.uninfected.4,
                                 J774.Live.24-J774.uninfected.24,J774.Dead.24-J774.uninfected.24,
                                 levels=d)

fit2 <- contrasts.fit(fit, contrast.matrix)
fit2 <- eBayes(fit2)
print(summary(decideTests(fit2)))

fc <- fit$coefficients
cn <- colnames(fit$coefficients)
cpm <- v$E
 [1] "BMDM.Dead.24"       "BMDM.Dead.4"        "BMDM.Live.24"      
 [4] "BMDM.Live.4"        "BMDM.uninfected.24" "BMDM.uninfected.4" 
 [7] "J774.Dead.24"       "J774.Dead.4"        "J774.Live.24"      
[10] "J774.Live.4"        "J774.uninfected.24" "J774.uninfected.4" 
   BMDM.uninfected.24 - BMDM.uninfected.4 BMDM.Live.4 - BMDM.uninfected.4
-1                                   1862                            2036
0                                   26343                           26742
1                                    2530                            1957
   BMDM.Dead.4 - BMDM.uninfected.4 BMDM.Live.24 - BMDM.uninfected.24
-1                            1385                              2446
0                            27758                             26099
1                             1592                              2190
   BMDM.Dead.24 - BMDM.uninfected.24 J774.Live.4 - J774.uninfected.4
-1                              1281                             153
0                              28293                           30289
1                               1161                             293
   J774.Dead.4 - J774.uninfected.4 J774.Live.24 - J774.uninfected.24
-1                              91                              1794
0                            30369                             26974
1                              275                              1967
   J774.Dead.24 - J774.uninfected.24
-1                               379
0                              29969
1                                387

Fold-change filter chosen to match minimum log2 fold change in the supplements. Normally, I would use something higher.

In [26]:
%%R
print(summary(decideTests(fit2, lfc=.27)))
   BMDM.uninfected.24 - BMDM.uninfected.4 BMDM.Live.4 - BMDM.uninfected.4
-1                                   1862                            2036
0                                   26343                           26742
1                                    2530                            1957
   BMDM.Dead.4 - BMDM.uninfected.4 BMDM.Live.24 - BMDM.uninfected.24
-1                            1385                              2446
0                            27759                             26099
1                             1591                              2190
   BMDM.Dead.24 - BMDM.uninfected.24 J774.Live.4 - J774.uninfected.4
-1                              1281                             153
0                              28293                           30289
1                               1161                             293
   J774.Dead.4 - J774.uninfected.4 J774.Live.24 - J774.uninfected.24
-1                              91                              1794
0                            30369                             26974
1                              275                              1967
   J774.Dead.24 - J774.uninfected.24
-1                               379
0                              29969
1                                387

Extract fit contrasts

In [27]:
%%R -o cnames
cnames <- colnames(fit2$contrasts)
In [28]:
%%R -o t0,t1,t2,t3,t4,t5,t6,t7,t8,t9
t0 <- topTable(fit2, n = 40000, p.value = 1.)
t0 <- cbind(Row.Names = rownames(t0), t0)
t1 <- topTable(fit2, coef="BMDM.uninfected.24 - BMDM.uninfected.4", n = 20000, lfc=.27, p.value=.05)
t1 <- cbind(Row.Names = rownames(t1), t1)
t2 <- topTable(fit2, coef="BMDM.Live.4 - BMDM.uninfected.4", n = 20000, lfc=.27, p.value=.05)
t2 <- cbind(Row.Names = rownames(t2), t2)
t3 <- topTable(fit2, coef="BMDM.Dead.4 - BMDM.uninfected.4", n = 20000, lfc=.27, p.value=.05)
t3 <- cbind(Row.Names = rownames(t3), t3)
t4 <- topTable(fit2, coef="BMDM.Live.24 - BMDM.uninfected.24", n = 20000, lfc=.27, p.value=.05)
t4 <- cbind(Row.Names = rownames(t4), t4)
t5 <- topTable(fit2, coef="BMDM.Dead.24 - BMDM.uninfected.24", n = 20000, lfc=.27, p.value=.05)
t5 <- cbind(Row.Names = rownames(t5), t5)
t6 <- topTable(fit2, coef="J774.Live.4 - J774.uninfected.4", n = 20000, lfc=.27, p.value=.05)
t6 <- cbind(Row.Names = rownames(t6), t6)
t7 <- topTable(fit2, coef="J774.Dead.4 - J774.uninfected.4", n = 20000, lfc=.27, p.value=.05)
t7 <- cbind(Row.Names = rownames(t7), t7)
t8 <- topTable(fit2, coef="J774.Live.24 - J774.uninfected.24", n = 20000, lfc=.27, p.value=.05)
t8 <- cbind(Row.Names = rownames(t8), t8)
t9 <- topTable(fit2, coef="J774.Dead.24 - J774.uninfected.24", n = 20000, lfc=.27, p.value=.05)
t9 <- cbind(Row.Names = rownames(t9), t9)
In [29]:
uid2t0 = dict((row2name[int(i[0])], list(i)) for i in t0)
In [30]:
t = [t1,t2,t3,t4,t5,t6,t7,t8,t9]

Find union of differential genes

In [31]:
def contrast_name(s):
    """Return simplified contrast name, factoring common elements."""
    terms = s.split(" - ")
    a = terms[0].split(".")
    b = terms[1].split(".")
    A = []
    B = []
    C = []
    for (i,j) in zip(a,b):
        if(i == j):
            C.append(i)
        else:
            A.append(i)
            B.append(j)
    return "%s/%s|%s" % (",".join(A),",".join(B),",".join(C))
In [32]:
dnames = map(contrast_name, cnames)
In [33]:
dnames
Out[33]:
['24/4|BMDM,uninfected',
 'Live/uninfected|BMDM,4',
 'Dead/uninfected|BMDM,4',
 'Live/uninfected|BMDM,24',
 'Dead/uninfected|BMDM,24',
 'Live/uninfected|J774,4',
 'Dead/uninfected|J774,4',
 'Live/uninfected|J774,24',
 'Dead/uninfected|J774,24']
In [34]:
differential = {}
for (name, table) in zip(dnames,t):
    differential[name] = {"up":   set(row2name[int(i[0])] for i in table if(i[1] > 0)),
                          "down": set(row2name[int(i[0])] for i in table if(i[1] < 0))}
   
for (key,val) in sorted(differential.items(), key = lambda x: len(x[1]["up"])+len(x[1]["down"])):
    print key, len(val["up"]), len(val["down"])
Dead/uninfected|J774,4 275 91
Live/uninfected|J774,4 293 153
Dead/uninfected|J774,24 387 379
Dead/uninfected|BMDM,24 1161 1281
Dead/uninfected|BMDM,4 1591 1385
Live/uninfected|J774,24 1967 1794
Live/uninfected|BMDM,4 1957 2036
24/4|BMDM,uninfected 2530 1862
Live/uninfected|BMDM,24 2190 2446

In [35]:
all_diff = set()
for group in differential.values():
    for uids in group.values():
        all_diff.update(uids)
len(all_diff)
Out[35]:
9423

Export heatmaps in CDT format

In [36]:
def fit_ratios(uid, order = None):
    a = uid2t0[uid][1:len(cnames)+1]
    if(order is None):
        return a
    return [a[i] for i in order]
In [37]:
def diffsets_to_ratios(uid, diffs):
    ratios = []
    for contrast in diffs:
        if(uid in contrast["up"]):
            ratios.append(4.)
        elif(uid in contrast["down"]):
            ratios.append(-4.)
        else:
            ratios.append(0.)
    return ratios

Limma fit contrasts for all genes

In [38]:
common = sorted(set(i.Uniqid() for i in ct_genes))
In [39]:
set(common) == set(uid2t0)
Out[39]:
True
In [40]:
diffs = [differential[i] for i in dnames]
fieldnames = dnames+[i+" sig" for i in dnames]
probes = []
for i in common:
    probes.append(CdtRow(gid = i, uniqid = i, name = gene2name[i], 
                         ratios = (fit_ratios(i)+
                                   diffsets_to_ratios(i,diffs))))

cdt_contrast = CdtFile(probes = probes,
                       fieldnames = fieldnames,
                       eweights = [1.]*len(fieldnames))

cdt_contrast.write(open("cdt_contrast.cdt","w"))
In [41]:
len(cdt_contrast)
Out[41]:
30735

Limma fit contrasts for significantly differential genes

In [42]:
cdt_contrast_sig = CdtFile.fromPrototype(
   cdt_contrast, 
   probes = [i for i in cdt_contrast if(i.Uniqid() in all_diff)])
len(cdt_contrast_sig)
Out[42]:
9423
In [43]:
%%time
tree = cdt_contrast_sig.cluster(dist = "u", method = "m", cols=range(len(cdt_contrast_sig.fieldnames)//2))
cdt_contrast_sig.writeCdtGtr("cdt_contrast_sig.um",tree)
Building array...
Building distance matrix...

CPU times: user 2min 26s, sys: 264 ms, total: 2min 26s
Wall time: 2min 26s

Clustering...

In [43]: