Goal: Fit contrasts from tables S1 and S2 of the paper. Choose abundance and fold-change filters liberal enough for direct comparison with these supplements.

%cd ~/pdf/papers/SciRep_7_42225/

/home/mvoorhie/pdf/papers/SciRep_7_42225

from CdtFile import CdtFile, CdtRow
from MsvUtil import Table, revdict, multirevdict
from SafeMath import safelog, safesub, safeadd, safesum

import gzip
from math import log
import os
import numpy as np

# Uncomment this line for newer Jupyter stacks (e.g., Debian 9 (stretch), current Enthought Canopy)
#%load_ext rpy2.ipython 
# Uncomment this line for older IPython stacks (e.g., Debian 8 (jessie), Ubuntu 16.04 (xenial))
%load_ext rmagic

%%R
library(limma)
library(edgeR)

Load Annotations

Sample annotations

Hand compiled spreadsheet based on paper supplement, mapped to GEO IDs via read counts

samples = Table.fromCsv("sample_table_v2.csv")
print len(samples)
print samples[0]

36
name:		GSM2348248
infection:		uninfected
strain:		J774
time:		4
replicate:		1

ival = {"Live":0,"Dead":1,"uninfected":2}
samples = Table(header = samples.header[:],
                rows = [list(j) 
                        for j in sorted(samples, key = lambda i: (i["strain"],ival[i["infection"]],int(i["replicate"]),int(i["time"])))])

name2sample = dict((i["name"],i) for i in samples)

ENSEMBL Mouse Transcript ID -> Gene ID mapping

import re
gtf_re = re.compile(r'(?P<key>[\S]+)[\s]+"(?P<val>[^"]+)";')

transcript2gene = {}
gene2name = {}
for line in gzip.open("/home/bms270/BMS270_2017/Mus_musculus.GRCm38.79.gtf.gz"):
    if(line.startswith("#")):
        continue
    fields = line.split("\t")
    if(fields[2] == "transcript"):
        transcript = None
        gene = None
        for anno in gtf_re.finditer(fields[8]):
            if(anno.group("key") == "gene_id"):
                gene = anno.group("val")
            elif(anno.group("key") == "transcript_id"):
                transcript = anno.group("val")
        assert(None not in (gene,transcript))
        transcript2gene[transcript] = gene
    elif(fields[2] == "gene"):
        gene = None
        name = None
        for anno in gtf_re.finditer(fields[8]):
            if(anno.group("key") == "gene_id"):
                gene = anno.group("val")
            elif(anno.group("key") == "gene_name"):
                name = anno.group("val")
        assert(gene is not None)
        if(name is None):
            name = gene
        gene2name[gene] = name
len(transcript2gene),len(gene2name)

(104129, 43629)

gene2transcript = multirevdict(transcript2gene)
len(gene2transcript)

43629

Merge count and TPM values from kallisto

Step 1: TPMs

genes = None
cols = []
for i in samples["name"]:
    table = Table.fromTdt(open(os.path.join(i,"abundance.tsv")))
    if(genes is None):
        genes = table["target_id"]
    else:
        assert(genes == table["target_id"])
    # N.B.: Not log transforming yet so that we can sum over transcript TPMs for each gene
    cols.append([float(i) for i in table["tpm"]])

trans_tpm = CdtFile(probes = [CdtRow(gid = i[0], uniqid = i[0], name = transcript2gene[i[0]],
                                        ratios = i[1:])
                                 for i in zip(*([genes]+cols))],
                       fieldnames = samples["name"],
                       eweights = [1]*len(samples["name"]))
len(trans_tpm)

88198

genes = set(i.Name() for i in trans_tpm)
len(genes)

30735

def gene_row(gene, cdt, gene2transcript):
    rows = [cdt.GetUid(i) for i in gene2transcript[gene]]
    return CdtRow(gid = gene, uniqid = gene, name = gene,
                  ratios = [safelog(sum(j)) for j in zip(*rows)])

# N.B.: At this point, we switch to log space
genes_tpm = CdtFile.fromPrototype(trans_tpm, probes = [gene_row(i, trans_tpm, gene2transcript) for i in genes])
len(genes_tpm)

30735

Step 2: estimated counts

Following the same method as for TPMs

genes2 = None
cols = []
for i in samples["name"]:
    table = Table.fromTdt(open(os.path.join(i,"abundance.tsv")))
    if(genes2 is None):
        genes2 = table["target_id"]
    else:
        assert(genes2 == table["target_id"])
    # N.B.: Not log transforming yet so that we can sum over transcript TPMs for each gene
    cols.append([float(i) for i in table["est_counts"]])

trans_count = CdtFile(probes = [CdtRow(gid = i[0], uniqid = i[0], name = transcript2gene[i[0]],
                                        ratios = i[1:])
                                 for i in zip(*([genes2]+cols))],
                       fieldnames = samples["name"],
                       eweights = [1]*len(samples["name"]))
len(trans_count)

88198

genes2 = set(i.Name() for i in trans_count)
genes == genes2

True

def gene_row(gene, cdt, gene2transcript):
    rows = [cdt.GetUid(i) for i in gene2transcript[gene]]
    return CdtRow(gid = gene, uniqid = gene, name = gene,
                  ratios = [sum(j) for j in zip(*rows)])

# Note that we *do not* log transform counts.
genes_count = CdtFile.fromPrototype(trans_tpm, probes = [gene_row(i, trans_count, gene2transcript) for i in genes])
len(genes_count)

30735

TPM Filter

def clip(x):
    if(x <= 0.):
        return 0.
    return x

def clip_cdt(cdt):
    return CdtFile.fromPrototype(
           cdt,
           probes = [CdtRow.fromPrototype(i, ratios = [clip(j) for j in i])
                     for i in cdt])

def threshold_cdt(cdt, thresh):
    return CdtFile.fromPrototype(
           cdt,
           probes = [i for i in cdt if(max(i) >= thresh)])

Filter for genes with $TPM \geq 1$ in at least one sample (Normally, I would be more conservative here, but the paper reports a lot of low abundance differential genes, and I'd like to see how limma handles them. If we were being really careful, we would filter based on count rather than on TPM)

ct_genes = threshold_cdt(clip_cdt(genes_tpm), 0)
len(ct_genes), len(genes_tpm)

(30735, 30735)

Note that this filter does not remove any genes

ct_count = CdtFile.fromPrototype(genes_count, probes = [genes_count.GetUid(i.Uniqid()) for i in ct_genes])
len(ct_count)

30735

Stage for import into R

C = np.array([[clip(j) for j in i] for i in ct_count])
C.shape

(30735, 36)

name2row = dict((i.Name(),n+1) for (n,i) in enumerate(ct_count))
row2name = revdict(name2row)
map(len,(name2row,row2name))

[30735, 30735]

state = []
cols = []
infection = []
strain = []
time = []
replicate = []
for i in samples:
    cols.append(genes_count.fieldnames.index(i["name"]))
    time.append(int(i["time"]))
    infection.append(i["infection"])
    strain.append(i["strain"])
    replicate.append(i["replicate"])
    state.append("%s.%s.%d" % (strain[-1],infection[-1], time[-1]))

Limma fit

%%R -i C
dge <- DGEList(counts=C)
dge <- calcNormFactors(dge)

%%R -i state,strain,infection,time -o d,cpm,fc,cn
state <- as.factor(state)
strain <- as.factor(strain)
infection <- as.factor(infection)
time <- as.factor(time)
d <- model.matrix(~0+state)
colnames(d) <- gsub("state","",colnames(d))
print(colnames(d))

v <- voom(dge, d, plot = TRUE)
fit <- lmFit(v, d)

contrast.matrix <- makeContrasts(# S1 contrast
                                 BMDM.uninfected.24-BMDM.uninfected.4,
                                 # S2 contrasts
                                 BMDM.Live.4-BMDM.uninfected.4,BMDM.Dead.4-BMDM.uninfected.4,
                                 BMDM.Live.24-BMDM.uninfected.24,BMDM.Dead.24-BMDM.uninfected.24,
                                 J774.Live.4-J774.uninfected.4,J774.Dead.4-J774.uninfected.4,
                                 J774.Live.24-J774.uninfected.24,J774.Dead.24-J774.uninfected.24,
                                 levels=d)

fit2 <- contrasts.fit(fit, contrast.matrix)
fit2 <- eBayes(fit2)
print(summary(decideTests(fit2)))

fc <- fit$coefficients
cn <- colnames(fit$coefficients)
cpm <- v$E

 [1] "BMDM.Dead.24"       "BMDM.Dead.4"        "BMDM.Live.24"      
 [4] "BMDM.Live.4"        "BMDM.uninfected.24" "BMDM.uninfected.4" 
 [7] "J774.Dead.24"       "J774.Dead.4"        "J774.Live.24"      
[10] "J774.Live.4"        "J774.uninfected.24" "J774.uninfected.4" 
   BMDM.uninfected.24 - BMDM.uninfected.4 BMDM.Live.4 - BMDM.uninfected.4
-1                                   1862                            2036
0                                   26343                           26742
1                                    2530                            1957
   BMDM.Dead.4 - BMDM.uninfected.4 BMDM.Live.24 - BMDM.uninfected.24
-1                            1385                              2446
0                            27758                             26099
1                             1592                              2190
   BMDM.Dead.24 - BMDM.uninfected.24 J774.Live.4 - J774.uninfected.4
-1                              1281                             153
0                              28293                           30289
1                               1161                             293
   J774.Dead.4 - J774.uninfected.4 J774.Live.24 - J774.uninfected.24
-1                              91                              1794
0                            30369                             26974
1                              275                              1967
   J774.Dead.24 - J774.uninfected.24
-1                               379
0                              29969
1                                387

Fold-change filter chosen to match minimum log2 fold change in the supplements. Normally, I would use something higher.

%%R
print(summary(decideTests(fit2, lfc=.27)))

   BMDM.uninfected.24 - BMDM.uninfected.4 BMDM.Live.4 - BMDM.uninfected.4
-1                                   1862                            2036
0                                   26343                           26742
1                                    2530                            1957
   BMDM.Dead.4 - BMDM.uninfected.4 BMDM.Live.24 - BMDM.uninfected.24
-1                            1385                              2446
0                            27759                             26099
1                             1591                              2190
   BMDM.Dead.24 - BMDM.uninfected.24 J774.Live.4 - J774.uninfected.4
-1                              1281                             153
0                              28293                           30289
1                               1161                             293
   J774.Dead.4 - J774.uninfected.4 J774.Live.24 - J774.uninfected.24
-1                              91                              1794
0                            30369                             26974
1                              275                              1967
   J774.Dead.24 - J774.uninfected.24
-1                               379
0                              29969
1                                387

Extract fit contrasts

%%R -o cnames
cnames <- colnames(fit2$contrasts)

%%R -o t0,t1,t2,t3,t4,t5,t6,t7,t8,t9
t0 <- topTable(fit2, n = 40000, p.value = 1.)
t0 <- cbind(Row.Names = rownames(t0), t0)
t1 <- topTable(fit2, coef="BMDM.uninfected.24 - BMDM.uninfected.4", n = 20000, lfc=.27, p.value=.05)
t1 <- cbind(Row.Names = rownames(t1), t1)
t2 <- topTable(fit2, coef="BMDM.Live.4 - BMDM.uninfected.4", n = 20000, lfc=.27, p.value=.05)
t2 <- cbind(Row.Names = rownames(t2), t2)
t3 <- topTable(fit2, coef="BMDM.Dead.4 - BMDM.uninfected.4", n = 20000, lfc=.27, p.value=.05)
t3 <- cbind(Row.Names = rownames(t3), t3)
t4 <- topTable(fit2, coef="BMDM.Live.24 - BMDM.uninfected.24", n = 20000, lfc=.27, p.value=.05)
t4 <- cbind(Row.Names = rownames(t4), t4)
t5 <- topTable(fit2, coef="BMDM.Dead.24 - BMDM.uninfected.24", n = 20000, lfc=.27, p.value=.05)
t5 <- cbind(Row.Names = rownames(t5), t5)
t6 <- topTable(fit2, coef="J774.Live.4 - J774.uninfected.4", n = 20000, lfc=.27, p.value=.05)
t6 <- cbind(Row.Names = rownames(t6), t6)
t7 <- topTable(fit2, coef="J774.Dead.4 - J774.uninfected.4", n = 20000, lfc=.27, p.value=.05)
t7 <- cbind(Row.Names = rownames(t7), t7)
t8 <- topTable(fit2, coef="J774.Live.24 - J774.uninfected.24", n = 20000, lfc=.27, p.value=.05)
t8 <- cbind(Row.Names = rownames(t8), t8)
t9 <- topTable(fit2, coef="J774.Dead.24 - J774.uninfected.24", n = 20000, lfc=.27, p.value=.05)
t9 <- cbind(Row.Names = rownames(t9), t9)

uid2t0 = dict((row2name[int(i[0])], list(i)) for i in t0)

t = [t1,t2,t3,t4,t5,t6,t7,t8,t9]

Find union of differential genes

def contrast_name(s):
    """Return simplified contrast name, factoring common elements."""
    terms = s.split(" - ")
    a = terms[0].split(".")
    b = terms[1].split(".")
    A = []
    B = []
    C = []
    for (i,j) in zip(a,b):
        if(i == j):
            C.append(i)
        else:
            A.append(i)
            B.append(j)
    return "%s/%s|%s" % (",".join(A),",".join(B),",".join(C))

dnames = map(contrast_name, cnames)

dnames

['24/4|BMDM,uninfected',
 'Live/uninfected|BMDM,4',
 'Dead/uninfected|BMDM,4',
 'Live/uninfected|BMDM,24',
 'Dead/uninfected|BMDM,24',
 'Live/uninfected|J774,4',
 'Dead/uninfected|J774,4',
 'Live/uninfected|J774,24',
 'Dead/uninfected|J774,24']

differential = {}
for (name, table) in zip(dnames,t):
    differential[name] = {"up":   set(row2name[int(i[0])] for i in table if(i[1] > 0)),
                          "down": set(row2name[int(i[0])] for i in table if(i[1] < 0))}
   
for (key,val) in sorted(differential.items(), key = lambda x: len(x[1]["up"])+len(x[1]["down"])):
    print key, len(val["up"]), len(val["down"])

Dead/uninfected|J774,4 275 91
Live/uninfected|J774,4 293 153
Dead/uninfected|J774,24 387 379
Dead/uninfected|BMDM,24 1161 1281
Dead/uninfected|BMDM,4 1591 1385
Live/uninfected|J774,24 1967 1794
Live/uninfected|BMDM,4 1957 2036
24/4|BMDM,uninfected 2530 1862
Live/uninfected|BMDM,24 2190 2446

all_diff = set()
for group in differential.values():
    for uids in group.values():
        all_diff.update(uids)
len(all_diff)

9423

Export heatmaps in CDT format

def fit_ratios(uid, order = None):
    a = uid2t0[uid][1:len(cnames)+1]
    if(order is None):
        return a
    return [a[i] for i in order]

def diffsets_to_ratios(uid, diffs):
    ratios = []
    for contrast in diffs:
        if(uid in contrast["up"]):
            ratios.append(4.)
        elif(uid in contrast["down"]):
            ratios.append(-4.)
        else:
            ratios.append(0.)
    return ratios

Limma fit contrasts for all genes

common = sorted(set(i.Uniqid() for i in ct_genes))

set(common) == set(uid2t0)

True

diffs = [differential[i] for i in dnames]
fieldnames = dnames+[i+" sig" for i in dnames]
probes = []
for i in common:
    probes.append(CdtRow(gid = i, uniqid = i, name = gene2name[i], 
                         ratios = (fit_ratios(i)+
                                   diffsets_to_ratios(i,diffs))))

cdt_contrast = CdtFile(probes = probes,
                       fieldnames = fieldnames,
                       eweights = [1.]*len(fieldnames))

cdt_contrast.write(open("cdt_contrast.cdt","w"))

len(cdt_contrast)

30735

Limma fit contrasts for significantly differential genes

cdt_contrast_sig = CdtFile.fromPrototype(
   cdt_contrast, 
   probes = [i for i in cdt_contrast if(i.Uniqid() in all_diff)])
len(cdt_contrast_sig)

9423

%%time
tree = cdt_contrast_sig.cluster(dist = "u", method = "m", cols=range(len(cdt_contrast_sig.fieldnames)//2))
cdt_contrast_sig.writeCdtGtr("cdt_contrast_sig.um",tree)

Building array...
Building distance matrix...

CPU times: user 2min 26s, sys: 264 ms, total: 2min 26s
Wall time: 2min 26s

Clustering...