Goal: Look at the mapped reads relative to sequencing depth for Mucor and Mouse, as a guide for picking sample files and their respective depths.
%cd ~/pdf/papers/mBio_10_e02765/kallisto/
%matplotlib
import matplotlib.pyplot as plt
import re
from MsvUtil import Table
# Load the sample sheet. Later code iterates over `table` and indexes each
# item by column name ("Run", "mucor", "condition", "batch").
# NOTE(review): `Table` comes from the project-local MsvUtil module; its exact
# semantics are not visible here.
table = Table.fromCsv("../sample_table.csv")
# Quick look at what was loaded: the table's length and its first item.
print(len(table))
print(table[0])
# Presumably the distinct values found in the "mucor" and "condition" columns
# (the values the colouring functions compare against) - TODO confirm that
# indexing a Table by name yields the column.
# These are bare expressions, so they only show output in an interactive
# session; in a plain script they have no visible effect.
set(table["mucor"])
set(table["condition"])
# Matches the log line reporting how many reads were processed and how many
# of them pseudoaligned. Both counts may contain thousands separators
# (commas), which are stripped before conversion to int.
# Raw string: "\d" in a plain string literal is an invalid escape sequence.
log_re = re.compile(r"processed (?P<total>[\d,]+) reads, (?P<aligned>[\d,]+) reads pseudoaligned")

# One entry per item of `table`, in table order:
#   depths       - total reads processed
#   mucor_mapped - reads pseudoaligned, taken from "<Run>_rf.err"
depths = []
mucor_mapped = []
for i in table:
    # `with` closes each log as soon as its summary line has been found.
    with open("{run}_rf.err".format(run=i["Run"])) as log:
        for line in log:
            p = log_re.search(line)
            if p is not None:
                depths.append(int(p.group("total").replace(",", "")))
                mucor_mapped.append(int(p.group("aligned").replace(",", "")))
                # Only the first matching line of each log is used.
                break
        else:
            # for/else: reached only when the whole log was read without a
            # match. The call form of `raise` is valid on Python 2 and 3;
            # the original `raise ValueError, "..."` is Python-2-only syntax.
            raise ValueError("Couldn't find depth line in kallisto quant log")
# Reads pseudoaligned according to each run's "<Run>_rf_GRCm38.err" log, one
# entry per item of `table`, in table order. `depths` (filled in from the
# "<Run>_rf.err" logs) supplies the total each log is expected to report.
mouse_mapped = []
for (i, d) in zip(table, depths):
    # `with` closes each log as soon as its summary line has been found.
    with open("{run}_rf_GRCm38.err".format(run=i["Run"])) as log:
        for line in log:
            p = log_re.search(line)
            if p is not None:
                total = int(p.group("total").replace(",", ""))
                # Sanity check: both logs for a run must report the same
                # number of processed reads.
                assert d == total, (
                    "{run}: total reads differ between logs ({a} vs {b})".format(
                        run=i["Run"], a=d, b=total))
                mouse_mapped.append(int(p.group("aligned").replace(",", "")))
                # Only the first matching line of each log is used.
                break
        else:
            # for/else: reached only when the whole log was read without a
            # match. The call form of `raise` is valid on Python 2 and 3;
            # the original `raise ValueError, "..."` is Python-2-only syntax.
            raise ValueError("Couldn't find depth line in kallisto quant log")
def color(sample):
    """Return the scatter-plot colour name for one sample.

    `sample` is indexed by column name. The checks are applied in order:

    * "mucor" equal to "un"              -> "red"
    * "condition" not equal to "mac"     -> "green"
    * otherwise, "batch" equal to "1"    -> "blue", any other batch -> "cyan"

    "batch" is compared against the string "1", not the integer 1, and is
    only read when the first two checks do not already decide the colour.
    """
    if sample["mucor"] == "un":
        return "red"
    if sample["condition"] != "mac":
        return "green"
    return "blue" if sample["batch"] == "1" else "cyan"
# Figure 1: total reads (x) against the counts collected in `mucor_mapped`
# (y), one point per item of `table`, coloured by color().
# The pyplot calls act on the current figure, so each plt.figure() must come
# before the scatter/label/title calls that belong to it.
fig = plt.figure()
plt.scatter(depths,mucor_mapped,c = [color(i) for i in table])
plt.xlabel("Total reads")
plt.ylabel("Reads pseudo-aligned to transcriptome")
plt.title("Mucor kallisto")
# NOTE(review): bare expression - presumably the last line of a notebook cell
# so that the figure is displayed; it has no effect in a plain script.
fig
# Figure 2: same axes and colouring, with the counts from `mouse_mapped` on y.
fig = plt.figure()
plt.scatter(depths,mouse_mapped,c = [color(i) for i in table])
plt.xlabel("Total reads")
plt.ylabel("Reads pseudo-aligned to transcriptome")
plt.title("Mouse kallisto")
# NOTE(review): bare expression, see the note on the first figure.
fig
def color(sample):
    """Return the scatter-plot colour name for one sample, keyed on "mucor".

    `sample` is indexed by column name. The checks are applied in order:

    * "mucor" equal to "un"            -> "red"
    * "condition" not equal to "mac"   -> "green"
    * otherwise the colour depends on the "mucor" value:
      "R7B" -> "blue", "NRRL3631" -> "cyan", "atf1" -> "purple",
      anything else -> "magenta"
    """
    if sample["mucor"] == "un":
        return "red"
    if sample["condition"] != "mac":
        return "green"
    by_mucor = {
        "R7B": "blue",
        "NRRL3631": "cyan",
        "atf1": "purple",
    }
    # Values without an entry of their own share the fallback colour.
    return by_mucor.get(sample["mucor"], "magenta")
def edge(sample):
    """Return the marker edge colour name for one sample.

    "grey" when the sample's "batch" value equals the string "1", "black"
    for every other value (including the integer 1).
    """
    return "grey" if sample["batch"] == "1" else "black"
# Figure 1: total reads (x) against the counts collected in `mucor_mapped`
# (y), one point per item of `table`; fill colour from color(), marker edge
# colour from edge().
# The pyplot calls act on the current figure, so each plt.figure() must come
# before the scatter/label/title calls that belong to it.
fig = plt.figure()
plt.scatter(depths,mucor_mapped,c = [color(i) for i in table], edgecolors = [edge(i) for i in table])
plt.xlabel("Total reads")
plt.ylabel("Reads pseudo-aligned to transcriptome")
plt.title("Mucor kallisto")
# NOTE(review): bare expression - presumably the last line of a notebook cell
# so that the figure is displayed; it has no effect in a plain script.
fig
# Figure 2: same axes and styling, with the counts from `mouse_mapped` on y.
fig = plt.figure()
plt.scatter(depths,mouse_mapped,c = [color(i) for i in table], edgecolors = [edge(i) for i in table])
plt.xlabel("Total reads")
plt.ylabel("Reads pseudo-aligned to transcriptome")
plt.title("Mouse kallisto")
# NOTE(review): bare expression, see the note on the first figure.
fig
# Figure 3: the two pseudoaligned counts against each other
# (`mucor_mapped` on x, `mouse_mapped` on y), same styling.
fig = plt.figure()
plt.scatter(mucor_mapped,mouse_mapped,c = [color(i) for i in table], edgecolors = [edge(i) for i in table])
plt.xlabel("Reads pseudo-aligned to mucor")
plt.ylabel("Reads pseudo-aligned to mouse")
plt.title("Mouse/Mucor kallisto")
# NOTE(review): bare expression, see the note on the first figure.
fig
mvoorhie@ayanganna:~/pdf/papers/mBio_10_e02765/SRP154454$ time zcat SRR7541452_1.fastq.gz | \
    stream_sampler.py -s 42 -f fastq - 6000000 | gzip > ~/SRR7541452.6M.fastq.gz

real 2m31.357s
user 3m13.668s
sys 0m5.820s
That gives 313M, which is big, but let's start with it anyway.