Goal: Merge abundance.tsv files from kallisto into TPM and est_counts heatmaps

In [1]:
%cd ../data/
/home/explorer/BMS270/data

In [2]:
!wget http://histo.ucsf.edu/BMS270/BMS270_2019/data/GSE88801_kallisto.tar.gz
--2019-05-29 12:37:28--  http://histo.ucsf.edu/BMS270/BMS270_2019/data/GSE88801_kallisto.tar.gz
Resolving histo.ucsf.edu (histo.ucsf.edu)... 10.37.29.26
Connecting to histo.ucsf.edu (histo.ucsf.edu)|10.37.29.26|:80... connected.
HTTP request sent, awaiting response... 200 OK
Length: 41688072 (40M) [application/x-gzip]
Saving to: ‘GSE88801_kallisto.tar.gz’

GSE88801_kallisto.t 100%[===================>]  39.76M  4.56MB/s    in 8.6s    

2019-05-29 12:37:36 (4.63 MB/s) - ‘GSE88801_kallisto.tar.gz’ saved [41688072/41688072]


In [4]:
!wget 'http://histo.ucsf.edu/BMS270/BMS270_2019/data/Mucci2.transcriptome.fasta.gz'
--2019-05-29 12:45:40--  http://histo.ucsf.edu/BMS270/BMS270_2019/data/Mucci2.transcriptome.fasta.gz
Resolving histo.ucsf.edu (histo.ucsf.edu)... 10.37.29.26
Connecting to histo.ucsf.edu (histo.ucsf.edu)|10.37.29.26|:80... connected.
HTTP request sent, awaiting response... 200 OK
Length: 4332692 (4.1M) [application/x-gzip]
Saving to: ‘Mucci2.transcriptome.fasta.gz’

Mucci2.transcriptom 100%[===================>]   4.13M  4.51MB/s    in 0.9s    

2019-05-29 12:45:41 (4.51 MB/s) - ‘Mucci2.transcriptome.fasta.gz’ saved [4332692/4332692]


In [5]:
# -p keeps this cell re-runnable: no error if the directory already exists.
!mkdir -p GSE88801
In [6]:
%cd GSE88801/
/home/explorer/BMS270/data/GSE88801

In [8]:
!tar -zxvf ../GSE88801_kallisto.tar.gz
GSM2348248/abundance.tsv
GSM2348249/abundance.tsv
GSM2348250/abundance.tsv
GSM2348251/abundance.tsv
GSM2348252/abundance.tsv
GSM2348253/abundance.tsv
GSM2348254/abundance.tsv
GSM2348255/abundance.tsv
GSM2348256/abundance.tsv
GSM2348257/abundance.tsv
GSM2348258/abundance.tsv
GSM2348259/abundance.tsv
GSM2348260/abundance.tsv
GSM2348261/abundance.tsv
GSM2348262/abundance.tsv
GSM2348263/abundance.tsv
GSM2348264/abundance.tsv
GSM2348265/abundance.tsv
GSM2348266/abundance.tsv
GSM2348267/abundance.tsv
GSM2348268/abundance.tsv
GSM2348269/abundance.tsv
GSM2348270/abundance.tsv
GSM2348271/abundance.tsv
GSM2348272/abundance.tsv
GSM2348273/abundance.tsv
GSM2348274/abundance.tsv
GSM2348275/abundance.tsv
GSM2348276/abundance.tsv
GSM2348277/abundance.tsv
GSM2348278/abundance.tsv
GSM2348279/abundance.tsv
GSM2348280/abundance.tsv
GSM2348281/abundance.tsv
GSM2348282/abundance.tsv
GSM2348283/abundance.tsv

In [4]:
%cd ../data/GSE88801/GSM2348248
/home/explorer/BMS270/data/GSE88801/GSM2348248

In [5]:
!ls
abundance.tsv

In [6]:
!head abundance.tsv
target_id	length	eff_length	est_counts	tpm
ENSMUST00000178537	12	6.20281	0	0
ENSMUST00000178862	14	6.94823	0	0
ENSMUST00000177564	16	7.66255	0	0
ENSMUST00000179664	11	5.81797	0	0
ENSMUST00000179883	16	7.66255	0	0
ENSMUST00000195858	10	5.42482	0	0
ENSMUST00000179932	12	6.20281	0	0
ENSMUST00000180001	17	8.0085	0	0
ENSMUST00000180266	17	8.0085	0	0

In [7]:
# Read every line of the file (header line included) as raw strings.
# The with-block closes the file handle as soon as the read finishes.
with open("abundance.tsv") as abundance_file:
    data = abundance_file.readlines()
In [8]:
len(data)
Out[8]:
88199
In [9]:
data[-1]
Out[9]:
'ENSMUST00000194300\t400\t250.785\t0\t0\n'
In [10]:
data[100]
Out[10]:
'ENSMUST00000103273\t370\t220.786\t0\t0\n'
In [12]:
from csv import reader, excel_tab

# Parse the tab-delimited abundance table into three parallel lists.
data = []        # each data row as a list of strings (header excluded)
est_counts = []  # second-to-last column of each row, converted to float
tpm = []         # last column of each row, converted to float

# newline="" is what the csv module asks for when it is given a file object;
# the with-block guarantees the file is closed once parsing is done.
with open("abundance.tsv", newline="") as abundance_file:
    fp = reader(abundance_file, dialect=excel_tab)
    next(fp)  # skip the header line so only data rows are converted
    for i in fp:
        data.append(i)
        tpm.append(float(i[-1]))
        est_counts.append(float(i[-2]))
In [13]:
%matplotlib nbagg
import matplotlib.pyplot as plt
In [14]:
# Plot the TPM values in ascending order against their position in the sorted list.
plt.plot(sorted(tpm))
Out[14]:
[<matplotlib.lines.Line2D at 0x7f530dccbbe0>]
In [ ]: