Goal: Merge abundance.tsv files from kallisto into TPM and est_counts heatmaps

%cd ../data/

/home/explorer/BMS270/data

!wget http://histo.ucsf.edu/BMS270/BMS270_2019/data/GSE88801_kallisto.tar.gz

--2019-05-29 12:37:28--  http://histo.ucsf.edu/BMS270/BMS270_2019/data/GSE88801_kallisto.tar.gz
Resolving histo.ucsf.edu (histo.ucsf.edu)... 10.37.29.26
Connecting to histo.ucsf.edu (histo.ucsf.edu)|10.37.29.26|:80... connected.
HTTP request sent, awaiting response... 200 OK
Length: 41688072 (40M) [application/x-gzip]
Saving to: ‘GSE88801_kallisto.tar.gz’

GSE88801_kallisto.t 100%[===================>]  39.76M  4.56MB/s    in 8.6s    

2019-05-29 12:37:36 (4.63 MB/s) - ‘GSE88801_kallisto.tar.gz’ saved [41688072/41688072]

!wget 'http://histo.ucsf.edu/BMS270/BMS270_2019/data/Mucci2.transcriptome.fasta.gz'

--2019-05-29 12:45:40--  http://histo.ucsf.edu/BMS270/BMS270_2019/data/Mucci2.transcriptome.fasta.gz
Resolving histo.ucsf.edu (histo.ucsf.edu)... 10.37.29.26
Connecting to histo.ucsf.edu (histo.ucsf.edu)|10.37.29.26|:80... connected.
HTTP request sent, awaiting response... 200 OK
Length: 4332692 (4.1M) [application/x-gzip]
Saving to: ‘Mucci2.transcriptome.fasta.gz’

Mucci2.transcriptom 100%[===================>]   4.13M  4.51MB/s    in 0.9s    

2019-05-29 12:45:41 (4.51 MB/s) - ‘Mucci2.transcriptome.fasta.gz’ saved [4332692/4332692]

!mkdir GSE88801

%cd GSE88801/

/home/explorer/BMS270/data/GSE88801

!tar -zxvf ../GSE88801_kallisto.tar.gz

GSM2348248/abundance.tsv
GSM2348249/abundance.tsv
GSM2348250/abundance.tsv
GSM2348251/abundance.tsv
GSM2348252/abundance.tsv
GSM2348253/abundance.tsv
GSM2348254/abundance.tsv
GSM2348255/abundance.tsv
GSM2348256/abundance.tsv
GSM2348257/abundance.tsv
GSM2348258/abundance.tsv
GSM2348259/abundance.tsv
GSM2348260/abundance.tsv
GSM2348261/abundance.tsv
GSM2348262/abundance.tsv
GSM2348263/abundance.tsv
GSM2348264/abundance.tsv
GSM2348265/abundance.tsv
GSM2348266/abundance.tsv
GSM2348267/abundance.tsv
GSM2348268/abundance.tsv
GSM2348269/abundance.tsv
GSM2348270/abundance.tsv
GSM2348271/abundance.tsv
GSM2348272/abundance.tsv
GSM2348273/abundance.tsv
GSM2348274/abundance.tsv
GSM2348275/abundance.tsv
GSM2348276/abundance.tsv
GSM2348277/abundance.tsv
GSM2348278/abundance.tsv
GSM2348279/abundance.tsv
GSM2348280/abundance.tsv
GSM2348281/abundance.tsv
GSM2348282/abundance.tsv
GSM2348283/abundance.tsv

# Already inside data/GSE88801 (see the earlier %cd GSE88801/), so step straight into the sample directory.
%cd GSM2348248

/home/explorer/BMS270/data/GSE88801/GSM2348248

!ls

abundance.tsv

!head abundance.tsv

target_id	length	eff_length	est_counts	tpm
ENSMUST00000178537	12	6.20281	0	0
ENSMUST00000178862	14	6.94823	0	0
ENSMUST00000177564	16	7.66255	0	0
ENSMUST00000179664	11	5.81797	0	0
ENSMUST00000179883	16	7.66255	0	0
ENSMUST00000195858	10	5.42482	0	0
ENSMUST00000179932	12	6.20281	0	0
ENSMUST00000180001	17	8.0085	0	0
ENSMUST00000180266	17	8.0085	0	0

# Read every line of the abundance table (header line included) into a list of strings.
# The `with` block closes the file handle as soon as the read finishes.
with open("abundance.tsv") as abundance_file:
    data = abundance_file.readlines()

len(data)

88199

data[-1]

'ENSMUST00000194300\t400\t250.785\t0\t0\n'

data[100]

'ENSMUST00000103273\t370\t220.786\t0\t0\n'

from csv import reader, excel_tab

# Parse the tab-separated abundance table into three parallel lists:
#   data       - every data row as a list of strings
#   est_counts - second-to-last column of each row, converted to float
#   tpm        - last column of each row, converted to float
data = []
est_counts = []
tpm = []

# newline="" is what the csv module asks for when it is handed a file object;
# the `with` block guarantees the handle is closed once parsing is done.
with open("abundance.tsv", newline="") as abundance_file:
    fp = reader(abundance_file, dialect=excel_tab)
    # Skip the header row (target_id, length, eff_length, est_counts, tpm).
    # The default keeps an empty file from raising StopIteration.
    next(fp, None)
    for row in fp:
        data.append(row)
        tpm.append(float(row[-1]))
        est_counts.append(float(row[-2]))

%matplotlib nbagg
import matplotlib.pyplot as plt

# Order the TPM values from smallest to largest, then draw them as a single line.
ranked_tpm = sorted(tpm)
plt.plot(ranked_tpm)

[<matplotlib.lines.Line2D at 0x7f530dccbbe0>]