In [1]:
%pwd
Out[1]:
'/home/explorer/BMS270/Notebooks'
In [2]:
# Switch to the data directory; the cells below open files by bare filename.
%cd ../data
/home/explorer/BMS270/data
In [3]:
%ls
sample_table.csv  SRR7541452.6M.fastq.gz
In [4]:
# Open the sample table and read its first line (the column header) as a raw
# string. The handle is left open on purpose: later cells keep calling
# fp.readline() on it, and it is closed explicitly further down.
fp = open("sample_table.csv")
fp.readline()
Out[4]:
'Run,desc,condition,mucor,batch,rep\n'
In [5]:
'run\tdesc\tcondition\n'
Out[5]:
'run\tdesc\tcondition\n'
In [6]:
# print() writes the characters themselves rather than the repr, so \t shows
# up as a tab and the trailing \n as an extra blank line.
print('run\tdesc\tcondition\n')
run	desc	condition

In [7]:
'run  desc condition\n'
Out[7]:
'run  desc condition\n'
In [8]:
fp.readline()
Out[8]:
'SRR7541435,"Mucor atf1 mutants cocultured with mouse macrophages (cell line J774A.1) for 5 hours, replicate b",mac,atf1,2,b\n'
In [9]:
line = fp.readline()
In [10]:
line
Out[10]:
'SRR7541439,"Mucor atf2 mutants cocultured with mouse macrophages (cell line J774A.1) for 5 hours, replicate b",mac,atf2,2,b\n'
In [11]:
type(line)
Out[11]:
str
In [12]:
# With no argument, split() breaks on whitespace: the description is cut into
# words while the comma-separated columns stay glued together.
line.split()
Out[12]:
['SRR7541439,"Mucor',
 'atf2',
 'mutants',
 'cocultured',
 'with',
 'mouse',
 'macrophages',
 '(cell',
 'line',
 'J774A.1)',
 'for',
 '5',
 'hours,',
 'replicate',
 'b",mac,atf2,2,b']
In [13]:
# Splitting on every comma also splits inside the quoted description field,
# so this yields 7 pieces instead of the 6 columns named in the header, and
# the last piece still carries the trailing newline.
line.split(",")
Out[13]:
['SRR7541439',
 '"Mucor atf2 mutants cocultured with mouse macrophages (cell line J774A.1) for 5 hours',
 ' replicate b"',
 'mac',
 'atf2',
 '2',
 'b\n']
In [14]:
fields = line.split(",")
In [15]:
type(fields)
Out[15]:
list
In [16]:
line
Out[16]:
'SRR7541439,"Mucor atf2 mutants cocultured with mouse macrophages (cell line J774A.1) for 5 hours, replicate b",mac,atf2,2,b\n'
In [17]:
from csv import reader
In [18]:
fp.close()
In [19]:
# Reopen the table from the top for csv.reader. newline="" is what the csv
# module asks for, so that the reader (not the file object) interprets line
# endings, including any newline inside a quoted field.
fp = open("sample_table.csv", newline="")
In [20]:
r = reader(fp)
In [21]:
next(r)
Out[21]:
['Run', 'desc', 'condition', 'mucor', 'batch', 'rep']
In [22]:
next(r)
Out[22]:
['SRR7541435',
 'Mucor atf1 mutants cocultured with mouse macrophages (cell line J774A.1) for 5 hours, replicate b',
 'mac',
 'atf1',
 '2',
 'b']
In [23]:
# In Python 3, urlopen lives in the urllib.request submodule;
# `from urllib import urlopen` is the Python 2 spelling and raises ImportError.
from urllib.request import urlopen
In [24]:
# Fetch the expression table over HTTP with the shell's wget; it is saved
# under its original filename in the current working directory.
!wget 'http://histo.ucsf.edu/BMS270/BMS270_2019/data/GSE88801_kallisto_TPMs_thresh10.cdt'
--2019-05-14 13:39:21--  http://histo.ucsf.edu/BMS270/BMS270_2019/data/GSE88801_kallisto_TPMs_thresh10.cdt
Resolving histo.ucsf.edu (histo.ucsf.edu)... 10.37.29.26
Connecting to histo.ucsf.edu (histo.ucsf.edu)|10.37.29.26|:80... connected.
HTTP request sent, awaiting response... 200 OK
Length: 6712929 (6.4M) [image/x-coreldrawtemplate]
Saving to: ‘GSE88801_kallisto_TPMs_thresh10.cdt’

GSE88801_kallisto_T 100%[===================>]   6.40M  4.06MB/s    in 1.6s    

2019-05-14 13:39:22 (4.06 MB/s) - ‘GSE88801_kallisto_TPMs_thresh10.cdt’ saved [6712929/6712929]

In [25]:
%ls
GSE88801_kallisto_TPMs_thresh10.cdt  sample_table.csv  SRR7541452.6M.fastq.gz
In [26]:
fp2 = open("GSE88801_kallisto_TPMs_thresh10.cdt")
In [27]:
fp2.readline()
Out[27]:
'ORF\tNAME\tBMDM_Live_1_4h\tBMDM_Live_1_24h\tBMDM_Live_2_4h\tBMDM_Live_2_24h\tBMDM_Live_3_4h\tBMDM_Live_3_24h\tBMDM_Dead_1_4h\tBMDM_Dead_1_24h\tBMDM_Dead_2_4h\tBMDM_Dead_2_24h\tBMDM_Dead_3_4h\tBMDM_Dead_3_24h\tBMDM_uninfected_1_4h\tBMDM_uninfected_1_24h\tBMDM_uninfected_2_4h\tBMDM_uninfected_2_24h\tBMDM_uninfected_3_4h\tBMDM_uninfected_3_24h\tJ774_Live_1_4h\tJ774_Live_1_24h\tJ774_Live_2_4h\tJ774_Live_2_24h\tJ774_Live_3_4h\tJ774_Live_3_24h\tJ774_Dead_1_4h\tJ774_Dead_1_24h\tJ774_Dead_2_4h\tJ774_Dead_2_24h\tJ774_Dead_3_4h\tJ774_Dead_3_24h\tJ774_uninfected_1_4h\tJ774_uninfected_1_24h\tJ774_uninfected_2_4h\tJ774_uninfected_2_24h\tJ774_uninfected_3_4h\tJ774_uninfected_3_24h\n'
In [28]:
from csv import excel_tab
In [29]:
# reader() defaults to comma-delimited parsing. This file is tab-delimited,
# so each line comes back as a single field with the tabs still inside it.
r2 = reader(open("GSE88801_kallisto_TPMs_thresh10.cdt"))
In [30]:
next(r2)
Out[30]:
['ORF\tNAME\tBMDM_Live_1_4h\tBMDM_Live_1_24h\tBMDM_Live_2_4h\tBMDM_Live_2_24h\tBMDM_Live_3_4h\tBMDM_Live_3_24h\tBMDM_Dead_1_4h\tBMDM_Dead_1_24h\tBMDM_Dead_2_4h\tBMDM_Dead_2_24h\tBMDM_Dead_3_4h\tBMDM_Dead_3_24h\tBMDM_uninfected_1_4h\tBMDM_uninfected_1_24h\tBMDM_uninfected_2_4h\tBMDM_uninfected_2_24h\tBMDM_uninfected_3_4h\tBMDM_uninfected_3_24h\tJ774_Live_1_4h\tJ774_Live_1_24h\tJ774_Live_2_4h\tJ774_Live_2_24h\tJ774_Live_3_4h\tJ774_Live_3_24h\tJ774_Dead_1_4h\tJ774_Dead_1_24h\tJ774_Dead_2_4h\tJ774_Dead_2_24h\tJ774_Dead_3_4h\tJ774_Dead_3_24h\tJ774_uninfected_1_4h\tJ774_uninfected_1_24h\tJ774_uninfected_2_4h\tJ774_uninfected_2_24h\tJ774_uninfected_3_4h\tJ774_uninfected_3_24h']
In [31]:
r2 = reader(open("GSE88801_kallisto_TPMs_thresh10.cdt"),dialect=excel_tab)
In [32]:
# Unbind the imported name; the next cell's NameError shows what happens when
# it is used without the import.
del excel_tab
In [33]:
r2 = reader(open("GSE88801_kallisto_TPMs_thresh10.cdt"),dialect=excel_tab)
---------------------------------------------------------------------------
NameError                                 Traceback (most recent call last)
<ipython-input-33-daecd4fdfae2> in <module>()
----> 1 r2 = reader(open("GSE88801_kallisto_TPMs_thresh10.cdt"),dialect=excel_tab)

NameError: name 'excel_tab' is not defined
In [34]:
from csv import excel_tab
In [35]:
# Parse the table as tab-delimited. The file handle is kept in its own
# variable so it can be closed once reading is done, and newline="" follows
# the csv module's guidance for files handed to reader().
cdt_fp = open("GSE88801_kallisto_TPMs_thresh10.cdt", newline="")
r2 = reader(cdt_fp, dialect=excel_tab)
In [36]:
header = next(r2)
In [37]:
header
Out[37]:
['ORF',
 'NAME',
 'BMDM_Live_1_4h',
 'BMDM_Live_1_24h',
 'BMDM_Live_2_4h',
 'BMDM_Live_2_24h',
 'BMDM_Live_3_4h',
 'BMDM_Live_3_24h',
 'BMDM_Dead_1_4h',
 'BMDM_Dead_1_24h',
 'BMDM_Dead_2_4h',
 'BMDM_Dead_2_24h',
 'BMDM_Dead_3_4h',
 'BMDM_Dead_3_24h',
 'BMDM_uninfected_1_4h',
 'BMDM_uninfected_1_24h',
 'BMDM_uninfected_2_4h',
 'BMDM_uninfected_2_24h',
 'BMDM_uninfected_3_4h',
 'BMDM_uninfected_3_24h',
 'J774_Live_1_4h',
 'J774_Live_1_24h',
 'J774_Live_2_4h',
 'J774_Live_2_24h',
 'J774_Live_3_4h',
 'J774_Live_3_24h',
 'J774_Dead_1_4h',
 'J774_Dead_1_24h',
 'J774_Dead_2_4h',
 'J774_Dead_2_24h',
 'J774_Dead_3_4h',
 'J774_Dead_3_24h',
 'J774_uninfected_1_4h',
 'J774_uninfected_1_24h',
 'J774_uninfected_2_4h',
 'J774_uninfected_2_24h',
 'J774_uninfected_3_4h',
 'J774_uninfected_3_24h']
In [38]:
row = next(r2)
In [39]:
row
Out[39]:
['ENSMUSG00000028180',
 'ENSMUSG00000028180',
 '6.362550367427176',
 '6.727814970645067',
 '5.933769484723882',
 '6.324633855252411',
 '6.300011673343996',
 '6.459424077257147',
 '6.377834837091959',
 '6.303301109319118',
 '6.415947237367395',
 '6.2040371993842145',
 '6.530726874373919',
 '6.266216094983205',
 '6.393225244921796',
 '6.453224368240938',
 '6.594934102465421',
 '6.5445476712839925',
 '6.994021522245025',
 '6.571406961935287',
 '7.392274493375117',
 '7.44675991054851',
 '7.753084209066774',
 '7.977082169126867',
 '7.786213029758888',
 '7.705710549943006',
 '7.43057097325026',
 '7.52388064668888',
 '7.801612997259689',
 '8.034651247248537',
 '7.239582395795018',
 '8.016029021943783',
 '7.347024653690205',
 '7.612796416014002',
 '7.6544356196138885',
 '7.760881187689698',
 '7.461089453549775',
 '7.8056366735095875']
In [40]:
row[0]
Out[40]:
'ENSMUSG00000028180'
In [41]:
row[1]
Out[41]:
'ENSMUSG00000028180'
In [42]:
row[2]
Out[42]:
'6.362550367427176'
In [43]:
row[-1]
Out[43]:
'7.8056366735095875'
In [44]:
row[-2]
Out[44]:
'7.461089453549775'
In [45]:
len(row)
Out[45]:
38
In [46]:
# Indices start at 0, so with len(row) == 38 the last valid index is 37;
# this is the same element as row[-1].
row[37]
Out[46]:
'7.8056366735095875'
In [47]:
# Slices are half-open: this returns indices 2, 3 and 4, but not 5.
row[2:5]
Out[47]:
['6.362550367427176', '6.727814970645067', '5.933769484723882']
In [48]:
row[:5]
Out[48]:
['ENSMUSG00000028180',
 'ENSMUSG00000028180',
 '6.362550367427176',
 '6.727814970645067',
 '5.933769484723882']
In [49]:
row[30:]
Out[49]:
['7.239582395795018',
 '8.016029021943783',
 '7.347024653690205',
 '7.612796416014002',
 '7.6544356196138885',
 '7.760881187689698',
 '7.461089453549775',
 '7.8056366735095875']
In [50]:
a = ["a","b","c","d"]
In [51]:
# Assignment does not copy: b becomes a second name for the same list object,
# so a change made through b is also visible through a.
b = a
In [52]:
b
Out[52]:
['a', 'b', 'c', 'd']
In [53]:
b[2]
Out[53]:
'c'
In [54]:
b[2] = "Hello"
In [55]:
b
Out[55]:
['a', 'b', 'Hello', 'd']
In [56]:
a
Out[56]:
['a', 'b', 'Hello', 'd']

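Make an independent copy of b: a full slice (`b[:]`) creates a new list, so changes to one list no longer show up in the other.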

In [57]:
# Slicing with [:] builds a new list holding the same elements (a shallow
# copy), so a and b can now be modified independently of each other.
a = b[:]
In [58]:
a[2] = "c"
In [59]:
a
Out[59]:
['a', 'b', 'c', 'd']
In [60]:
b
Out[60]:
['a', 'b', 'Hello', 'd']
In [61]:
'a"
  File "<ipython-input-61-1bd6b95ca6c8>", line 1
    'a"
       ^
SyntaxError: EOL while scanning string literal
In [62]:
"""This is a first line
this is a second line
"""
Out[62]:
'This is a first line\nthis is a second line\n'
In [64]:
# Adjacent string literals inside the parentheses are joined into one string.
# Nothing is inserted between them, so the pieces run together without spaces
# or newlines.
("this is a first line"
"this is still the first line"
 "this too")
Out[64]:
'this is a first linethis is still the first linethis too'
In [67]:
row[:5]
Out[67]:
['ENSMUSG00000028180',
 'ENSMUSG00000028180',
 '6.362550367427176',
 '6.727814970645067',
 '5.933769484723882']
In [68]:
float(row[2])
Out[68]:
6.362550367427176
In [70]:
int(float(row[2]))
Out[70]:
6
In [71]:
int(float(row[3]))
Out[71]:
6
In [72]:
a = float(row[2])
int(a)
Out[72]:
6
In [73]:
# int() drops the fractional part, so adding 0.5 first turns truncation into
# round-to-nearest for positive values (6.73 -> 7 here).
int(float(row[3])+.5)
Out[73]:
7
In [74]:
int(float(row[2])+.5)
Out[74]:
6
In [75]:
# Try to convert every field to float. This fails on the very first field,
# because row[0] and row[1] hold an identifier string ('ENSMUSG00000028180')
# rather than a number.
data = []
for i in row:
    data.append(float(i))
---------------------------------------------------------------------------
ValueError                                Traceback (most recent call last)
<ipython-input-75-e024f4172e61> in <module>()
      1 data = []
      2 for i in row:
----> 3     data.append(float(i))

ValueError: could not convert string to float: 'ENSMUSG00000028180'
In [77]:
# Columns 0 and 1 are identifier strings, so only the fields from index 2
# onward are converted to float.
data = [float(field) for field in row[2:]]
In [78]:
data
Out[78]:
[6.362550367427176,
 6.727814970645067,
 5.933769484723882,
 6.324633855252411,
 6.300011673343996,
 6.459424077257147,
 6.377834837091959,
 6.303301109319118,
 6.415947237367395,
 6.2040371993842145,
 6.530726874373919,
 6.266216094983205,
 6.393225244921796,
 6.453224368240938,
 6.594934102465421,
 6.5445476712839925,
 6.994021522245025,
 6.571406961935287,
 7.392274493375117,
 7.44675991054851,
 7.753084209066774,
 7.977082169126867,
 7.786213029758888,
 7.705710549943006,
 7.43057097325026,
 7.52388064668888,
 7.801612997259689,
 8.034651247248537,
 7.239582395795018,
 8.016029021943783,
 7.347024653690205,
 7.612796416014002,
 7.6544356196138885,
 7.760881187689698,
 7.461089453549775,
 7.8056366735095875]
In [79]:
data[:5]
Out[79]:
[6.362550367427176,
 6.727814970645067,
 5.933769484723882,
 6.324633855252411,
 6.300011673343996]
In [80]:
min(data),max(data)
Out[80]:
(5.933769484723882, 8.034651247248537)
In [81]:
# Count how many values fall into each fixed-width bin; bin k covers
# [k*BIN_WIDTH, (k+1)*BIN_WIDTH).
BIN_WIDTH = 2

# Size the list from the largest value instead of hard-coding five bins, so a
# value of 10 or more cannot index past the end.
# Assumes non-negative values (true for this row: min(data) is about 5.93).
histogram = [0] * (int(max(data, default=0) / BIN_WIDTH) + 1)
for value in data:
    histogram[int(value / BIN_WIDTH)] += 1
In [82]:
histogram
Out[82]:
[0, 0, 1, 33, 2]
In [83]:
%matplotlib nbagg
import matplotlib.pyplot as plt
In [84]:
# Draw the bin counts on explicit axes and label them so the saved figure can
# be read on its own. Position k on the x axis is bin k of `histogram`.
fig, ax = plt.subplots()
ax.set_title(row[0])
ax.set_xlabel("bin index (each bin spans 2 units)")
ax.set_ylabel("count")
ax.plot(histogram)
Out[84]:
[<matplotlib.lines.Line2D at 0x7f20099f7080>]
In [85]:
fig.savefig("myplot.png")
In [86]:
fig.savefig("myplot.pdf")
In [87]:
%ls
GSE88801_kallisto_TPMs_thresh10.cdt  myplot.png        SRR7541452.6M.fastq.gz
myplot.pdf                           sample_table.csv
In [88]:
# Let matplotlib bin the values itself (default number of bins) and label the
# axes. ax.hist() is kept as the last expression so the counts and bin edges
# are still displayed as the cell's output.
fig, ax = plt.subplots()
ax.set_title(row[0])
ax.set_xlabel("value")
ax.set_ylabel("count")
ax.hist(data)
Out[88]:
(array([ 1.,  5.,  8.,  3.,  0.,  1.,  3.,  5.,  7.,  3.]),
 array([ 5.93376948,  6.14385766,  6.35394584,  6.56403401,  6.77412219,
         6.98421037,  7.19429854,  7.40438672,  7.61447489,  7.82456307,
         8.03465125]),
 <a list of 10 Patch objects>)
In [89]:
# Plot the values in ascending order; sorted() returns a new list and leaves
# `data` itself in its original order.
fig, ax = plt.subplots()
ax.set_title(row[0])
ax.set_xlabel("rank (ascending)")
ax.set_ylabel("value")
ax.plot(sorted(data))
Out[89]:
[<matplotlib.lines.Line2D at 0x7f20096af940>]
In [ ]: