In [1]:
%pwd
Out[1]:
'/home/bms270/BMS270_2018'
In [2]:
!wget 'http://histo.ucsf.edu/BMS270/BMS270_2018/data/example1'
--2018-04-03 14:27:47--  http://histo.ucsf.edu/BMS270/BMS270_2018/data/example1
Resolving histo.ucsf.edu (histo.ucsf.edu)... 128.218.234.54
Connecting to histo.ucsf.edu (histo.ucsf.edu)|128.218.234.54|:80... connected.
HTTP request sent, awaiting response... 200 OK
Length: 3252152 (3.1M)
Saving to: ‘example1’

example1            100%[===================>]   3.10M  5.96MB/s    in 0.5s    

2018-04-03 14:27:47 (5.96 MB/s) - ‘example1’ saved [3252152/3252152]

In [3]:
!wget 'http://histo.ucsf.edu/BMS270/BMS270_2018/data/example2'
--2018-04-03 14:27:59--  http://histo.ucsf.edu/BMS270/BMS270_2018/data/example2
Resolving histo.ucsf.edu (histo.ucsf.edu)... 128.218.234.54
Connecting to histo.ucsf.edu (histo.ucsf.edu)|128.218.234.54|:80... connected.
HTTP request sent, awaiting response... 200 OK
Length: 3284814 (3.1M)
Saving to: ‘example2’

example2            100%[===================>]   3.13M  5.94MB/s    in 0.5s    

2018-04-03 14:28:00 (5.94 MB/s) - ‘example2’ saved [3284814/3284814]

In [4]:
%ls
Day1.ipynb  Day1_practice.ipynb  Day2.ipynb  example1  example2  Untitled.ipynb
In [5]:
fp = open("example1")
In [6]:
fp.read(10)
Out[6]:
'@SRR424424'
In [7]:
fp.read(10)
Out[7]:
'2.20068354'
In [8]:
fp.seek(0)
Out[8]:
0
In [ ]:
%cd /home/mvoorhie/
In [9]:
%magic
In [10]:
fp.readline()
Out[10]:
'@SRR4244242.20068354 20068354 length=112\n'
In [11]:
fp.readline()
Out[11]:
'CTGGGCTCCACCTCTAGGGTGATGGTCTTGCAGGTCAGGGTCTTCGCGAAGATCTGCATTATGACCTGATAACAAATGTGATGAAAGCACAAACCGCCCAGCGCGTCGAAAC\n'
In [12]:
fp.readline()
Out[12]:
'+SRR4244242.20068354 20068354 length=112\n'
In [13]:
fp.readline()
Out[13]:
'AAAA.FFFF))FFF)FFFFAF<FAF.FAFF))FAFFFFFFFFFF7)FF<FFA7FFFF.7F7FFAFFFF.AAFF<F.FF.<AFFFAAFF<F.)<FFAFF<.AF<FAFF.F..F\n'
In [14]:
fp.seek(0)
Out[14]:
0
In [15]:
name = fp.readline()
seq = fp.readline()
name2 = fp.readline()
qual = fp.readline()
In [16]:
name,name2
Out[16]:
('@SRR4244242.20068354 20068354 length=112\n',
 '+SRR4244242.20068354 20068354 length=112\n')
In [17]:
len(seq),len(qual)
Out[17]:
(113, 113)
In [18]:
seq[0]
Out[18]:
'C'
In [19]:
ord(seq[0])
Out[19]:
67
In [20]:
chr(67)
Out[20]:
'C'
In [21]:
ord('J')
Out[21]:
74
In [22]:
fp.seek(0)
Out[22]:
0
In [23]:
lines = []
for line in fp:
    lines.append(line)
In [24]:
len(lines)
Out[24]:
40000
In [25]:
line[0]
Out[25]:
'A'
In [26]:
lines[0]
Out[26]:
'@SRR4244242.20068354 20068354 length=112\n'
In [27]:
lines[-1]
Out[27]:
'AAAAAFFFFFFFFFFFFFFFFFFFFAFFFFFFFFFF<)FFFFFFF<FFFFAFFFFFF.FFFF<FFFFFF.FFFFFFFFFFFFFAF<FFFF<FFFFFFFF<AFFFFFFFFFFF.FFFFFF.F7FFFFFFFF7F.F7F.FFAFFF<AFA<)FF\n'
In [28]:
lines[:8]
Out[28]:
['@SRR4244242.20068354 20068354 length=112\n',
 'CTGGGCTCCACCTCTAGGGTGATGGTCTTGCAGGTCAGGGTCTTCGCGAAGATCTGCATTATGACCTGATAACAAATGTGATGAAAGCACAAACCGCCCAGCGCGTCGAAAC\n',
 '+SRR4244242.20068354 20068354 length=112\n',
 'AAAA.FFFF))FFF)FFFFAF<FAF.FAFF))FAFFFFFFFFFF7)FF<FFA7FFFF.7F7FFAFFFF.AAFF<F.FF.<AFFFAAFF<F.)<FFAFF<.AF<FAFF.F..F\n',
 '@SRR4244242.6143545 6143545 length=116\n',
 'TTTCACCTCAGTGACGCAGCCCTTCTCTCTCCAGTCCACAGTGTCAGGCAATGTCCGATTAGAGTATGACCTGAAAGTGACAGTCTTCGGAGACTGTCGGGGAATTCTCAGAGCAC\n',
 '+SRR4244242.6143545 6143545 length=116\n',
 'AAAAAFFFFFFFFFFFFFFFFFFFFFFFFFFF)FFF7.FFFFFFFFFFFFFFFFFFFF<F<FFFFFFFFFFFFFFFFFA<FFFFFFFFAF<FAFFFFF.FFFFFFFFAFFFAFFAF\n']
In [29]:
fp.close()
In [30]:
fp = open("example2")
In [31]:
fp.read(10)
---------------------------------------------------------------------------
UnicodeDecodeError                        Traceback (most recent call last)
<ipython-input-31-b2af43e26af9> in <module>()
----> 1 fp.read(10)

/home/bms270/.local/share/canopy/edm/envs/User/lib/python3.5/codecs.py in decode(self, input, final)
    319         # decode input (taking the buffer into account)
    320         data = self.buffer + input
--> 321         (result, consumed) = self._buffer_decode(data, self.errors, final)
    322         # keep undecoded input until the next call
    323         self.buffer = data[consumed:]

UnicodeDecodeError: 'utf-8' codec can't decode byte 0xac in position 25: invalid start byte
In [32]:
!wget 'http://histo.ucsf.edu/BMS270/BMS270_2018/data/supp2data.cdt'
--2018-04-03 15:04:05--  http://histo.ucsf.edu/BMS270/BMS270_2018/data/supp2data.cdt
Resolving histo.ucsf.edu (histo.ucsf.edu)... 128.218.234.54
Connecting to histo.ucsf.edu (histo.ucsf.edu)|128.218.234.54|:80... connected.
HTTP request sent, awaiting response... 200 OK
Length: 1212035 (1.2M) [image/x-coreldrawtemplate]
Saving to: ‘supp2data.cdt’

supp2data.cdt       100%[===================>]   1.16M  6.45MB/s    in 0.2s    

2018-04-03 15:04:05 (6.45 MB/s) - ‘supp2data.cdt’ saved [1212035/1212035]

In [33]:
fp = open("supp2data.cdt")
In [34]:
fp.readline()
Out[34]:
'ORF\tNAME\talpha 0\talpha 7\talpha 14\talpha 21\talpha 28\talpha 35\talpha 42\talpha 49\talpha 56\talpha 63\talpha 70\talpha 77\talpha 84\talpha 91\talpha 98\talpha 105\talpha 112\talpha 119\tElu 0\tElu 30\tElu 60\tElu 90\tElu 120\tElu 150\tElu 180\tElu 210\tElu 240\tElu 270\tElu 300\tElu 330\tElu 360\tElu 390\tcdc15 10\tcdc15 30\tcdc15 50\tcdc15 70\tcdc15 90\tcdc15 110\tcdc15 130\tcdc15 150\tcdc15 170\tcdc15 190\tcdc15 210\tcdc15 230\tcdc15 250\tcdc15 270\tcdc15 290\tspo 0\tspo 2\tspo 5\tspo 7\tspo 9\tspo 11\tspo5 2\tspo5 7\tspo5 11\tspo- early\tspo- mid\theat 0\theat 10\theat 20\theat 40\theat 80\theat 160\tdtt 15\tdtt 30\tdtt 60\tdtt 120\tcold 0\tcold 20\tcold 40\tcold 160\tdiau a\tdiau b\tdiau c\tdiau d\tdiau e\tdiau f\tdiau g\n'
In [35]:
fp.readline()
Out[35]:
'YBR166C\tTYR1   TYROSINE BIOSYNTHESIS    PREPHENATE DEHYDROGENASE (NADP+\t0.33\t-0.17\t0.04\t-0.07\t-0.09\t-0.12\t-0.03\t-0.2\t-0.06\t-0.06\t-0.14\t-0.18\t-0.06\t-0.25\t0.06\t-0.12\t0.25\t0.43\t0.21\t-0.04\t-0.15\t-0.04\t0.21\t-0.14\t-0.03\t-0.07\t-0.36\t-0.14\t-0.42\t-0.34\t-0.23\t-0.17\t0.23\t0.3\t0.41\t-0.07\t-0.23\t-0.12\t0.16\t0.74\t0.14\t-0.49\t-0.32\t0.19\t0.23\t0.24\t0.28\t1.13\t-0.12\t0.1\t\t0.66\t0.62\t0.08\t0.62\t0.43\t0.5\t-0.25\t-0.51\t-0.67\t0.21\t-0.74\t-0.36\t-0.01\t0.38\t0.15\t-0.22\t-0.09\t0.33\t0.08\t0.39\t-0.17\t0.23\t0.2\t0.2\t-0.17\t-0.69\t0.14\t-0.27\n'
In [36]:
fp.readline()
Out[36]:
'YOR357C\tGRD19  SECRETION        GOLGI PROTEIN RETENTION\t-0.64\t-0.38\t-0.32\t-0.29\t-0.22\t-0.01\t-0.32\t-0.27\t-0.51\t-0.67\t-0.62\t-0.58\t-0.38\t-0.94\t-0.34\t-0.92\t-0.15\t0.03\t0.16\t-0.34\t-0.32\t-0.34\t-0.12\t-0.34\t-0.27\t-0.15\t-0.15\t-0.51\t-0.3\t-0.25\t-0.12\t-0.4\t0.98\t0.99\t0.25\t0.15\t0.08\t0.23\t0.18\t-0.29\t-0.45\t0.01\t-0.34\t-1.12\t-0.54\t-0.94\t-1.09\t-0.45\t-0.23\t-0.36\t0.08\t0.28\t0.18\t-0.12\t\t0.25\t-0.22\t-0.04\t-0.25\t0.04\t-0.03\t-0.07\t-0.04\t0.73\t-0.06\t0.54\t-0.09\t-0.29\t-0.1\t0.36\t-0.2\t-0.34\t-0.14\t-0.09\t0.06\t-0.17\t0.04\t-0.97\t-1.79\n'
In [37]:
rows = []
for line in open("supp2data.cdt"):
    rows.append(line.rstrip("\r\n").split("\t"))
In [38]:
rows[0]
Out[38]:
['ORF',
 'NAME',
 'alpha 0',
 'alpha 7',
 'alpha 14',
 'alpha 21',
 'alpha 28',
 'alpha 35',
 'alpha 42',
 'alpha 49',
 'alpha 56',
 'alpha 63',
 'alpha 70',
 'alpha 77',
 'alpha 84',
 'alpha 91',
 'alpha 98',
 'alpha 105',
 'alpha 112',
 'alpha 119',
 'Elu 0',
 'Elu 30',
 'Elu 60',
 'Elu 90',
 'Elu 120',
 'Elu 150',
 'Elu 180',
 'Elu 210',
 'Elu 240',
 'Elu 270',
 'Elu 300',
 'Elu 330',
 'Elu 360',
 'Elu 390',
 'cdc15 10',
 'cdc15 30',
 'cdc15 50',
 'cdc15 70',
 'cdc15 90',
 'cdc15 110',
 'cdc15 130',
 'cdc15 150',
 'cdc15 170',
 'cdc15 190',
 'cdc15 210',
 'cdc15 230',
 'cdc15 250',
 'cdc15 270',
 'cdc15 290',
 'spo 0',
 'spo 2',
 'spo 5',
 'spo 7',
 'spo 9',
 'spo 11',
 'spo5 2',
 'spo5 7',
 'spo5 11',
 'spo- early',
 'spo- mid',
 'heat 0',
 'heat 10',
 'heat 20',
 'heat 40',
 'heat 80',
 'heat 160',
 'dtt 15',
 'dtt 30',
 'dtt 60',
 'dtt 120',
 'cold 0',
 'cold 20',
 'cold 40',
 'cold 160',
 'diau a',
 'diau b',
 'diau c',
 'diau d',
 'diau e',
 'diau f',
 'diau g']
In [40]:
rows[0][0]
Out[40]:
'ORF'
In [42]:
rows[0][-1]
Out[42]:
'diau g'
In [45]:
rows[-1][:10]
Out[45]:
['YLR160C',
 'ASP3   ASPARAGINE UTILIZATION   L-ASPARAGINASE II',
 '0.07',
 '-0.04',
 '0.12',
 '-0.1',
 '0.06',
 '-0.32',
 '0.21',
 '-0.25']
In [46]:
print(rows[-1][:10])
['YLR160C', 'ASP3   ASPARAGINE UTILIZATION   L-ASPARAGINASE II', '0.07', '-0.04', '0.12', '-0.1', '0.06', '-0.32', '0.21', '-0.25']
In [50]:
data = []
fp = open("supp2data.cdt")
header = fp.readline()
for row in fp:
    d = []
    for field in row.rstrip("\r\n").split("\t")[2:]:
        try:
            d.append(float(field))
        except ValueError:
            print("{"+field+"}")
            print("["+row+"]")
            raise
    data.append(d)
{}
[YBR166C	TYR1   TYROSINE BIOSYNTHESIS    PREPHENATE DEHYDROGENASE (NADP+	0.33	-0.17	0.04	-0.07	-0.09	-0.12	-0.03	-0.2	-0.06	-0.06	-0.14	-0.18	-0.06	-0.25	0.06	-0.12	0.25	0.43	0.21	-0.04	-0.15	-0.04	0.21	-0.14	-0.03	-0.07	-0.36	-0.14	-0.42	-0.34	-0.23	-0.17	0.23	0.3	0.41	-0.07	-0.23	-0.12	0.16	0.74	0.14	-0.49	-0.32	0.19	0.23	0.24	0.28	1.13	-0.12	0.1		0.66	0.62	0.08	0.62	0.43	0.5	-0.25	-0.51	-0.67	0.21	-0.74	-0.36	-0.01	0.38	0.15	-0.22	-0.09	0.33	0.08	0.39	-0.17	0.23	0.2	0.2	-0.17	-0.69	0.14	-0.27
]
---------------------------------------------------------------------------
ValueError                                Traceback (most recent call last)
<ipython-input-50-cbf83873fa3f> in <module>()
      6     for field in row.rstrip("\r\n").split("\t")[2:]:
      7         try:
----> 8             d.append(float(field))
      9         except ValueError:
     10             print("{"+field+"}")

ValueError: could not convert string to float: 
In [66]:
data = []
fp = open("supp2data.cdt")
header = fp.readline()
for row in fp:
    d = []
    for field in row.rstrip("\r\n").split("\t")[2:]:
        try:
            d.append(float(field))
        except ValueError:
            assert(field == "")
            d.append(None)
    data.append(d)
In [67]:
easy_data = []
fp = open("supp2data.cdt")
header = fp.readline()
for row in fp:
    d = []
    for field in row.rstrip("\r\n").split("\t")[2:]:
        try:
            d.append(float(field))
        except ValueError:
            assert(field == "")
            d.append(0.)
    easy_data.append(d)
In [68]:
len(easy_data)
Out[68]:
2467
In [69]:
len(data)
Out[69]:
2467
In [70]:
len(set(len(i) for i in data))
Out[70]:
1
In [71]:
set(len(i) for i in data)
Out[71]:
{79}
In [53]:
%matplotlib nbagg
In [54]:
import math
In [57]:
help(math.erf)
Help on built-in function erf in module math:

erf(...)
    erf(x)
    
    Error function at x.

In [58]:
math.sqrt(5)
Out[58]:
2.23606797749979
In [59]:
from math import sqrt,pi
In [60]:
sqrt(pi)
Out[60]:
1.7724538509055159
In [61]:
import matplotlib.pyplot as plt
In [73]:
plt.figure()
plt.plot(easy_data[0],easy_data[1],"bo")
Out[73]:
[<matplotlib.lines.Line2D at 0x7f1b99fba940>]
In [74]:
!wget 'http://histo.ucsf.edu/BMS270/BMS270_2018/code/stats.py'
--2018-04-03 15:45:14--  http://histo.ucsf.edu/BMS270/BMS270_2018/code/stats.py
Resolving histo.ucsf.edu (histo.ucsf.edu)... 128.218.234.54
Connecting to histo.ucsf.edu (histo.ucsf.edu)|128.218.234.54|:80... connected.
HTTP request sent, awaiting response... 200 OK
Length: 2062 (2.0K) [text/x-python]
Saving to: ‘stats.py’

stats.py            100%[===================>]   2.01K  --.-KB/s    in 0s      

2018-04-03 15:45:14 (17.1 MB/s) - ‘stats.py’ saved [2062/2062]

In [75]:
import stats
In [76]:
stats.__file__
Out[76]:
'/home/bms270/BMS270_2018/stats.py'
In [78]:
stats.pearson(easy_data[0],easy_data[1])
Out[78]:
0.12164363213592637
In [79]:
fig = plt.figure()
h = plt.hist(easy_data[0])
In [81]:
plt.figure()
plt.plot(sorted(easy_data[0]))
Out[81]:
[<matplotlib.lines.Line2D at 0x7f1b99e1ada0>]
In [ ]: