In [1]:
from urllib import urlretrieve
In [2]:
# Download the gzipped data file from the URL below and store it under the
# same file name, relative to the current working directory.
urlretrieve(
    url="http://histo.ucsf.edu/BMS270/BMS270_2017/data/GSE86922_Brodsky_GEO_processed.txt.gz",
    filename="GSE86922_Brodsky_GEO_processed.txt.gz",
)
Out[2]:
('GSE86922_Brodsky_GEO_processed.txt.gz',
 <httplib.HTTPMessage instance at 0x7f88f02dfea8>)
In [3]:
# List the .gz files in the current directory to confirm the download exists.
%ls *.gz
GSE86922_Brodsky_GEO_processed.txt.gz
In [4]:
# Peek at the first 10 raw (still compressed) bytes of the download.
# The file is binary, so it is opened with "rb", and the `with` block closes
# the handle as soon as the bytes have been read.
# A gzip stream starts with the two magic bytes 0x1f 0x8b.
with open("GSE86922_Brodsky_GEO_processed.txt.gz", "rb") as raw_file:
    raw_head = raw_file.read(10)
raw_head
Out[4]:
"\x1f\x8b\x08\x08'\x14\xd8W\x00\x03"
In [5]:
import gzip
In [6]:
# Open the archive through gzip so that reads return decompressed content.
# The handle stays bound to `fp` because the following cell reads from it.
fp = gzip.open(filename="GSE86922_Brodsky_GEO_processed.txt.gz", mode="rb")
In [7]:
# Read the first 10 decompressed bytes from the gzip handle `fp`.
fp.read(10)
Out[7]:
'EnsemblID\t'
In [10]:
# Read only the first line of the decompressed file (the column headers).
# The temporary handle is closed by the `with` block instead of being
# abandoned while still open.
with gzip.open("GSE86922_Brodsky_GEO_processed.txt.gz") as header_file:
    header_line = header_file.readline()
header_line
Out[10]:
'EnsemblID\tEntrezID\tGeneName\tGeneSymbol\tWT_unstim_rep1\tWT_unstim_rep2\tRipk3_unstim_rep1\tRipk3_unstim_rep2\tRipk3Casp8_unstim_rep1\tRipk3Casp8_unstim_rep2\tWT_LPS.6hr_rep1\tWT_LPS.6hr_rep2\tRipk3_LPS.6hr_rep1\tRipk3_LPS.6hr_rep2\tRipk3Casp8_LPS.6hr_rep1\tRipk3Casp8_LPS.6hr_rep2\n'
In [11]:
# Reopen the archive so that `fp` starts again at the beginning of the file;
# the cells that follow step through it one line at a time.
fp = gzip.open(filename="GSE86922_Brodsky_GEO_processed.txt.gz", mode="rb")
In [12]:
# Fetch the next line from `fp` and display it.
# The built-in next() is used rather than the .next() method because the
# built-in works unchanged on Python 2.6+ and Python 3.
next(fp)
Out[12]:
'EnsemblID\tEntrezID\tGeneName\tGeneSymbol\tWT_unstim_rep1\tWT_unstim_rep2\tRipk3_unstim_rep1\tRipk3_unstim_rep2\tRipk3Casp8_unstim_rep1\tRipk3Casp8_unstim_rep2\tWT_LPS.6hr_rep1\tWT_LPS.6hr_rep2\tRipk3_LPS.6hr_rep1\tRipk3_LPS.6hr_rep2\tRipk3Casp8_LPS.6hr_rep1\tRipk3Casp8_LPS.6hr_rep2\n'
In [13]:
# Advance `fp` by one more line and display it; each call consumes a line.
# next(fp) is the version-neutral spelling of fp.next() / fp.__next__().
next(fp)
Out[13]:
'ENSMUSG00000000001\t14679\tguanine nucleotide binding protein (G protein), alpha inhibiting 3\tGnai3\t8.095852358\t8.219151897\t8.116617373\t8.127604792\t7.934172365\t8.028072233\t7.964154595\t7.910907456\t7.870409519\t7.842172144\t7.798445892\t7.831022299\n'
In [14]:
# Read and display the following line of the file.
# Built-in next() avoids depending on the Python-2-only .next() method.
next(fp)
Out[14]:
'ENSMUSG00000000003\t54192\tprobasin\tPbsn\t-5.053418441\t-5.109382793\t-5.52780226\t-5.777688187\t-4.729163359\t-5.379859359\t-5.463289207\t-4.84042738\t-4.92583293\t-5.147399754\t-5.645921977\t-5.307409508\n'
In [16]:
# Print the next line so that tabs and the trailing newline are rendered
# rather than shown as escape sequences. print(...) with a single argument
# and built-in next() behave the same on Python 2 and Python 3.
print(next(fp))
ENSMUSG00000000028	12544	cell division cycle 45	Cdc45	4.845938482	4.76866812	4.859137985	4.927080052	4.595017188	4.610244605	1.867627671	1.979751582	1.815634057	1.328333676	1.961408337	1.930995232

In [17]:
# Split the next line on runs of any whitespace (split() with no argument).
# This also discards the trailing newline, but it splits on spaces as well
# as tabs. Built-in next() replaces the Python-2-only .next() method.
next(fp).split()
Out[17]:
['ENSMUSG00000000031',
 'NA',
 'NA',
 'NA',
 '-5.053418441',
 '-5.109382793',
 '-3.942839759',
 '-4.192725686',
 '-4.729163359',
 '-3.057931264',
 '-5.463289207',
 '-4.84042738',
 '-4.92583293',
 '-5.147399754',
 '-5.645921977',
 '-5.307409508']
In [18]:
# Same whitespace split on the following line. Because split() with no
# argument breaks on spaces too, a tab-delimited field that contains spaces
# is broken into several list items.
# Built-in next() replaces the Python-2-only .next() method.
next(fp).split()
Out[18]:
['ENSMUSG00000000037',
 '107815',
 'sex',
 'comb',
 'on',
 'midleg-like',
 '2',
 '(Drosophila)',
 'Scml2',
 '-0.409562252',
 '-0.251401798',
 '-0.035949164',
 '-0.049767732',
 '-0.336845936',
 '0.046405396',
 '-1.375826366',
 '-3.255464879',
 '-1.755907928',
 '-2.82547166',
 '-2.838567055',
 '-1.60696979']
In [19]:
# Split the next line on tab characters only, so fields containing spaces
# stay intact. The line terminator is not removed and therefore remains
# attached to the last field.
# Built-in next() replaces the Python-2-only .next() method.
next(fp).split("\t")
Out[19]:
['ENSMUSG00000000049',
 '11818',
 'apolipoprotein H',
 'Apoh',
 '-5.053418441',
 '-5.109382793',
 '-5.52780226',
 '-5.777688187',
 '-4.729163359',
 '-5.379859359',
 '-5.463289207',
 '-4.84042738',
 '-4.92583293',
 '-3.562437254',
 '-5.645921977',
 '-5.307409508\n']
In [20]:
# Split the next line on tabs, then remove the line terminator from the
# last field. rstrip("\r\n") is used instead of dropping the final character
# unconditionally: it removes "\n" or "\r\n" when present and leaves the
# field untouched when the line has no terminator (e.g. a final line
# without a trailing newline).
a = next(fp).split("\t")
a[-1] = a[-1].rstrip("\r\n")
a
Out[20]:
['ENSMUSG00000000056',
 '67608',
 'nuclear prelamin A recognition factor',
 'Narf',
 '5.46916309',
 '5.502564149',
 '5.441584261',
 '5.558260378',
 '5.505654072',
 '5.428300414',
 '2.872101148',
 '3.231034982',
 '2.808876691',
 '2.765489582',
 '2.809405244',
 '2.973361262']
In [22]:
# Strip any trailing carriage-return / newline characters first, then split
# the remaining text on tabs, giving clean fields in a single expression.
# Built-in next() replaces the Python-2-only .next() method.
next(fp).rstrip("\r\n").split("\t")
Out[22]:
['ENSMUSG00000000078',
 '23849',
 'Kruppel-like factor 6',
 'Klf6',
 '8.874266942',
 '8.884351708',
 '8.889720654',
 '8.858426597',
 '8.878975932',
 '8.812048069',
 '10.37672646',
 '10.36532769',
 '10.40665824',
 '10.4019337',
 '10.28728456',
 '10.31014405']
In [23]:
from csv import reader,excel_tab
In [41]:
import csv
In [24]:
# Wrap the decompressed stream in a csv reader configured with the
# tab-delimited Excel dialect. The reader is kept in `r` so the following
# cells can pull rows from it one at a time.
r = reader(gzip.open("GSE86922_Brodsky_GEO_processed.txt.gz"), excel_tab)
In [25]:
# Pull the first row from the csv reader (the column headers) as a list.
# Built-in next() works on Python 2.6+ and 3, unlike the .next() method.
next(r)
Out[25]:
['EnsemblID',
 'EntrezID',
 'GeneName',
 'GeneSymbol',
 'WT_unstim_rep1',
 'WT_unstim_rep2',
 'Ripk3_unstim_rep1',
 'Ripk3_unstim_rep2',
 'Ripk3Casp8_unstim_rep1',
 'Ripk3Casp8_unstim_rep2',
 'WT_LPS.6hr_rep1',
 'WT_LPS.6hr_rep2',
 'Ripk3_LPS.6hr_rep1',
 'Ripk3_LPS.6hr_rep2',
 'Ripk3Casp8_LPS.6hr_rep1',
 'Ripk3Casp8_LPS.6hr_rep2']
In [26]:
# Pull the next row from the csv reader: the fields arrive already split on
# tabs, with no trailing newline on the last one.
# Built-in next() replaces the Python-2-only .next() method.
next(r)
Out[26]:
['ENSMUSG00000000001',
 '14679',
 'guanine nucleotide binding protein (G protein), alpha inhibiting 3',
 'Gnai3',
 '8.095852358',
 '8.219151897',
 '8.116617373',
 '8.127604792',
 '7.934172365',
 '8.028072233',
 '7.964154595',
 '7.910907456',
 '7.870409519',
 '7.842172144',
 '7.798445892',
 '7.831022299']
In [27]:
# Keep the next parsed row in `row` so the following cells can inspect it.
# Built-in next() replaces the Python-2-only .next() method.
row = next(r)
In [28]:
# Show what kind of object the csv reader hands back for each row.
type(row)
Out[28]:
list
In [29]:
# First field of the parsed row (the value from the first column).
row[0]
Out[29]:
'ENSMUSG00000000003'
In [30]:
# Copy the first four columns of the row into a new list; the slice creates
# a separate list, so changing `fields` later does not modify `row`.
fields = row[:4]
fields
Out[30]:
['ENSMUSG00000000003', '54192', 'probasin', 'Pbsn']
In [31]:
# Build the fully parsed row in one step: the first four columns stay as
# strings and every remaining column is converted to float.
# Rebuilding `fields` from `row` (instead of appending to the existing list)
# makes this cell safe to re-run: it always yields the same 16 items rather
# than growing `fields` by another 12 floats on each execution.
fields = row[:4] + [float(value) for value in row[4:]]
fields
Out[31]:
['ENSMUSG00000000003',
 '54192',
 'probasin',
 'Pbsn',
 -5.053418441,
 -5.109382793,
 -5.52780226,
 -5.777688187,
 -4.729163359,
 -5.379859359,
 -5.463289207,
 -4.84042738,
 -4.92583293,
 -5.147399754,
 -5.645921977,
 -5.307409508]
In [33]:
# Parse the whole gzipped, tab-delimited text file into two parallel tables:
# `annotations` (first four columns, as strings) and `data` (remaining
# columns, as floats), one entry per row in the same order.
# The `with` block closes the gzip handle once the last row has been read.
with gzip.open("GSE86922_Brodsky_GEO_processed.txt.gz") as gz_file:
    # Read the file as a sequence of lists, one list per line
    r = reader(gz_file, dialect=excel_tab)
    # Extract the column headers (built-in next() works on Python 2 and 3)
    header = next(r)
    # This will be a table of gene annotations
    annotations = []
    # This will be a table of numerical values
    data = []
    # Parse annotations and values
    for row in r:
        # First four columns are annotations
        annotations.append(row[:4])
        # Every remaining column is numeric; store one list of floats per row
        data.append([float(i) for i in row[4:]])
In [34]:
# Parse only the first five data rows of the gzipped, tab-delimited file,
# storing the numeric columns in `data2` as one list of floats per row.
# The `with` block closes the gzip handle once the rows have been read.
with gzip.open("GSE86922_Brodsky_GEO_processed.txt.gz") as gz_file:
    r = reader(gz_file, dialect=excel_tab)
    # Extract the column headers (built-in next() works on Python 2 and 3)
    header = next(r)
    # Materialise all remaining rows while the file is still open
    rest = list(r)
# This will be a table of gene annotations
# NOTE(review): this rebinds `annotations` to just these five rows, so it no
# longer lines up row-for-row with a `data` table built from the whole file.
annotations = []
# This will be a table of numerical values
data2 = []
# Parse annotations and values
for row in rest[:5]:
    # First four columns are annotations
    annotations.append(row[:4])
    # Remaining columns become one list of floats for this row
    data2.append([float(i) for i in row[4:]])
In [39]:
# Parse only the first five data rows of the gzipped, tab-delimited file,
# appending the numeric columns directly to `data3`. No float() conversion
# and no per-row list is used, so `data3` is a single flat list of strings.
# The `with` block closes the gzip handle once the rows have been read.
with gzip.open("GSE86922_Brodsky_GEO_processed.txt.gz") as gz_file:
    r = reader(gz_file, dialect=excel_tab)
    # Extract the column headers (built-in next() works on Python 2 and 3)
    header = next(r)
    # Materialise all remaining rows while the file is still open
    rest = list(r)
# This will be a table of gene annotations
# NOTE(review): this rebinds `annotations` to just these five rows, so it no
# longer lines up row-for-row with a `data` table built from the whole file.
annotations = []
# This will be a flat list of the raw (string) values
data3 = []
# Parse annotations and values
for row in rest[:5]:
    # First four columns are annotations
    annotations.append(row[:4])
    # Add the remaining columns of this row to the end of the flat list
    data3.extend(row[4:])
In [37]:
# Show the nested list of floats. print(...) with a single argument gives
# the same output on Python 2 and is also valid on Python 3.
print(data2)
[[8.095852358, 8.219151897, 8.116617373, 8.127604792, 7.934172365, 8.028072233, 7.964154595, 7.910907456, 7.870409519, 7.842172144, 7.798445892, 7.831022299], [-5.053418441, -5.109382793, -5.52780226, -5.777688187, -4.729163359, -5.379859359, -5.463289207, -4.84042738, -4.92583293, -5.147399754, -5.645921977, -5.307409508], [4.845938482, 4.76866812, 4.859137985, 4.927080052, 4.595017188, 4.610244605, 1.867627671, 1.979751582, 1.815634057, 1.328333676, 1.961408337, 1.930995232], [-5.053418441, -5.109382793, -3.942839759, -4.192725686, -4.729163359, -3.057931264, -5.463289207, -4.84042738, -4.92583293, -5.147399754, -5.645921977, -5.307409508], [-0.409562252, -0.251401798, -0.035949164, -0.049767732, -0.336845936, 0.046405396, -1.375826366, -3.255464879, -1.755907928, -2.82547166, -2.838567055, -1.60696979]]
In [40]:
# Show the flat list of strings. print(...) with a single argument gives
# the same output on Python 2 and is also valid on Python 3.
print(data3)
['8.095852358', '8.219151897', '8.116617373', '8.127604792', '7.934172365', '8.028072233', '7.964154595', '7.910907456', '7.870409519', '7.842172144', '7.798445892', '7.831022299', '-5.053418441', '-5.109382793', '-5.52780226', '-5.777688187', '-4.729163359', '-5.379859359', '-5.463289207', '-4.84042738', '-4.92583293', '-5.147399754', '-5.645921977', '-5.307409508', '4.845938482', '4.76866812', '4.859137985', '4.927080052', '4.595017188', '4.610244605', '1.867627671', '1.979751582', '1.815634057', '1.328333676', '1.961408337', '1.930995232', '-5.053418441', '-5.109382793', '-3.942839759', '-4.192725686', '-4.729163359', '-3.057931264', '-5.463289207', '-4.84042738', '-4.92583293', '-5.147399754', '-5.645921977', '-5.307409508', '-0.409562252', '-0.251401798', '-0.035949164', '-0.049767732', '-0.336845936', '0.046405396', '-1.375826366', '-3.255464879', '-1.755907928', '-2.82547166', '-2.838567055', '-1.60696979']
In [42]:
# Number of rows in the numeric table `data`.
len(data)
Out[42]:
44340
In [43]:
# Number of numeric values stored for the first row of `data`.
len(data[0])
Out[43]:
12
In [44]:
%matplotlib notebook
import matplotlib.pyplot as plt
# from IPython.core.display import display
In [45]:
# Annotation columns (first four fields) stored for the first row.
annotations[0]
Out[45]:
['ENSMUSG00000000001',
 '14679',
 'guanine nucleotide binding protein (G protein), alpha inhibiting 3',
 'Gnai3']
In [48]:
# Line plot of the values in the first row of `data`, drawn against their
# position in the row, on a fresh figure.
fig, ax = plt.subplots()
ax.plot(data[0])
Out[48]:
[<matplotlib.lines.Line2D at 0x7f88c88f4c50>]
In [49]:
# Same values as a line plot of the first row, but drawn as blue circle
# markers ("bo") with no connecting line, on a fresh figure.
fig, ax = plt.subplots()
ax.plot(data[0], "bo")
Out[49]:
[<matplotlib.lines.Line2D at 0x7f88c89a1b10>]
In [50]:
# Gather the first numeric value of every row of `data` into one list,
# i.e. the first numeric column of the table, then report its length.
sample1 = [row_values[0] for row_values in data]
len(sample1)
Out[50]:
44340
In [51]:
# Line plot of the first numeric column, in the order the rows appear.
fig, ax = plt.subplots()
ax.plot(sample1)
Out[51]:
[<matplotlib.lines.Line2D at 0x7f88c827c0d0>]
In [52]:
# Histogram of the first numeric column using the default binning.
# The return value is kept in `h` so it is not echoed as cell output.
fig, ax = plt.subplots()
h = ax.hist(sample1)
In [53]:
# Line plot of the first numeric column after sorting its values in
# ascending order; sorted() returns a new list and leaves `sample1` as is.
fig, ax = plt.subplots()
ax.plot(sorted(sample1))
Out[53]:
[<matplotlib.lines.Line2D at 0x7f88c6e1aa90>]
In [54]:
# Scatter-style plot (blue circle markers) with the first row of `data` on
# the x axis and the second row on the y axis, paired position by position.
fig, ax = plt.subplots()
ax.plot(data[0], data[1], "bo")
Out[54]:
[<matplotlib.lines.Line2D at 0x7f88c6b8d710>]
In [55]:
# Render the whole numeric table as an image (rows of `data` become image
# rows) using imshow's default aspect and interpolation settings.
fig, ax = plt.subplots()
ax.imshow(data)
Out[55]:
<matplotlib.image.AxesImage at 0x7f88c6787150>
In [56]:
# Render the numeric table as an image again, with aspect="auto" so the
# image is stretched to fill the axes instead of using equal-sized pixels.
fig, ax = plt.subplots()
ax.imshow(data, aspect="auto")
Out[56]:
<matplotlib.image.AxesImage at 0x7f88c60ced10>
In [57]:
# Render the numeric table as an image with interpolation turned off and
# the aspect stretched to fill the axes, then attach a colorbar for the
# image to the same axes.
fig, ax = plt.subplots()
im = ax.imshow(data, interpolation="none", aspect="auto")
fig.colorbar(im, ax=ax)
Out[57]:
<matplotlib.colorbar.Colorbar at 0x7f88c4793390>
In [ ]: