In [1]:
%matplotlib notebook
import matplotlib.pyplot as plt
In [2]:
import gzip
from csv import reader, excel_tab
import numpy as np
In [3]:
# Read the gzip-compressed, tab-delimited table. The first row is the column
# header; in every later row the first four fields are kept as annotation
# strings and the remaining fields are parsed as floats.
# The `with` block closes the file as soon as reading finishes (or fails),
# instead of waiting for the reader object to be garbage-collected.
with gzip.open("GSE86922_Brodsky_GEO_processed.txt.gz") as handle:
    fp = reader(handle, dialect = excel_tab)
    header = next(fp)
    data = []
    annotations = []
    for row in fp:
        annotations.append(row[:4])
        data.append([float(i) for i in row[4:]])
# The reader is no longer needed; the file itself was already closed above.
del fp
In [5]:
# Pack the per-row annotation lists into a NumPy array so that rows can later
# be selected with boolean masks, in step with the numeric data.
anno = np.array(annotations)
# Peek at the first annotation row.
anno[0]
Out[5]:
array(['ENSMUSG00000000001', '14679',
       'guanine nucleotide binding protein (G protein), alpha inhibiting 3',
       'Gnai3'], 
      dtype='|S139')
In [6]:
anno.shape
Out[6]:
(44340, 4)
In [8]:
# Numeric values as a 2-D array (one row per table row).
d = np.array(data)
# log(10)/log(2) equals log2(10) by change of base.
# NOTE(review): presumably the table values are log2-scaled, making this a
# cutoff of 10 on the raw scale -- confirm against the data source.
thresh = np.log(10)/np.log(2)
In [9]:
# Keep only the rows of `d` in which at least two columns reach `thresh`,
# and apply the same row mask to the annotations so both stay aligned.
x = (d >= thresh).sum(axis = 1) >= 2
f, fa = d[x], anno[x]
f.shape, fa.shape
Out[9]:
((9740, 12), (9740, 4))
In [10]:
# Second filter: keep rows whose spread (row maximum minus row minimum)
# is at least 2, again masking data and annotations together.
x = (f.max(axis = 1) - f.min(axis = 1)) >= 2
f2, fa2 = f[x], fa[x]
f2.shape, fa2.shape
Out[10]:
((2779, 12), (2779, 4))
In [11]:
!ls *.cdt
clustered1_cm.cdt	    clustered1_cm_scaled.cdt
clustered1_cm_centered.cdt  est_counts.cdt
In [12]:
# Read every line of the clustered table into a list. The `with` block
# closes the file immediately rather than leaving the handle open.
with open("clustered1_cm_centered.cdt") as cdt_file:
    cdt = cdt_file.readlines()
In [13]:
cdt[0]
Out[13]:
'GID\tUNIQID\tNAME\tGWEIGHT\tWT_unstim_rep1\tWT_unstim_rep2\tRipk3_unstim_rep1\tRipk3_unstim_rep2\tRipk3Casp8_unstim_rep1\tRipk3Casp8_unstim_rep2\tWT_LPS.6hr_rep1\tWT_LPS.6hr_rep2\tRipk3_LPS.6hr_rep1\tRipk3_LPS.6hr_rep2\tRipk3Casp8_LPS.6hr_rep1\tRipk3Casp8_LPS.6hr_rep2\n'
In [14]:
cdt[1]
Out[14]:
'EWEIGHT\t\t\t\t1.000000\t1.000000\t1.000000\t1.000000\t1.000000\t1.000000\t1.000000\t1.000000\t1.000000\t1.000000\t1.000000\t1.000000\n'
In [15]:
cdt[2]
Out[15]:
'GENE520X\tENSMUSG00000022221\tRipk3\t1.000000\t2.19086966825\t2.00234084225\t0.17825439925\t0.00667346725\t-0.05213574775\t-0.00872163275\t0.48554301925\t0.26040793925\t-0.85146982175\t-1.14610010875\t-1.72158431275\t-1.34407771175\n'
In [16]:
cdt[-1]
Out[16]:
'GENE2467X\tENSMUSG00000069049\tEif2s3y\t1.000000\t-2.79573406792\t-2.66825227892\t1.89912772008\t1.98051739808\t0.933590345083\t1.02446919208\t-3.30688816992\t-3.10783405092\t2.00283768808\t1.99665289108\t1.07452459408\t0.966988739083\n'
In [17]:
out = open("example.txt","w")
In [18]:
out.write("Hello, world")
In [19]:
out.write("5")
In [20]:
out.write(str(5))
In [21]:
out.close()
In [22]:
open("example.txt").read()
Out[22]:
'Hello, world55'
In [23]:
out = open("example2.txt","w")
out.write("Hello\n")
out.write("world\n")
In [24]:
out.close()
In [25]:
open("example2.txt").read()
Out[25]:
'Hello\nworld\n'
In [26]:
# Print the file contents so the newlines are rendered rather than shown as
# escapes. The `with` block closes the file once it has been read.
with open("example2.txt") as example2_file:
    print(example2_file.read())
Hello
world

In [27]:
from csv import writer, excel_tab
In [28]:
out = writer(open("example3.txt","w"), dialect = excel_tab)
In [29]:
# Each writerow call emits one tab-separated line; the rows do not need to
# have the same number of fields.
out.writerow(("a","b","hello, world"))
out.writerow(("1","2","3","4"))
# Drop the only reference to the writer.
# NOTE(review): this relies on the interpreter reclaiming the writer (and the
# file object it wraps) right away to flush and close example3.txt; keeping
# the file object in a variable and calling close() would be more robust.
del out
In [31]:
sum((1,2,3))
Out[31]:
6
In [32]:
# Print the file written through the csv writer. The `with` block closes
# the file once it has been read.
with open("example3.txt") as example3_file:
    print(example3_file.read())
a	b	hello, world
1	2	3	4

In [33]:
# Write two tab-separated lines by hand with str.join. The `with` block
# guarantees the file is closed even if one of the writes raises; `out`
# remains bound to the (closed) file object afterwards.
with open("example4.txt","w") as out:
    out.write("\t".join(("hello","world"))+"\n")
    out.write("\t".join(("1","3"))+"\n")
In [34]:
open("example4.txt").read()
Out[34]:
'hello\tworld\n1\t3\n'
In [35]:
# Print the hand-written tab-separated file. The `with` block closes the
# file once it has been read.
with open("example4.txt") as example4_file:
    print(example4_file.read())
hello	world
1	3

In [36]:
"\t".join(("hello","world"))
Out[36]:
'hello\tworld'
In [37]:
# Dictionary pairing A with T and G with C (in both directions).
# NOTE(review): looks like a DNA base-complement lookup table -- confirm.
code = {"A":"T","T":"A","G":"C","C":"G"}
In [38]:
code["A"]
Out[38]:
'T'
In [39]:
code["T"]
Out[39]:
'A'
In [40]:
code["T"] = "what?"
In [41]:
code
Out[41]:
{'A': 'T', 'C': 'G', 'G': 'C', 'T': 'what?'}
In [42]:
code["N"] = "N"
In [43]:
code
Out[43]:
{'A': 'T', 'C': 'G', 'G': 'C', 'N': 'N', 'T': 'what?'}
In [44]:
fa2[0]
Out[44]:
array(['ENSMUSG00000000028', '12544', 'cell division cycle 45', 'Cdc45'], 
      dtype='|S139')
In [50]:
# Index the filtered annotation rows by their first column. When several
# rows share the same key, the last such row is the one that is kept.
name_to_anno = {row[0]: row for row in fa2}
In [46]:
len(name_to_anno)
Out[46]:
2734
In [47]:
len(fa2)
Out[47]:
2779
In [51]:
# Index the filtered annotation rows by their second column. When several
# rows share the same key, the last such row is the one that is kept.
entrez_to_anno = {row[1]: row for row in fa2}
In [52]:
len(entrez_to_anno)
Out[52]:
2647
In [53]:
both_to_anno = {}
for row in fa2:
    both_to_anno[row[:2]] = row
---------------------------------------------------------------------------
TypeError                                 Traceback (most recent call last)
<ipython-input-53-375164faaf66> in <module>()
      1 both_to_anno = {}
      2 for row in fa2:
----> 3     both_to_anno[row[:2]] = row

TypeError: unhashable type: 'numpy.ndarray'
In [55]:
# Key each row by a tuple of its first two columns. A tuple of the two
# values is hashable, so it can be used as a dictionary key, whereas the
# array slice row[:2] cannot.
both_to_anno = {}
for row in fa2:
    both_to_anno[(row[0],row[1])] = row
In [57]:
# Same mapping built with the shorter spelling: a comma inside the square
# brackets forms a tuple, so this is equivalent to indexing with
# (row[0], row[1]).
both_to_anno = {}
for row in fa2:
    both_to_anno[row[0],row[1]] = row
In [56]:
len(both_to_anno)
Out[56]:
2779
In [58]:
def f(x):
    """Return the pair (x + 1, x - 1) as a tuple.

    Note: defining this function rebinds the name ``f``, which an earlier
    cell used for the filtered data array.
    """
    one_more = x + 1
    one_less = x - 1
    return (one_more, one_less)
In [59]:
f(3)
Out[59]:
(4, 2)
In [60]:
from csv import reader
In [64]:
# Parse the clustered table as tab-delimited text and show only its first
# row (the column header). The `with` block closes the file even though
# the loop stops after one row.
with open("clustered1_cm_centered.cdt") as cdt_file:
    for row in reader(cdt_file, excel_tab):
        print(row)
        break
['GID', 'UNIQID', 'NAME', 'GWEIGHT', 'WT_unstim_rep1', 'WT_unstim_rep2', 'Ripk3_unstim_rep1', 'Ripk3_unstim_rep2', 'Ripk3Casp8_unstim_rep1', 'Ripk3Casp8_unstim_rep2', 'WT_LPS.6hr_rep1', 'WT_LPS.6hr_rep2', 'Ripk3_LPS.6hr_rep1', 'Ripk3_LPS.6hr_rep2', 'Ripk3Casp8_LPS.6hr_rep1', 'Ripk3Casp8_LPS.6hr_rep2']
In [65]:
code["T"]
Out[65]:
'what?'
In [66]:
code["spam"]
---------------------------------------------------------------------------
KeyError                                  Traceback (most recent call last)
<ipython-input-66-389726eb53ff> in <module>()
----> 1 code["spam"]

KeyError: 'spam'
In [67]:
# Membership test with the `in` operator; dict.has_key is deprecated and
# does not exist in Python 3.
"spam" in code
Out[67]:
False
In [68]:
# Look up a key that is absent from `code`. Only the KeyError raised by the
# failed lookup is handled, so unrelated errors are not silently hidden.
try:
    x = code["spam"]
except KeyError:
    print("it didn't work =(")
else:
    print("it worked =)")
it didn't work =(
In [69]:
# Look up a key that is present in `code`. Only a KeyError from the lookup
# would be handled, so unrelated errors are not silently hidden.
try:
    x = code["T"]
except KeyError:
    print("it didn't work =(")
else:
    print("it worked =)")
it worked =)
In [ ]: