In [1]:
%matplotlib notebook
import matplotlib.pyplot as plt
In [2]:
import gzip
from csv import reader, excel_tab
import numpy as np
In [3]:
# Parse the gzipped, tab-delimited expression table.
# The `with` block guarantees the gzip file is closed as soon as parsing is
# finished; merely deleting the reader object only drops a reference and does
# not by itself guarantee that the underlying file gets closed.
# NOTE: the csv reader is fed the binary gzip stream directly, which is the
# Python 2 convention used throughout this notebook.
with gzip.open("GSE86922_Brodsky_GEO_processed.txt.gz") as gz:
    fp = reader(gz, dialect = excel_tab)
    # The first row is kept separately as the header.
    # next(fp) works on both Python 2.6+ and Python 3, unlike fp.next().
    header = next(fp)
    data = []
    annotations = []
    for row in fp:
        # The first four columns are kept as text annotations...
        annotations.append(row[:4])
        # ...and every remaining column is parsed as a float.
        data.append([float(i) for i in row[4:]])
In [4]:
# Convert the list of annotation rows into a numpy array of strings
anno = np.array(annotations)
# Display the first annotation row
anno[0]
Out[4]:
array(['ENSMUSG00000000001', '14679',
       'guanine nucleotide binding protein (G protein), alpha inhibiting 3',
       'Gnai3'], 
      dtype='|S139')
In [5]:
# Build two lookup tables keyed on the first field of each annotation row:
#   d1 -> the complete annotation row
#   d2 -> only the third field of the row
# Both are filled during a single pass over the rows.
d1 = dict()
d2 = dict()
for line in anno:
    d1[line[0]] = line
    d2[line[0]] = line[2]
In [6]:
d1["ENSMUSG00000000001"]
Out[6]:
array(['ENSMUSG00000000001', '14679',
       'guanine nucleotide binding protein (G protein), alpha inhibiting 3',
       'Gnai3'], 
      dtype='|S139')
In [7]:
d2["ENSMUSG00000000001"]
Out[7]:
'guanine nucleotide binding protein (G protein), alpha inhibiting 3'
In [8]:
# Read the clustered CDT file into a list of lines (trailing newlines kept).
# The `with` block closes the file handle once the lines are in memory
# instead of leaving that to the garbage collector.
with open("clustered1_cm_centered.cdt") as cdt_file:
    cdt = cdt_file.readlines()
In [27]:
cdt[1].split("\t")
Out[27]:
['EWEIGHT',
 '',
 '',
 '',
 '1.000000',
 '1.000000',
 '1.000000',
 '1.000000',
 '1.000000',
 '1.000000',
 '1.000000',
 '1.000000',
 '1.000000',
 '1.000000',
 '1.000000',
 '1.000000\n']
In [9]:
cdt[2]
Out[9]:
'GENE520X\tENSMUSG00000022221\tRipk3\t1.000000\t2.19086966825\t2.00234084225\t0.17825439925\t0.00667346725\t-0.05213574775\t-0.00872163275\t0.48554301925\t0.26040793925\t-0.85146982175\t-1.14610010875\t-1.72158431275\t-1.34407771175\n'
In [10]:
line = cdt[2].split("\t")
In [16]:
# Function-call form of print: prints the same text for a single argument on
# Python 2 and is also valid on Python 3 (the statement form is not).
print(line)
['GENE520X', 'ENSMUSG00000022221', 'Ripk3', '1.000000', '2.19086966825', '2.00234084225', '0.17825439925', '0.00667346725', '-0.05213574775', '-0.00872163275', '0.48554301925', '0.26040793925', '-0.85146982175', '-1.14610010875', '-1.72158431275', '-1.34407771175\n']
In [11]:
line[1]
Out[11]:
'ENSMUSG00000022221'
In [12]:
d1[line[1]]
Out[12]:
array(['ENSMUSG00000022221', '56532',
       'receptor-interacting serine-threonine kinase 3', 'Ripk3'], 
      dtype='|S139')
In [14]:
d2[line[1]]
Out[14]:
'receptor-interacting serine-threonine kinase 3'
In [18]:
x = [0,1,2,3,4,5]
In [19]:
x[:2]
Out[19]:
[0, 1]
In [20]:
x[2:]
Out[20]:
[2, 3, 4, 5]
In [23]:
# Concatenating slices builds a NEW list with an element inserted at index 2;
# x itself is left unchanged.
x[:2]+["newdata"]+x[2:]
Out[23]:
[0, 1, 'newdata', 2, 3, 4, 5]
In [24]:
x = [[0,1,2,3],
     ["a","b","c","d"]]
In [25]:
x[0]
Out[25]:
[0, 1, 2, 3]
In [26]:
x[0][0]
Out[26]:
0
In [31]:
# Single-quoted string: double quotes can appear inside without escaping
'this is a "string" you know'
# Double-quoted string: an apostrophe can appear inside without escaping
"Mark's code"
# Triple-quoted string: may contain both quote styles and span several lines.
# Only this last expression is echoed as the cell's output.
"""Now I can "use" both it's cool
and more
"""
Out[31]:
'Now I can "use" both it\'s cool\nand more\n'
In [29]:
def add_annotations(original, newfile, annotations):
    """Copy a CDT file, inserting an annotation column after the third column.

    Given original, the name of a CDT file in JavaTreeView extended CDT
    format, and annotations, a two dimensional array of annotations, create
    newfile, in CDT format, inserting an annotation column.

    Parameters:
        original:    path of the CDT file to read.
        newfile:     path of the CDT file to write (overwritten if it exists).
        annotations: sequence of rows; row[0] is the key matched against the
                     second column of each data line, and row[2] is the text
                     inserted into the new column.

    Raises:
        AssertionError: if the first line does not start with "GID" or the
                        second line does not start with "EWEIGHT".
        KeyError:       if a data line's second column has no annotation row.
        StopIteration:  if the input has fewer than two lines.

    Both files are closed on exit, including when one of the errors above is
    raised part-way through.
    """
    # Index annotations by their first field for constant-time lookup
    by_id = {}
    for row in annotations:
        by_id[row[0]] = row

    # `with` closes (and therefore flushes) both streams even on error
    with open(original) as fin, open(newfile, "w") as fout:
        # Header line: label the new column "desc".
        # next(fin) works on Python 2.6+ and Python 3, unlike fin.next().
        fields = next(fin).split("\t")
        assert fields[0] == "GID"
        fout.write("\t".join(fields[:3] + ["desc"] + fields[3:]))

        # Weight line: the new column gets an empty cell
        fields = next(fin).split("\t")
        assert fields[0] == "EWEIGHT"
        fout.write("\t".join(fields[:3] + [""] + fields[3:]))

        # Data lines: the last field keeps its trailing newline, so joining
        # the pieces reproduces the line terminator without extra handling.
        for line in fin:
            fields = line.split("\t")
            description = by_id[fields[1]][2]
            fout.write("\t".join(fields[:3] + [description] + fields[3:]))
In [32]:
# Write "annotated.cdt": a copy of the clustered CDT file with a description
# column inserted, looked up from the annotation rows read earlier.
add_annotations("clustered1_cm_centered.cdt","annotated.cdt",annotations)
In [33]:
# Read the annotated CDT file back as a list of lines to inspect the result.
# The `with` block closes the file handle once the lines are in memory.
with open("annotated.cdt") as annotated_file:
    annotated = annotated_file.readlines()
In [34]:
annotated[0]
Out[34]:
'GID\tUNIQID\tNAME\tdesc\tGWEIGHT\tWT_unstim_rep1\tWT_unstim_rep2\tRipk3_unstim_rep1\tRipk3_unstim_rep2\tRipk3Casp8_unstim_rep1\tRipk3Casp8_unstim_rep2\tWT_LPS.6hr_rep1\tWT_LPS.6hr_rep2\tRipk3_LPS.6hr_rep1\tRipk3_LPS.6hr_rep2\tRipk3Casp8_LPS.6hr_rep1\tRipk3Casp8_LPS.6hr_rep2\n'
In [35]:
annotated[666]
Out[35]:
'GENE205X\tENSMUSG00000014074\tRnf168\tring finger protein 168\t1.000000\t0.981951118417\t0.951597410417\t1.05795343842\t1.07080781942\t0.852469509417\t0.830498468417\t-1.07987330458\t-0.918813444583\t-0.989383952583\t-1.17451714858\t-0.802828409583\t-0.779861504583\n'
In [ ]: