%matplotlib notebook
import matplotlib.pyplot as plt

import gzip
from csv import reader, excel_tab
import numpy as np

# Read the gzipped, tab-delimited expression table.  Opening the file in a
# ``with`` block closes it deterministically when the block ends, instead of
# relying on ``del`` / garbage collection to release the handle.
# NOTE(review): on Python 3 gzip.open() defaults to binary mode, which the csv
# reader rejects; this cell is written for Python 2 like the rest of the file.
with gzip.open("GSE86922_Brodsky_GEO_processed.txt.gz") as gz_handle:
    fp = reader(gz_handle, dialect=excel_tab)
    # The first row is the column header; next() works on Python 2.6+ and 3.
    header = next(fp)
    data = []
    annotations = []
    for row in fp:
        # The first four columns are kept as text annotations ...
        annotations.append(row[:4])
        # ... and every remaining column is parsed as a float measurement.
        data.append([float(i) for i in row[4:]])
# The file is already closed; drop the helper names so nothing downstream
# accidentally reuses an exhausted reader or a closed handle.
del fp, gz_handle

# Convert the list of annotation rows (the first four columns of every data
# row) into a NumPy array, then display its first row.
anno = np.array(annotations)
anno[0]

array(['ENSMUSG00000000001', '14679',
       'guanine nucleotide binding protein (G protein), alpha inhibiting 3',
       'Gnai3'], 
      dtype='|S139')

# Build two lookup tables in a single pass, both keyed on the first
# annotation column: d1 maps the key to the whole annotation row, while
# d2 maps it to the third column only.
d1, d2 = {}, {}
for line in anno:
    d1[line[0]] = line
    d2[line[0]] = line[2]

d1["ENSMUSG00000000001"]

array(['ENSMUSG00000000001', '14679',
       'guanine nucleotide binding protein (G protein), alpha inhibiting 3',
       'Gnai3'], 
      dtype='|S139')

d2["ENSMUSG00000000001"]

'guanine nucleotide binding protein (G protein), alpha inhibiting 3'

# Read every line of the clustered CDT file into a list.  The ``with`` block
# closes the file as soon as the lines are in memory instead of leaving the
# handle open until it is garbage collected.
with open("clustered1_cm_centered.cdt") as cdt_file:
    cdt = cdt_file.readlines()

cdt[1].split("\t")

['EWEIGHT',
 '',
 '',
 '',
 '1.000000',
 '1.000000',
 '1.000000',
 '1.000000',
 '1.000000',
 '1.000000',
 '1.000000',
 '1.000000',
 '1.000000',
 '1.000000',
 '1.000000',
 '1.000000\n']

cdt[2]

'GENE520X\tENSMUSG00000022221\tRipk3\t1.000000\t2.19086966825\t2.00234084225\t0.17825439925\t0.00667346725\t-0.05213574775\t-0.00872163275\t0.48554301925\t0.26040793925\t-0.85146982175\t-1.14610010875\t-1.72158431275\t-1.34407771175\n'

# Split the third line of the CDT file (the first row after the GID and
# EWEIGHT header lines) into its tab-separated fields.  The last field keeps
# its trailing newline because the line is not stripped first.
line = cdt[2].split("\t")

# print(...) with a single argument produces the same output on Python 2 and
# is also valid Python 3, unlike the bare ``print line`` statement.
print(line)

['GENE520X', 'ENSMUSG00000022221', 'Ripk3', '1.000000', '2.19086966825', '2.00234084225', '0.17825439925', '0.00667346725', '-0.05213574775', '-0.00872163275', '0.48554301925', '0.26040793925', '-0.85146982175', '-1.14610010875', '-1.72158431275', '-1.34407771175\n']

line[1]

'ENSMUSG00000022221'

d1[line[1]]

array(['ENSMUSG00000022221', '56532',
       'receptor-interacting serine-threonine kinase 3', 'Ripk3'], 
      dtype='|S139')

d2[line[1]]

'receptor-interacting serine-threonine kinase 3'

# A small list used to demonstrate slicing.
x = [0,1,2,3,4,5]

# x[:2] is every element before index 2.
x[:2]

[0, 1]

# x[2:] is every element from index 2 to the end.
x[2:]

[2, 3, 4, 5]

# Concatenating the two slices around a one-element list builds a new list
# with the extra item inserted at index 2.
x[:2]+["newdata"]+x[2:]

[0, 1, 'newdata', 2, 3, 4, 5]

# A list of lists: the outer index picks a row, a second index picks an
# element within that row.
x = [[0,1,2,3],
     ["a","b","c","d"]]

# First row.
x[0]

[0, 1, 2, 3]

# First element of the first row.
x[0][0]

0

# Single-quoted strings may contain double quotes without escaping.
'this is a "string" you know'
# Double-quoted strings may contain an apostrophe without escaping.
"Mark's code"
# Triple-quoted strings may contain both kinds of quote and span several lines.
"""Now I can "use" both it's cool
and more
"""

'Now I can "use" both it\'s cool\nand more\n'

def add_annotations(original, newfile, annotations):
    """Given original, the name of a CDT file in JavaTreeView extended CDT format,
    and annotations, a two dimensional array of annotations, create newfile,
    in CDT format, inserting an annotation column.

    The new column is inserted after the third column of every line.  In the
    "GID" header line it is labelled "desc", in the "EWEIGHT" line it is left
    empty, and in every following line it holds the third field of the
    annotation row whose first field equals that line's second column.

    Parameters:
        original: path of the CDT file to read.  Its first line must start
            with "GID" and its second line with "EWEIGHT".
        newfile: path of the file to write (opened in "w" mode, so any
            existing content is replaced).
        annotations: iterable of indexable rows; row[0] is the lookup key and
            row[2] is the text that gets inserted.

    Returns:
        None.  The result is written to newfile.

    Raises:
        AssertionError: if either header line does not start with the
            expected tag.
        StopIteration: if original has fewer than two lines.
        KeyError: if a data line's second column has no matching annotation.
    """
    # Index annotations by their first column so each data row needs only a
    # single dictionary lookup.
    by_id = {}
    for row in annotations:
        by_id[row[0]] = row

    # Both files are managed by the with-statement, so they are closed (and
    # the output flushed to disk) even if a header check or lookup fails.
    with open(original) as fin, open(newfile, "w") as fout:
        # The two header lines are handled identically apart from the
        # expected leading tag and the text placed in the new column.
        for tag, inserted in (("GID", "desc"), ("EWEIGHT", "")):
            # next(fin) instead of fin.next(): works on Python 2.6+ and 3.
            fields = next(fin).split("\t")
            assert fields[0] == tag
            # The last field still carries the line's newline, so joining
            # the fields back together reproduces a complete line.
            fout.write("\t".join(fields[:3] + [inserted] + fields[3:]))

        # Every remaining line is a data row keyed by its second column.
        for text in fin:
            fields = text.split("\t")
            description = by_id[fields[1]][2]
            fout.write("\t".join(fields[:3] + [description] + fields[3:]))

# Write an annotated copy of the clustered CDT file.
add_annotations("clustered1_cm_centered.cdt","annotated.cdt",annotations)

# Read the annotated file back for inspection.  The ``with`` block closes the
# file once its lines are in memory rather than leaking the handle.
with open("annotated.cdt") as annotated_file:
    annotated = annotated_file.readlines()

annotated[0]

'GID\tUNIQID\tNAME\tdesc\tGWEIGHT\tWT_unstim_rep1\tWT_unstim_rep2\tRipk3_unstim_rep1\tRipk3_unstim_rep2\tRipk3Casp8_unstim_rep1\tRipk3Casp8_unstim_rep2\tWT_LPS.6hr_rep1\tWT_LPS.6hr_rep2\tRipk3_LPS.6hr_rep1\tRipk3_LPS.6hr_rep2\tRipk3Casp8_LPS.6hr_rep1\tRipk3Casp8_LPS.6hr_rep2\n'

annotated[666]

'GENE205X\tENSMUSG00000014074\tRnf168\tring finger protein 168\t1.000000\t0.981951118417\t0.951597410417\t1.05795343842\t1.07080781942\t0.852469509417\t0.830498468417\t-1.07987330458\t-0.918813444583\t-0.989383952583\t-1.17451714858\t-0.802828409583\t-0.779861504583\n'