from urllib.request import urlopen

fp = urlopen("http://histo.ucsf.edu/BMS270/BMS270_2018/data/supp2data.cdt")

fp.readline()

b'ORF\tNAME\talpha 0\talpha 7\talpha 14\talpha 21\talpha 28\talpha 35\talpha 42\talpha 49\talpha 56\talpha 63\talpha 70\talpha 77\talpha 84\talpha 91\talpha 98\talpha 105\talpha 112\talpha 119\tElu 0\tElu 30\tElu 60\tElu 90\tElu 120\tElu 150\tElu 180\tElu 210\tElu 240\tElu 270\tElu 300\tElu 330\tElu 360\tElu 390\tcdc15 10\tcdc15 30\tcdc15 50\tcdc15 70\tcdc15 90\tcdc15 110\tcdc15 130\tcdc15 150\tcdc15 170\tcdc15 190\tcdc15 210\tcdc15 230\tcdc15 250\tcdc15 270\tcdc15 290\tspo 0\tspo 2\tspo 5\tspo 7\tspo 9\tspo 11\tspo5 2\tspo5 7\tspo5 11\tspo- early\tspo- mid\theat 0\theat 10\theat 20\theat 40\theat 80\theat 160\tdtt 15\tdtt 30\tdtt 60\tdtt 120\tcold 0\tcold 20\tcold 40\tcold 160\tdiau a\tdiau b\tdiau c\tdiau d\tdiau e\tdiau f\tdiau g\r\n'

fp.readline()

b'YBR166C\tTYR1   TYROSINE BIOSYNTHESIS    PREPHENATE DEHYDROGENASE (NADP+\t0.33\t-0.17\t0.04\t-0.07\t-0.09\t-0.12\t-0.03\t-0.2\t-0.06\t-0.06\t-0.14\t-0.18\t-0.06\t-0.25\t0.06\t-0.12\t0.25\t0.43\t0.21\t-0.04\t-0.15\t-0.04\t0.21\t-0.14\t-0.03\t-0.07\t-0.36\t-0.14\t-0.42\t-0.34\t-0.23\t-0.17\t0.23\t0.3\t0.41\t-0.07\t-0.23\t-0.12\t0.16\t0.74\t0.14\t-0.49\t-0.32\t0.19\t0.23\t0.24\t0.28\t1.13\t-0.12\t0.1\t\t0.66\t0.62\t0.08\t0.62\t0.43\t0.5\t-0.25\t-0.51\t-0.67\t0.21\t-0.74\t-0.36\t-0.01\t0.38\t0.15\t-0.22\t-0.09\t0.33\t0.08\t0.39\t-0.17\t0.23\t0.2\t0.2\t-0.17\t-0.69\t0.14\t-0.27\r\n'

fp.close()

from cdt_reader import parse_cdt
from stats import pearson

hello,world

(colnames,genes,annotations,data) = parse_cdt("supp2data.cdt",0.)

def dist_matrix(data):
    matrix = []
    for i in data:
        row = []
        for j in data:
            row.append(pearson(i,j))
        matrix.append(row)
    return matrix

dist_matrix(data[:3])

[[1.0, 0.12164363213592637, 0.357184444363415],
 [0.12164363213592637, 1.0, 0.2496676487356645],
 [0.357184444363415, 0.2496676487356645, 1.0]]

def dist_matrix2(data):
    matrix = []
    row = []
    for i in data:
        for j in data:
            row.append(pearson(i,j))
        matrix.append(row)
    return matrix

dist_matrix2(data[:3])

[[1.0,
  0.12164363213592637,
  0.357184444363415,
  0.12164363213592637,
  1.0,
  0.2496676487356645,
  0.357184444363415,
  0.2496676487356645,
  1.0],
 [1.0,
  0.12164363213592637,
  0.357184444363415,
  0.12164363213592637,
  1.0,
  0.2496676487356645,
  0.357184444363415,
  0.2496676487356645,
  1.0],
 [1.0,
  0.12164363213592637,
  0.357184444363415,
  0.12164363213592637,
  1.0,
  0.2496676487356645,
  0.357184444363415,
  0.2496676487356645,
  1.0]]

d = dist_matrix2(data[:3])
for i in d:
    print(i)

[1.0, 0.12164363213592637, 0.357184444363415, 0.12164363213592637, 1.0, 0.2496676487356645, 0.357184444363415, 0.2496676487356645, 1.0]
[1.0, 0.12164363213592637, 0.357184444363415, 0.12164363213592637, 1.0, 0.2496676487356645, 0.357184444363415, 0.2496676487356645, 1.0]
[1.0, 0.12164363213592637, 0.357184444363415, 0.12164363213592637, 1.0, 0.2496676487356645, 0.357184444363415, 0.2496676487356645, 1.0]

rows = []
a = [1,2,3]
rows.append(a)
a += [4,5,6]
rows.append(a)
print(rows)

[[1, 2, 3, 4, 5, 6], [1, 2, 3, 4, 5, 6]]

a[3] = "spam"
rows

[[1, 2, 3, 'spam', 5, 6], [1, 2, 3, 'spam', 5, 6]]

rows = []
a = [1,2,3]
rows.append(a[:])
a += [4,5,6]
rows.append(a[:])
print(rows)

[[1, 2, 3], [1, 2, 3, 4, 5, 6]]

def write_correlation_cdt(genes,annotations,matrix,fname):
    out = open(fname,"w")
    out.write("\t".join(["ORF","NAME"]+genes)+"\n")
    for (gene,anno,row) in zip(genes,annotations,matrix):
        srow = []
        for j in row:
            srow.append(str(j))
        out.write("\t".join([gene,anno]+srow)+"\n")
    out.close()

matrix = dist_matrix(data[:100])

write_correlation_cdt(genes[:100],annotations[:100],matrix,"corr.cdt")

import Bio.Cluster as Pycluster

%who

Pycluster	 a	 annotations	 colnames	 d	 data	 dist_matrix	 dist_matrix2	 fp	 
genes	 i	 matrix	 parse_cdt	 pearson	 rows	 urlopen	 write_correlation_cdt

import numpy as np

d = np.array(data)

d

array([[ 0.33, -0.17,  0.04, ..., -0.69,  0.14, -0.27],
       [-0.64, -0.38, -0.32, ...,  0.04, -0.97, -1.79],
       [-0.23,  0.19, -0.36, ..., -0.47, -0.43, -1.06],
       ..., 
       [ 0.08,  0.66,  0.49, ..., -0.47,  0.77, -0.12],
       [ 0.06, -0.2 ,  0.26, ...,  0.29, -0.25,  0.4 ],
       [ 0.07, -0.04,  0.12, ...,  0.28,  0.41, -0.1 ]])

d.shape

(2467, 79)

d.dtype

dtype('float64')

%%time
dist = Pycluster.distancematrix(d, dist = "u")

CPU times: user 524 ms, sys: 12 ms, total: 536 ms
Wall time: 535 ms

%%time
tree = Pycluster.treecluster(distancematrix = dist, method = "m")

CPU times: user 1.88 s, sys: 4 ms, total: 1.88 s
Wall time: 1.87 s

record = Pycluster.Record()
# Restore "None" values
record.data = d
record.geneid = genes
record.genename = annotations
record.gweight = None
record.gorder = None
record.expid = colnames
record.eweight = [1.]*len(colnames)
record.eorder = None
record.uniqid = "UNIQID"
record.save("clustered1_um", geneclusters = tree)

!ls -lhrt | tail

-rw-rw-r-- 1 bms270 bms270 3.4K Apr  5 14:51 cdt_reader.py
-rw-rw-r-- 1 bms270 bms270 4.2K Apr  5 15:47 example.csv
-rw-rw-r-- 1 bms270 bms270 216K Apr  5 16:06 Day3.ipynb
drwxrwxr-x 2 bms270 bms270 4.0K Apr  6 09:53 __pycache__
-rw-rw-r-- 1 bms270 bms270 118M Apr  6 14:03 correlation.cdt
-rw-rw-r-- 1 bms270 bms270 6.1K Apr  6 14:08 Untitled1.ipynb
-rw-rw-r-- 1 bms270 bms270 197K Apr  6 14:37 corr.cdt
-rw-rw-r-- 1 bms270 bms270 106K Apr  6 15:45 clustered1_um.gtr
-rw-rw-r-- 1 bms270 bms270 1.3M Apr  6 15:45 clustered1_um.cdt
-rw-rw-r-- 1 bms270 bms270  13K Apr  6 15:46 Day4.ipynb