# Show the current working directory (IPython magic).
%pwd
# Change into the ~/BMS270/data/ directory, where the files used below are expected.
%cd ~/BMS270/data/
# List the directory contents twice: once with the IPython magic, once via a shell escape.
%ls
!ls
# Hex dump of the first 64 bytes of the CSV file, to inspect its raw bytes.
!head -c 64 sample_table.csv | hexdump -C
Converting among hexadecimal, decimal, and strings in Python
# Hexadecimal integer literal; evaluates to the int 82.
0x52
# Integer code point -> one-character string ("R").
chr(0x52)
# Hex escape inside a string literal: the same character, "R".
"\x52"
# Character -> integer code point (82).
ord("R")
# Format the code point as lowercase hexadecimal text ("52").
"%x" % ord("R")
# Name of the CSV file to explore.
fname = "sample_table.csv"
fname
type(fname)
# open() with no mode argument opens the file for reading in text mode.
fp = open(fname)
fp
# Each readline() call consumes and returns the next line, terminator included.
fp.readline()
fp.readline()
fp.readline()
# rstrip() removes trailing whitespace, which includes the line terminator.
fp.readline().rstrip()
# Naive parsing: split the stripped line on commas (quoted fields containing
# commas would be split incorrectly).
fp.readline().rstrip().split(",")
from csv import reader
# The reader pulls lines from fp's current position, i.e. after the five
# lines already consumed by the readline() calls above.
r = reader(fp)
next(r)
# Keep one parsed row (a list of field strings) for inspection.
row = next(r)
row
# Hex dump of the first 64 bytes, to see which line terminator this file uses.
!head -c 64 sample_table.oldmac | hexdump -C
fname = "sample_table.oldmac"
# Default newline=None enables universal-newlines mode: "\r", "\n" and "\r\n"
# all end a line and are returned to the caller as "\n".
fp = open(fname)
fp.readline()
fp
# newline="\n": only "\n" terminates a line and endings are returned untranslated.
# NOTE(review): presumably this file uses bare "\r" terminators (classic Mac OS),
# in which case this readline() returns much more than one row - confirm
# against the hexdump output.
fp = open(fname,newline="\n")
fp.readline()
# Hex dump of the first 64 bytes, to see which line terminator this file uses.
!head -c 64 sample_table.windows | hexdump -C
fname = "sample_table.windows"
# Default newline=None: universal-newlines mode, every line ending is
# translated to "\n" before being returned.
fp = open(fname)
fp.readline()
# newline="\n": lines end only at "\n" and are returned untranslated.
# NOTE(review): presumably this file uses "\r\n" terminators, so the returned
# line should end in "\r\n" here - confirm against the hexdump output.
fp = open(fname,newline="\n")
fp.readline()
# Hex dump of the first 64 bytes of the gzip-compressed FASTQ file.
!head -c 64 SRR7541452.6M.fastq.gz | hexdump -C
fname = "SRR7541452.6M.fastq.gz"
# Default text mode: the compressed bytes are decoded as if they were text.
# NOTE(review): presumably this readline() raises a decode error or returns
# garbage, which is the point of the demonstration - confirm.
fp = open(fname)
fp.readline()
# Reopen in binary mode so reads return the raw bytes.
fp = open("SRR7541452.6M.fastq.gz","rb")
fp
fp.read(10)
# Rewind to the start and keep the first 10 bytes for inspection.
fp.seek(0)
s = fp.read(10)
s
# Bytes 4..7 of the header; the statements that follow interpret them as a timestamp.
# NOTE(review): presumably the gzip MTIME field (RFC 1952) - confirm.
s[4:8]
import time

# Interpret header bytes 4..7 as an unsigned little-endian integer
# (least-significant byte first).
# NOTE(review): presumably this is the gzip MTIME field (RFC 1952) - confirm.
t = int.from_bytes(s[4:8], "little")
t
# Render the value as a local-time timestamp. "%S" (capital) is the seconds
# directive; lowercase "%s" is not a portable strftime directive and yields
# epoch seconds or an error depending on the platform.
time.strftime("%Y-%m-%d %H:%M:%S", time.localtime(t))
# Done with the raw binary handle.
fp.close()
import gzip

# Binary-mode handle: gzip decompresses on the fly and readline() returns bytes.
fp = gzip.open(fname, "rb")
fp
fp.readline()
fp.readline()
fp.readline()
fp.readline()

# Text-mode handle: the decompressed bytes are decoded, so readline() returns str.
fp = gzip.open(fname, mode="rt")
fp
fp.readline()

# Print the next ten lines. Each line keeps its own terminator, so print()
# adds a second newline after it.
for i in range(10):
    print(fp.readline())
Every 4th line of the FASTQ file gives the per-base quality scores, one character per base; each score $Q$ encodes the probability $P$ that the base was miscalled.
$P = 10^{\frac{-Q}{10}}$
$Q = ord(C)-q_0$
$q_0 = 33$ (or 64 for older Illumina files)
# Offset subtracted from a quality character's code to obtain its score.
q0 = 33

# Worked example 1: quality character "D".
C = "D"
Q = ord(C) - q0
P = pow(10, -Q / 10)  # miscall probability, P = 10^(-Q/10)
P

# Worked example 2: quality character "I".
C = "I"
Q = ord(C) - q0
P = pow(10, -Q / 10)
P
# Rewind to the start of the file and read the first record by hand:
# four consecutive lines, each stripped of trailing whitespace.
fp.seek(0)
header = fp.readline().rstrip()
print(header)
seq = fp.readline().rstrip()
print(seq)
header2 = fp.readline().rstrip()
print(header2)
quals = fp.readline().rstrip()
print(quals)

# One score per quality character: character code minus the offset q0.
Qs = [ord(ch) - q0 for ch in quals]
print(Qs)
# Length of the sequence line, for comparison with the number of scores.
len(seq)
%matplotlib nbagg
import matplotlib.pyplot as plt
fig = plt.figure()
plt.plot(Qs)
for i in range(20):
header = fp.readline().rstrip()
seq = fp.readline().rstrip()
header2 = fp.readline().rstrip()
quals = fp.readline().rstrip()
Qs = []
for C in quals:
Q = ord(C)-q0
Qs.append(Q)
plt.plot(Qs)