#!/usr/local/bin/python
# Time-stamp: <FastaFile.py 2008-02-19 14:55:56 Mark Voorhies>
# Test script to load and browse a genome in FASTA format

def complement(sequence):
    """Return the reverse complement of a nucleotide sequence.

    The input is read from its last character to its first, and each
    base is replaced by its Watson-Crick partner (A<->T, C<->G, N->N).
    Case is preserved.  Characters that are not one of ``ACGTN`` /
    ``acgtn`` are dropped from the result rather than raising an error.

    Args:
        sequence: String of nucleotide characters.

    Returns:
        A new string holding the reverse complement of ``sequence``;
        the empty string when ``sequence`` is empty.
    """
    comp = {"a": "t", "t": "a", "c": "g", "g": "c", "n": "n",
            "A": "T", "T": "A", "C": "G", "G": "C", "N": "N"}
    # Filter unknown characters explicitly instead of swallowing every
    # exception, and join once instead of growing a string with +=.
    return "".join(comp[base] for base in reversed(sequence) if base in comp)

class FastaFile:
    """Name-indexed, in-memory view of the records in a FASTA file.

    Attributes set by the constructor:
        seqs: dict mapping each record name (the first
            whitespace-delimited token of its header) to its sequence
            with every non-letter character removed.
        headers: dict mapping each record name to its full header line
            (without the leading '>' and surrounding whitespace).

    ``len(obj)`` is the number of stored records and ``obj[name]``
    returns the sequence stored under ``name``.
    """

    def __init__(self, file):
        """Parse FASTA text and populate ``seqs`` and ``headers``.

        Args:
            file: Either an object with a ``read()`` method (e.g. a
                builtin file object or a gzip file object) whose
                ``read()`` returns the whole FASTA text, or the FASTA
                text itself as a string.

        If two records share the same name, a warning is printed and
        the later record replaces the earlier one.  A record whose
        header is empty (a bare '>' at the end of the input) has no
        name and is skipped.
        """
        import re

        # Make a compiled regular expression to parse the file
        # (Note: This is __way__ faster than parsing line by line)
        fasta_re = re.compile(
            # Header lines are marked by the '>' sign
            # We allow headers to begin in the middle of a line
            # We remove whitespace at the ends of the header
            # NOTE: the whitespace skipped after '>' may include
            # newlines, so after an empty header line the next
            # non-blank line is taken as the header.
            r">[\s]*(?P<header>.*?)[\s]*$" +
            # Sequence is anything that is not a header line
            # We currently count any text in comments (';.*$') as
            # sequence (could parse this out at the same time as
            # whitespace)
            r"(?P<seq>[^>]*)",
            # Use multiline mode to parse an entire FASTA file in one go
            re.M)
        # Record name: the leading run of non-whitespace in the header.
        name_re = re.compile(r"^(?P<name>[\S]+)")
        # Everything that is not a letter is stripped from the sequence.
        seq_re = re.compile(r"[^A-Za-z]+")

        # Duck-type the input: anything offering read() is treated as a
        # file-like object, anything else as the FASTA text itself.
        if hasattr(file, "read"):
            text = file.read()
        else:
            text = file

        self.seqs = {}
        self.headers = {}
        # findall yields one (header, raw_sequence) tuple per record.
        for header, raw_seq in fasta_re.findall(text):
            name_match = name_re.search(header)
            if name_match is None:
                # Empty header: there is no name to file the record under.
                continue
            k = name_match.group("name")
            if k in self.seqs:
                # Single-argument form prints the same text on
                # Python 2 and Python 3.
                print("WARNING: overwriting %s" % k)
            # Note: could use a user-supplied re for garbage removal
            self.seqs[k] = seq_re.sub("", raw_seq)
            self.headers[k] = header

    # Mod this and __init__ so that it can be used for persistence
    def __repr__(self):
        """Return a debug string that embeds the repr of ``seqs``."""
        return "<FASTA file object " + repr(self.seqs) + ">"

    def __str__(self):
        """Return a one-line summary with the number of sequences."""
        return "FASTA file with %d sequences" % (len(self.seqs))

    def __len__(self):
        """Return the number of stored sequences."""
        return len(self.seqs)

    def __getitem__(self, i):
        """Return the sequence stored under record name ``i``.

        Raises:
            KeyError: if no record with that name was parsed.
        """
        return self.seqs[i]