#!/usr/bin/env python
# Time-stamp: <Lucien.py 2013-03-06 16:12:17 Mark Voorhies>

"""
Identify Hc NPS/PKS domain substrate specificity by searching G217B
with hmm model against available fungal genomes, constructing a tree
and pulling literature for co-clustering domains
"""

import subprocess
import sys
from optparse import OptionParser

from HmmerTools import HmmSearch 
from FastaFile import FastaFile
from Sequence import ProteinSequence
from GenomeFactory import GenomeFactory
from ClustalTools import MultipleAlignment, NewHampshireGraph

import time

import os.path

class HmmSearchTarget:
    """An HmmSearchTarget is a set of protein sequences suitable for
    searching by hmmsearch.

    Minimally, an HmmSearchTarget can emit a stream of FASTA formatted
    sequences for searching, where the FASTA headers contain gene names
    that play nicely with downstream phylogenetic analysis (e.g., short
    enough to play well with CLUSTAL, source organism/strain easy to identify,
    etc.)
    """
    def __init__(self, fastafile, name = None):
        """Construct from path to FASTA formatted protein file."""
        self.fastafile = fastafile
        self.name = name
        if(self.name is None):
            start = self.fastafile.rfind("/")+1
            stop = self.fastafile.rfind("_pep")
            if(stop > start):
                self.name = self.fastafile[start:stop]
            else:
                self.name = "Untitled"

    def fasta_stream(self):
        return open(self.fastafile)

class DbHmmSearchTarget(HmmSearchTarget):
    """An HmmSearchTarget using BLASTP databases (as pointed
    to by the Genome.sql database) as its sequence source."""
    def __init__(self, genome_name, db = None, name = None):
        self.db = db
        if(db is None):
            # All we want from GenomeFactory is a lightweight
            # list of protein BLAST databases
            self.db = GenomeFactory(geneset_caching = "cached",
                                    assembly_caching = "lazy")
        else:
            self.db = db
            
        self.genome = self.db.getGenome(genome_name)
        self.blastdb = self.genome.proteinBlastDb()

        if(name is None):
            self.name = self.genome.Name()
        else:
            self.name = name

        # This is a place to hold references to fasta_streams to avoid
        # closing them early.
        # TODO: make sure that these don't hang around forever
        self.pipes = []

    def fasta_stream(self):
        p = subprocess.Popen(("fastacmd", "-d", self.blastdb, "-D","1"),
                             stdout = subprocess.PIPE)
        self.pipes.append(p)
        return p.stdout

def tree_pipeline(jobname, querydomain, full_fasta_name, hit_names,
                  bootstrap):
    hmmalign = ".".join((jobname,"hmmalign"))
    subprocess.check_call(("hmmalign",
                           "-o", hmmalign,
                           "--informat", "fasta",
                           querydomain,
                           full_fasta_name))

    # Convert from Stockholm to Clustal format, clipping to just
    # the domain alignment

    from hashlib import sha1
        
    hmmaln = ".".join((jobname,"aln"))
    alignment = MultipleAlignment.fromStockholm(
        open(hmmalign))

    from MsvUtil import revdict
    class NameMapper:
        def __init__(self, names):
            self.long2short = dict(
                (i, sha1(i).hexdigest()[:12])
                for i in names)
            self.short2long = revdict(self.long2short)

        def to_short(self, name):
            return self.long2short[name]
        
        def to_long(self, name):
            return self.short2long[name]

    name_mapper = NameMapper(hit_names)

    rf = alignment.colAnnotations["RF"]
    start = rf.find("x")
    assert(start > -1)
    stop = rf.rfind("x")+1

    motifaln = alignment[start:stop]

    motifaln.remap_names(name_mapper.to_short)

    motifaln.writeClustal(open(hmmaln,"w"))

    motifaln.remap_names(name_mapper.to_long)

    # Generate bootstrapped neighbor-joining tree
    if(bootstrap == 0):
        (njmode, njsuffix) = ("-tree","ph")
    else:
        (njmode, njsuffix) = ("-bootstrap=%d" % bootstrap, "phb")

    subprocess.check_call(("clustalw",
                           "-infile=%s.aln" % jobname,
                           "-type=PROTEIN",
                           njmode))

    # Reformat tree for viewing in JavaTreeView

    tree = NewHampshireGraph.fromPhb(open(".".join((jobname,njsuffix))),
                                     mapper = name_mapper.to_long)
    tree.setRoot()
    tree.makeBtree()

    # (cdt,gtr) corresponding to just the motif hit
    gtr = ".".join((jobname,"gtr"))
    cdt = ".".join((jobname,"cdt"))
    tree.writeGtr(open(gtr,"w"))
    tree.writeCdt(open(cdt,"w"),motifaln)

    # (cdt,gtr) with tree from motif hit but full hmm alignment
    gtr = ".".join((jobname,"full","gtr"))
    cdt = ".".join((jobname,"full","cdt"))
    tree.writeGtr(open(gtr,"w"))
    tree.writeCdt(open(cdt,"w"),alignment)

if(__name__=="__main__"):
    #--------------------------------------------------
    # Parse input
    #--------------------------------------------------

    parser = OptionParser(
        usage = "usage: %prog [options] query.hmm [target1 [target2 [...]]]")

    parser.add_option(
        "-j", "--jobname", dest = "jobname",
        help = "Prefix output files with JOBNAME",
        metavar = "JOBNAME",
        default = None)

    parser.add_option(
        "-E", "--expect", dest = "expect",
        help = "E-value threshold, passed to the -E option in hmmsearch",
        metavar = "EXPECT",
        type = "float",
        default = 1e-6)

    parser.add_option(
        "-F", "--fastacmd", dest = "fastacmd",
        help = "path to NCBI fastacmd executable",
        metavar = "FASTACMD",
        default = "fastacmd")

    parser.add_option(
        "-H", "--hmmsearch", dest = "hmmsearch",
        help = "path to HMMer3 hmmsearch executable",
        metavar = "HMMSEARCH",
        default = "hmmsearch")

    parser.add_option(
        "-t", "--tree", dest = "tree",
        help = "If given, generate neighbor-joining tree formatted for JavaTreeView.",
        action = "store_true")

    parser.add_option(
        "-b", "--bootstrap", dest = "bootstrap",
        help = "Number of bootstrap runs for neighbor-joining tree.  0 to generate a tree without bootstrapping",
        type = "int",
        default = 1000)

    parser.add_option(
        "-G", "--use-genomedb", dest = "use_genomedb",
        help = "If given, load genomes from database.  Default is to load from path to FASTA files.",
        action = "store_true")

    parser.add_option(
        "-g", "--genomedir", dest = "genomedir",
        help = "Default prefix for paths to gene set FASTA files.  Ignored if genome database is used.",
        metavar = "GENOMEDIR",
        # N.B.: In a default installation of HistoBase, we do not expect
        #       uncompressed gene set FASTA files to live _anywhere_ on
        #       the system.
        default = "/home/ahenderson/Laboratory/Labs/Sil/Database/HistoBase/data/genomes/")

    (options, args) = parser.parse_args()

    if(len(args) < 1):
        parser.print_help()
        sys.exit(1)

    # Get query domain
    #   TODO: possible sources = hmm file, pfam ID for fetch from local
    #         repository or web
    
    querydomain = args[0]
    targets = args[1:]

    # Get list of target genomes.  Since there are several possible routes
    # for specifying targets, we will dispatch to route-dependent factory
    # functions.

    if(options.use_genomedb):
        print "Loading gene sets from database"
        # All we want from GenomeFactory is a lightweight
        # list of protein BLAST databases
        factory = GenomeFactory(geneset_caching = "cached",
                                assembly_caching = "lazy")
        genesets = [DbHmmSearchTarget(genome, factory)
                    for genome in factory.genomeNames(annotated = True)
                    if((len(targets) == 0) or
                       (genome in targets))]
        print "Loaded %d annotated gene sets" % len(genesets)

    else:
        genomedir = options.genomedir

        genesets = [HmmSearchTarget(i.rstrip()) for i in
                    Popen(("find",genomedir,"-iname","*.fasta"),
                          stdout = PIPE).stdout]

        if(len(targets) > 0):
            genesets = [i for i in genesets if(i.name in targets)]

    # Path to hmmer3 hmmsearch program
              
    hmmsearch = options.hmmsearch

    # Output file names
    if(options.jobname):
        jobname = options.jobname
    else:
        if(querydomain.endswith(".hmm")):
            jobname = querydomain[:-4]
        else:
            jobname = querydomain

    #--------------------------------------------------
    # Find and process domain hits for each target
    #--------------------------------------------------

    # open output fasta files
    domain_fasta = open(jobname+".domain-hits.fasta","w")
    full_fasta_name = jobname+".full-hits.fasta"
    full_fasta = open(full_fasta_name,"w")

    hit_names = []

    gff = open(jobname+".gff","w")
    
    for geneset in genesets:
        name = geneset.name
        
        print "Searching %s..." % name

        p = subprocess.Popen((hmmsearch,"-E",str(options.expect),
                              querydomain,"-"),
                             stdin = geneset.fasta_stream(),
                             stdout = subprocess.PIPE)

        search_result = HmmSearch(p.stdout,"hmmer3")
        print len(search_result.hits)
        print search_result.acc2hit.keys()

        # TODO: all writeSvg needs is the lengths -- rewrite the interface
        #       to make this more obvious
        fasta = FastaFile(geneset.fasta_stream())
        # TODO: writeSvg currently crashes on empty input -- fix this!
        if(len(search_result.hits) > 0):
            search_result.writeSvg(open(jobname+"."+name+".svg","w"), fasta)
            search_result.writeGff(gff, prefix = jobname)
        for hit in search_result.hits:
            #pull entire hit sequence
            sequence=fasta[hit.accession]
            #pull accession ID for this sequence
            accession = hit.accession
            if(accession.startswith("lcl|")):
                accession = accession[4:]
            hit_names.append(accession)
            full_fasta.write(
                ProteinSequence(sequence).FormatFasta(name = accession))
            count=1
            for domain in hit.domains:
                #pull domain in target sequence that aligned with hmm and pull that sequence for collection.
                domainseq=sequence[domain.seq_f-1:domain.seq_t]
                #assign a systematic name for these domains that includes accession ID and domain indicator
                name="%s_A-%d" % (accession,count)
                count=count+1
                domain_fasta.write(">%s\n"%name)
                domain_fasta.write(domainseq+"\n")
                #write output to a fasta file
    #close fasta files
    domain_fasta.close()
    full_fasta.close()
    gff.close()
                          
    #--------------------------------------------------
    # Align full sequence hits to domain and generate
    # bootstrapped neighbor-joining tree
    #--------------------------------------------------

    if(options.tree):
        tree_pipeline(jobname, querydomain, full_fasta_name, hit_names,
                      options.bootstrap)