#!/usr/bin/env python
"""
Called by the PENPP web server to calculate entropy values (and optionally email them to users).
By HUI GUO
Mar. 8 2012
"""
import MySQLdb
import os, sys, re
import math
import smtplib
import calentropy
from Bio import SeqIO
from email.mime.text import MIMEText
from subprocess import call
import mimetypes
from email.MIMEMultipart import MIMEMultipart
from email.MIMEBase import MIMEBase
from email.MIMEText import MIMEText
from email.MIMEAudio import MIMEAudio
from email.MIMEImage import MIMEImage
from email import Encoders
from email.Encoders import encode_base64

def BLAST(infile, outpath, db="/home/hui/Documents/Django_project/snpdb/search/PENPP/protein_database/Proteins.fasta"):
    """
    input: protein sequences in fasta format.
    output: path of the BLAST output file (outpath + '.blastout').
    function: search input protein sequences against known proteins.

    infile  -- path of the query FASTA file.
    outpath -- path prefix; the tabular BLAST report is written to
               outpath + '.blastout'.
    db      -- path of the protein database passed to blastp; defaults to the
               PENPP protein database used by the web server.

    blastp is run with an e-value cutoff of 1e-3 and tabular output
    (-outfmt 6). Its exit status is not inspected, so the returned path may
    not exist if blastp failed.

    Raises OSError if the blastp executable cannot be started.
    """
    blastout = outpath + '.blastout'
    # Pass the arguments as a list (no shell): infile is derived from an
    # uploaded file name, so it must never be interpreted by a shell.
    call([
        "blastp",
        "-query", infile,
        "-db", db,
        "-out", blastout,
        "-evalue", "1e-3",
        "-outfmt", "6",
    ])
    return blastout

def Muscle(blastout, queryseqfile, outpath, curs):
    """
    input: BLAST output.
    output: list of result.
    function: get orthologous group based on best hit from blastout; align query sequence to existing alignment of orthologous group; return result.

    blastout     -- path of the tab-separated BLAST report; the first line
                    seen for each query id is taken as its best hit.
    queryseqfile -- path of the query FASTA file. Each record id must have the
                    form '<gene>,<mutation>' where the mutation is
                    <original aa><position><mutated aa>.
    outpath      -- path prefix for the scratch files '.existingaln',
                    '.newseq' and '.muscleout' (reused for every query).
    curs         -- database cursor used to query the 'alignments' table;
                    assumed to accept '%s' placeholders with a parameter
                    tuple, as MySQLdb cursors do -- TODO confirm against
                    calentropy.Connectdb.

    Returns (rslt, norslt): rslt holds whatever calentropy.CalEntropy returns
    for each query that could be scored; norslt holds
    [gene, original aa, position, mutated aa, '-'] for each query that had no
    BLAST hit, no stored alignment, a failed MUSCLE run, or a scoring error.

    Raises ValueError / IndexError if a record id is not '<gene>,<mutation>',
    and OSError if the muscle executable cannot be started.
    """
    qid2seq={}
    with open(queryseqfile) as fp:
        for seq_record in SeqIO.parse(fp, "fasta"):
            qid2seq[seq_record.id]=str(seq_record.seq)

    qid2besthit={}
    with open(blastout) as fp:
        for line in fp:
            arr=line.rstrip('\n').split('\t')
            if len(arr) < 2:
                # blank or truncated line: nothing to record
                continue
            if arr[0] not in qid2besthit:
                # Drop the first 8 characters of the subject id; presumably a
                # fixed-width database prefix -- TODO confirm.
                qid2besthit[arr[0]] = arr[1][8:]

    rslt=[]
    norslt=[]
    out1=outpath+'.existingaln'
    out2=outpath+'.newseq'
    muscle_out=outpath+'.muscleout'
    for qid, seq in qid2seq.items():
        qid_new,mut=qid.split(',')
        oriaa=mut[0]
        mutaa=mut[-1]
        pos=mut[1:-1]
        no_result_row=[qid_new, oriaa, pos, mutaa, '-']

        if qid not in qid2besthit:
            norslt.append(no_result_row)
            continue

        # Look up the cluster of the best hit and fetch its stored alignment.
        # Values are bound as parameters rather than formatted into the SQL.
        try:
            curs.execute("select cluster_id from alignments where gene_id=%s", (qid2besthit[qid],))
            cluster_id = (curs.fetchone())[0]
            curs.execute("select gene_id, alignment from alignments where cluster_id=%s", (cluster_id,))
            alignment=curs.fetchall()
        except Exception:
            # Covers database errors and a best hit that has no row
            # (fetchone() returning None).
            norslt.append(no_result_row)
            continue

        # Scratch files are opened only once they are needed and closed
        # before MUSCLE reads them.
        with open(out1,'w') as fw_in1:
            for row in alignment:
                fw_in1.write('>'+row[0]+'\n'+row[1]+'\n')
        with open(out2,'w') as fw_in2:
            fw_in2.write('>'+qid_new+'\n'+seq)

        # Remove the previous query's output so that a failed run cannot be
        # mistaken for a fresh alignment.
        if os.path.exists(muscle_out):
            os.remove(muscle_out)
        status=call(["muscle", "-profile", "-in1", out1, "-in2", out2, "-out", muscle_out])
        if status != 0 or not os.path.exists(muscle_out):
            norslt.append(no_result_row)
            continue

        alignment=[]
        with open(muscle_out) as fp:
            for seq_record in SeqIO.parse(fp, "fasta"):
                alignment.append([seq_record.id, str(seq_record.seq)])
        try:
            rslt.append(calentropy.CalEntropy(qid_new,oriaa,pos,mutaa,alignment))
        except Exception:
            # Best effort: a query that cannot be scored is reported as '-'.
            norslt.append(no_result_row)
    return rslt, norslt


def main():
    """
    Run the whole pipeline for one uploaded query file.

    sys.argv[1] is the path of the query FASTA file. Its base name is either
    '<session>' or '<session>,<email>'; '<session>' (joined with the file's
    directory) is used as the prefix for all scratch and output files.

    Steps: run BLAST, connect to the database through calentropy.Connectdb,
    run Muscle, write the table to '<prefix>.txt' (scored rows sorted by the
    fifth column, descending, followed by the unscored rows), email the table
    through calentropy.SentEmail when an email address was given, then delete
    the scratch files.
    """
    if len(sys.argv) < 2:
        sys.exit("usage: %s <query fasta file>" % sys.argv[0])
    myfile=sys.argv[1]
    dirname=os.path.dirname(myfile)
    filename=os.path.basename(myfile)
    # Split on the first comma only; emailadd stays '' when there is none.
    usrsession, _, emailadd = filename.partition(',')
    prefix=os.path.join(dirname,usrsession)
    blastout=BLAST(myfile, prefix)
    curs=calentropy.Connectdb()
    result, noresult=Muscle(blastout, myfile, prefix,curs)

    outpath=prefix+'.txt'
    # NOTE(review): the header lists Position before Reference_AA, but the
    # no-result rows built in Muscle are ordered gene, original aa, position,
    # mutated aa -- confirm which order is intended.
    header=['Gene_id', 'Position', 'Reference_AA', 'Mutated_AA', 'FIS']
    with open(outpath,'w') as fw:
        fw.write('\t'.join(header)+'\n')
        for row in sorted(result, key=lambda x: float(x[4]), reverse=True):
            fw.write('\t'.join(row)+'\n')
        for row in noresult:
            fw.write('\t'.join(row)+'\n')
        fw.flush()
        os.fsync(fw.fileno())
    if emailadd!='': calentropy.SentEmail(emailadd,outpath)

    # Delete scratch files directly instead of through a shell: the prefix is
    # derived from an uploaded file name. Files that do not exist are skipped.
    # NOTE(review): when the file name carries an email suffix, prefix differs
    # from myfile, so the uploaded file itself is not removed -- confirm
    # whether that is intended.
    for suffix in ('.blastout', '.existingaln', '.muscleout', '.newseq', ''):
        try:
            os.remove(prefix+suffix)
        except OSError:
            pass

# Run the pipeline only when executed as a script, not when imported.
if __name__ == "__main__":
    main()

