import sys, time
from ast import literal_eval

def get_orf_info(nov_tis, ge_cds, ge_seq):
    """Describe the ORF that starts at a novel TIS on a gene sequence.

    Starting with the codon right after the one at ``nov_tis``, the sequence
    is read in steps of three until the first stop codon (``TAA``, ``TAG`` or
    ``TGA``; the comparison is case-sensitive).  If no stop codon is found,
    the ORF is taken to run to the end of ``ge_seq``.

    Args:
        nov_tis: 1-based position in ``ge_seq`` of the first base of the
            novel start codon.
        ge_cds: String of two integers joined by ``"_"``.  The first is used
            as the annotated TIS position and the second as the annotated CDS
            end, both on the same 1-based coordinates as ``nov_tis``.
        ge_seq: Sequence string to scan.

    Returns:
        A tuple ``(po_novel, region, inframed, overlapped, length, novel_seq)``:

        * ``po_novel`` -- ``"5UTR"`` if ``nov_tis`` is before the annotated
          TIS, ``"dTIS"`` if it is after it.
        * ``region`` -- ``"<nov_tis>_<end>"`` where ``end`` is the last base
          of the stop codon, or ``len(ge_seq)`` when there is no stop.
        * ``inframed`` -- ``"Yes"`` when the distance to the annotated TIS is
          a multiple of three, otherwise ``"NO"``.
        * ``overlapped`` -- for ``5UTR``: ``"Yes"``/``"NO"`` depending on
          whether the ORF reaches the annotated TIS; for ``dTIS``:
          ``"With_CDS"``, ``"3UTR_Extended"`` or ``"No_STOP"``.
        * ``length`` -- ``len(novel_seq)``.
        * ``novel_seq`` -- the ORF sequence, stop codon included when found.

    Raises:
        ValueError: If ``nov_tis`` equals the annotated TIS position; such a
            site is not a novel TIS and cannot be classified.
    """
    ge_aTIS, ge_stop = [int(part) for part in ge_cds.split("_")]

    # Fail with a clear message instead of an unbound-variable crash at the
    # return statement.
    if nov_tis == ge_aTIS:
        raise ValueError(
            "position error: novel TIS %s equals annotated TIS %s"
            % (nov_tis, ge_aTIS)
        )

    stop_codons = ("TAA", "TAG", "TGA")
    seq_len = len(ge_seq)

    # Defaults describe an ORF with no in-frame stop.  They are set before the
    # loop so they are also defined when the scan range is empty (the TIS
    # codon is the last complete codon of the sequence).
    stop_nu = seq_len
    novel_seq = ge_seq[nov_tis - 1:]
    region = "%s_%s" % (nov_tis, seq_len)

    # nov_tis is 1-based, so 0-based index nov_tis + 2 is the first base of
    # the codon following the start codon.
    for i in range(nov_tis + 2, seq_len, 3):
        if ge_seq[i:i + 3] in stop_codons:
            stop_nu = i + 1  # 1-based position of the stop codon's first base
            novel_seq = ge_seq[nov_tis - 1:stop_nu + 2]
            region = "%s_%s" % (nov_tis, stop_nu + 2)
            break

    # Last base of the stop codon; without a stop this is seq_len + 2, which
    # deliberately lies beyond the sequence.
    stop_end = stop_nu + 2

    if nov_tis < ge_aTIS:
        po_novel = "5UTR"
        overlapped = "Yes" if stop_end >= ge_aTIS else "NO"
    else:
        po_novel = "dTIS"
        if stop_end <= ge_stop:
            overlapped = "With_CDS"
        elif stop_end <= seq_len:
            overlapped = "3UTR_Extended"
        else:
            # stop_end can only exceed the sequence when no stop was found.
            overlapped = "No_STOP"

    inframed = "Yes" if abs(ge_aTIS - nov_tis) % 3 == 0 else "NO"

    return po_novel, region, inframed, overlapped, len(novel_seq), novel_seq


def _register_tis(tis_i, gene_id, positions, cds_start, cds_end, label,
                  num_ge_tis, dup_msg):
    """Add one ``<gene_id>_<position>`` entry to ``tis_i`` per position.

    An ID that is already present is reported on stdout with ``dup_msg`` and
    the existing entry is left untouched.
    """
    for pos in positions:
        tis_id = gene_id + "_" + pos
        if tis_id in tis_i:
            # Single-argument call form prints the same text under the
            # Python 2 print statement and the Python 3 print function.
            print("%s %s" % (dup_msg, tis_id))
        else:
            tis_i[tis_id] = [cds_start, cds_end, label, num_ge_tis]


def get_TIS(inp_coverage):
    """Collect the non-annotated TIS entries from a tab-separated TIS table.

    Lines starting with ``#`` and blank lines are skipped.  Every other line
    must have at least seven tab-separated columns:

    * column 0 -- gene ID,
    * columns 1 and 2 -- two values that are joined as ``"<col1>_<col2>"``
      and stored per gene,
    * columns 3-6 -- Python list literals (optionally wrapped in double
      quotes) of TIS positions for, in order, the 5'UTR, the annotated TIS,
      downstream TIS and the 3'UTR.  Positions must be strings because they
      are concatenated onto the gene ID.

    Args:
        inp_coverage: Iterable of text lines (e.g. an open file).

    Returns:
        A tuple ``(tis_i, an_tis_i)``:

        * ``tis_i`` maps ``"<gene>_<position>"`` to the list
          ``[col1, col2, location, n]`` where ``location`` is ``"5UTR"``,
          ``"dTIS"`` or ``"3UTR"`` and ``n`` is the total number of TIS
          listed on that line (all four columns) as a string.  Annotated-TIS
          positions count towards ``n`` but get no entry of their own.
        * ``an_tis_i`` maps gene ID to ``"<col1>_<col2>"``.
    """
    an_tis_i = {}
    tis_i = {}

    for line in inp_coverage:
        if line.startswith("#") or not line.strip():
            continue

        data = line.strip().split("\t")
        # literal_eval only accepts Python literals, so table content cannot
        # execute code.
        tis_5 = literal_eval(data[3].strip('"'))
        tis_aTI = literal_eval(data[4].strip('"'))
        tis_d = literal_eval(data[5].strip('"'))
        tis_3 = literal_eval(data[6].strip('"'))

        gene_id, cds_start, cds_end = data[0], data[1], data[2]
        an_tis_i[gene_id] = cds_start + "_" + cds_end
        num_ge_tis = str(len(tis_5) + len(tis_aTI) + len(tis_d) + len(tis_3))

        groups = (
            (tis_5, "5UTR", "ERROR: duplictes of TIS:5UTR"),
            (tis_d, "dTIS", "ERROR: duplictes of TIS"),
            (tis_3, "3UTR", "ERROR: duplictes of TIS"),
        )
        for positions, label, dup_msg in groups:
            _register_tis(tis_i, gene_id, positions, cds_start, cds_end,
                          label, num_ge_tis, dup_msg)

    return tis_i, an_tis_i

def get_sequence(inp_coverage):
    """Read gene sequences from a tab-separated table.

    Only lines whose text starts with ``"Solyc"`` or ``"AT"`` are used; for
    those, the first column becomes the key and the second column the value.
    A later line with the same first column replaces the earlier one.

    Args:
        inp_coverage: Iterable of text lines (e.g. an open file).

    Returns:
        Dict mapping first-column value to second-column value.
    """
    wanted_prefixes = ("Solyc", "AT")
    sequences = {}
    for line in inp_coverage:
        if not line.startswith(wanted_prefixes):
            continue
        fields = line.strip().split("\t")
        sequences[fields[0]] = fields[1]
    return sequences

# Command-line driver.
#   sys.argv[1]: TIS table parsed by get_TIS
#   sys.argv[2]: sequence table parsed by get_sequence
# The result is written next to the TIS table as "<sys.argv[1]>.seq_v2".
with open(sys.argv[1], "r") as inp_coverage:
    all_TIS, gene_tis_info = get_TIS(inp_coverage)

with open(sys.argv[2], "r") as inp_coverage:
    gene_seq = get_sequence(inp_coverage)

with open(sys.argv[1] + ".seq_v2", "w") as output:
    # First line records the exact command used to produce the file.
    output.write("#python %s\n" % (" ".join(sys.argv)))
    output.write("#TIS_ID\tCDS_start\tCDS_eds\tTIS_location\tTIS_number_onGene\tlocation\tnew_ORF_region\tlInframed_toATIS\toverlapped\tslength\tSeq\n")
    for tis_ii, tis_fields in all_TIS.items():
        # TIS IDs are built as "<gene>_<position>"; split on the LAST
        # underscore so gene IDs that themselves contain "_" still parse.
        gene_id, tis_pos = tis_ii.rsplit("_", 1)
        id_dd = get_orf_info(int(tis_pos), gene_tis_info[gene_id], gene_seq[gene_id])
        output.write("%s\t%s\t%s\n" % (tis_ii, "\t".join(tis_fields), "\t".join(map(str, id_dd))))
