
def read_seq(k, og_pair):
    """Parse the masked pairwise alignment of gene ``k`` and its ortholog.

    Reads ``5_muscle/CDS/<og_pair[k]>.vs.<k>-muscle.txt.masked``, a FASTA-style
    file in which ``>`` lines start a record and the following lines hold the
    aligned row. The characters ``X`` and ``-`` are treated as non-residue
    columns; every other character is a residue.

    Args:
        k: accession of the query gene (second name in the file name).
        og_pair: dict mapping ``k`` to the accession of its ortholog.

    Returns:
        A 4-tuple ``(mu_po, original_po, mu_seq, or_seq)`` keyed by accession:
          * ``mu_po[acc][residue_index] -> alignment_column`` (both 0-based)
          * ``original_po[acc][alignment_column] -> residue_index``; columns
            holding ``X`` or ``-`` have no entry
          * ``mu_seq[acc]``: the complete aligned row
          * ``or_seq[acc]``: the row with ``X`` and ``-`` removed (no entry when
            the row contains no residues)

    Raises:
        IOError: if the alignment file cannot be opened.
        ValueError: if sequence data appears before the first ``>`` header.
    """
    path = "5_muscle/CDS/" + og_pair[k] + ".vs." + k + "-muscle.txt.masked"

    mu_po = {}
    original_po = {}
    aligned_chunks = {}   # acc -> list of aligned lines, joined at the end
    residue_chars = {}    # acc -> list of residue characters, joined at the end

    acc = None
    n_col = 0   # alignment columns consumed so far in the current record
    n_res = 0   # residues seen so far in the current record

    # The context manager guarantees the handle is closed, including on error.
    with open(path, "r") as muscle:
        for line in muscle:
            if line.startswith(">"):
                acc = line.strip()[1:]
                mu_po[acc] = {}
                original_po[acc] = {}
                # A repeated accession replaces the earlier record entirely so
                # the sequences stay consistent with the position maps.
                aligned_chunks.pop(acc, None)
                residue_chars.pop(acc, None)
                n_col = 0
                n_res = 0
                continue

            inf = line.strip()
            if not inf:
                # Blank lines carry no columns.
                continue
            if acc is None:
                raise ValueError("sequence data before first '>' header in " + path)

            # Rows may be wrapped over several lines; the counters keep running
            # across them, so the stored row must be the concatenation too.
            aligned_chunks.setdefault(acc, []).append(inf)
            residues = residue_chars.setdefault(acc, [])
            res_to_col = mu_po[acc]
            col_to_res = original_po[acc]
            for ch in inf:
                if ch != "X" and ch != "-":
                    res_to_col[n_res] = n_col
                    col_to_res[n_col] = n_res
                    residues.append(ch)
                    n_res += 1
                n_col += 1

    mu_seq = {name: "".join(parts) for name, parts in aligned_chunks.items()}
    or_seq = {name: "".join(chars) for name, chars in residue_chars.items() if chars}
    return mu_po, original_po, mu_seq, or_seq

def site_rate(TIS_g, TIS_g_og, k, mu_po, or_po, mu_seq, or_seq, og_pair, out, cds_gene_sl, cds_gene_AT):
    """Classify every TIS of gene ``k`` against its ortholog and write one row each.

    For each site the aligned start codon (3 columns) and a flanking window
    (up to 4 columns upstream through 4 columns downstream of the first codon
    column, clipped at the start of the alignment) are extracted from both rows.
    The site is then labelled:

      * ``TIS_conservated`` - the ortholog has a TIS at the residue aligned to
        this site;
      * ``TIS_compesatory`` - the ortholog has TIS sites, but none at the
        aligned residue;
      * ``TIS_unique``      - the ortholog has no TIS sites at all.

    Row layout (tab separated): site id, ortholog site id (or ortholog gene id
    when not conserved), ``cds_gene_sl[k]``, ``cds_gene_AT[ortholog]``, status,
    window identity in percent, codon of ``k``, codon of the ortholog, window of
    ``k``, window of the ortholog.

    Args:
        TIS_g: dict gene -> list of site ids ``"<pos>_<label>"`` where ``pos``
            is a 1-based residue position in the gene's unaligned sequence.
        TIS_g_og: same structure for the ortholog genes; a missing ortholog is
            treated as having no sites.
        k: gene to process.
        mu_po: ``mu_po[gene][residue_index] -> alignment_column`` (0-based).
        or_po: ``or_po[gene][alignment_column] -> residue_index`` (0-based).
        mu_seq: ``mu_seq[gene]`` -> aligned row.
        or_seq: unused; kept for interface compatibility.
        og_pair: dict gene -> ortholog gene.
        out: writable text stream receiving the rows.
        cds_gene_sl: dict gene -> value written as column 3.
        cds_gene_AT: dict ortholog gene -> value written as column 4.

    Raises:
        KeyError: if ``k``/its ortholog is missing from a required mapping, or a
            site position has no alignment column in ``mu_po[k]``.
    """
    og = og_pair[k]
    sites_sl = TIS_g[k]
    sites_at = TIS_g_og.get(og, [])

    # Ortholog sites indexed by their position string. This is the same for
    # every site of k, so it is built once instead of once per iteration.
    at_site_by_pos = {}
    for at_site in sites_at:
        at_site_by_pos[at_site.split("_")[0]] = at_site

    row_sl = mu_seq[k]
    row_at = mu_seq[og]
    col_of_residue = mu_po[k]

    for site in sites_sl:
        # print(x) with a single argument behaves the same on Python 2 and 3.
        print(site)
        tis_gene_id = k + "_" + site

        # 1-based residue position -> 0-based index -> alignment column.
        col = col_of_residue[int(site.split("_")[0]) - 1]

        align_sl_codon = row_sl[col:col + 3]
        align_at_codon = row_at[col:col + 3]

        # Flanking window, clipped so it never reaches before column 0.
        start = max(col - 4, 0)
        align_sl_seq = row_sl[start:col + 5]
        align_at_seq = row_at[start:col + 5]

        # Column-wise identity. zip() stops at the shorter window, so columns
        # missing from a short ortholog row count as mismatches instead of
        # raising IndexError.
        matches = sum(1 for a, b in zip(align_sl_seq, align_at_seq) if a == b)
        seq_si_per = matches * 100.0 / len(align_sl_seq)

        # 1-based residue position of the ortholog at this column; "-" when the
        # ortholog has no residue there (gap or masked).
        try:
            ori_po_at = str(or_po[og][col] + 1)
        except KeyError:
            ori_po_at = "-"

        if ori_po_at in at_site_by_pos:
            partner = og + "_" + at_site_by_pos[ori_po_at]
            status = "TIS_conservated"
        elif at_site_by_pos:
            partner = og
            status = "TIS_compesatory"
        else:
            partner = og
            status = "TIS_unique"

        out.write("%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\n" % (
            tis_gene_id, partner, cds_gene_sl[k], cds_gene_AT[og], status,
            seq_si_per, align_sl_codon, align_at_codon, align_sl_seq, align_at_seq))
         

def get_TIS(file, cds_po):
    """Load TIS sites per gene, with positions made relative to the CDS start.

    Only the first tab-separated column of each data line is used. It is split
    on ``_`` into ``<gene>_<position>_<label>``; the position is converted to
    ``position - cds_po[gene] + 1`` and the site id becomes
    ``"<relative position>_<label>"``. Lines starting with ``#`` and blank
    lines are skipped.

    NOTE(review): because the id is split on ``_``, gene names that themselves
    contain an underscore would be mis-parsed - confirm the input never has any.

    Args:
        file: path of the TIS table.
        cds_po: dict gene -> CDS start, in the same coordinates as the table.

    Returns:
        dict gene -> list of site ids, in file order.

    Raises:
        KeyError: if a gene in the table is missing from ``cds_po``.
    """
    ge = {}
    # The context manager closes the handle even when a line fails to parse.
    with open(file, "r") as in1:
        for l1 in in1:
            stripped = l1.strip()
            # "not stripped" also covers whitespace-only and "\r\n" lines,
            # which a plain comparison with "\n" would let through.
            if not stripped or l1.startswith("#"):
                continue
            fields = stripped.split("\t")[0].split("_")
            gene = fields[0]
            tis_po = str(int(fields[1]) - cds_po[gene] + 1)
            # setdefault is a single hash lookup; no scan over ge.keys().
            ge.setdefault(gene, []).append(tis_po + "_" + fields[2])
    return ge
        

def get_cds_start(file):
    """Load the CDS start position of every gene from a tab-separated table.

    Column 1 is the gene id and column 3 the (1-based) CDS start. Lines that
    start with ``#`` or with ``ID`` and blank lines are skipped.

    NOTE(review): the ``ID`` test presumably targets a header row, but it also
    drops any gene whose id begins with "ID" - confirm against the input.

    Args:
        file: path of the table.

    Returns:
        dict gene id -> CDS start as ``int``.
    """
    ge = {}
    # The context manager closes the handle even when a line fails to parse.
    with open(file, "r") as in1:
        for l1 in in1:
            stripped = l1.strip()
            # Whitespace-only lines are skipped too; they would otherwise fail
            # on the column-3 lookup below.
            if not stripped or l1.startswith(("#", "ID")):
                continue
            columns = stripped.split("\t")
            # 1-based position
            ge[columns[0]] = int(columns[2])
    return ge


import sys

# Usage: <script> OG_TABLE CDS_START_SL CDS_START_AT TIS_SL TIS_AT
# The result is written to "<TIS_SL>.conservation".

# Ortholog table: tab-separated, column 1 = AT gene, column 2 = SL gene.
# Comment lines, any line containing "pattern", and blank lines are skipped.
og_same = {}
with open(sys.argv[1], "r") as og_list:
    for l2 in og_list:
        if l2.startswith("#") or "pattern" in l2 or not l2.strip():
            continue
        ll2 = l2.strip().split("\t")
        og_same[ll2[1]] = ll2[0]

cds_s_gene_SL = get_cds_start(sys.argv[2])
cds_s_gene_AT = get_cds_start(sys.argv[3])

TIS_gene_SL = get_TIS(sys.argv[4], cds_s_gene_SL)
TIS_gene_AT = get_TIS(sys.argv[5], cds_s_gene_AT)

out_file = sys.argv[4] + ".conservation"
# The context manager flushes and closes the output even if a gene fails.
with open(out_file, "w") as out:
    out.write("#%s\n" % " ".join(sys.argv))
    # Header columns follow the order in which site_rate() writes each row:
    # SL site id, AT site id, SL value, AT value, status, ...
    out.write("#TIS_SL_id_relativeToCDS-5end\tTIS_AT_id_relativeToCDS-5end\t5UTR_lnegth_SL\t5UTR_lnegth_AT\tTIS_conservation_status\tflankingseqeince_similarity(%)\taligned_TIScodon_SL\taligned_TIScodon_AT\taligned_flankingseqeince_SL\taligned_flankingseqeince_AT\n")

    for k in TIS_gene_SL:
        # Direct dict membership; no list scan over og_same.keys().
        if k not in og_same:
            continue
        print(k)
        try:
            mu_po, ori_po, mu_seq, or_seq = read_seq(k, og_same)
        except IOError:
            # No readable alignment for this pair: report the gene again and
            # move on. Only the read is guarded, so a failure while writing
            # the output is not mistaken for a missing alignment.
            print(k)
            continue
        site_rate(TIS_gene_SL, TIS_gene_AT, k, mu_po, ori_po, mu_seq, or_seq, og_same, out, cds_s_gene_SL, cds_s_gene_AT)

