import sys, time


def get_sequence(inp_coverage):
    """Build a lookup from identifier to sequence out of a tab-delimited table.

    Only lines whose raw text begins with "Solyc" are used; every other
    line is ignored. For a used line, surrounding whitespace is stripped,
    the line is split on tabs, and the first column becomes the key while
    the second column becomes the value. If a key occurs more than once,
    the last occurrence wins.

    Args:
        inp_coverage: Iterable of text lines (e.g. an open file).

    Returns:
        dict mapping first-column text to second-column text.

    Raises:
        IndexError: If a "Solyc" line has no second tab-separated column.
    """
    sequences = {}
    for raw_line in inp_coverage:
        if not raw_line.startswith("Solyc"):
            continue
        fields = raw_line.strip().split("\t")
        gene_id, sequence = fields[0], fields[1]
        sequences[gene_id] = sequence
    return sequences

def get_TIS(inp_coverage):
    """Group positions by gene from a tab-delimited listing.

    Lines whose raw text starts with "#" are treated as comments and
    skipped, as are blank / whitespace-only lines (which previously caused
    an IndexError). For every other line the first tab-separated column is
    split on "_": the first piece is the gene key and the second piece is
    parsed as an integer position.

    Args:
        inp_coverage: Iterable of text lines (e.g. an open file).

    Returns:
        dict mapping gene key to the list of integer positions, in the
        order they were encountered.

    Raises:
        IndexError: If a data line's first column contains no "_".
        ValueError: If the piece after the first "_" is not an integer.
    """
    tis_ge = {}
    for l1 in inp_coverage:
        stripped = l1.strip()
        # The comment test is deliberately done on the raw line, so only a
        # "#" in the very first column marks a comment.
        if not stripped or l1.startswith("#"):
            continue

        name_parts = stripped.split("\t")[0].split("_")
        ge = name_parts[0]
        # Original author's note: the position is 1-based.
        po_in = int(name_parts[1])
        tis_ge.setdefault(ge, []).append(po_in)
    return tis_ge



def get_data(tis_perge, ge_seq):
    """Collect the characters found at fixed offsets around each position.

    For every gene in ``tis_perge`` and every position ``p`` listed for it,
    the characters ``ge_seq[gene][p + k]`` for ``k`` in -7..5 are appended
    to the per-offset lists of the result. A position is used only when the
    whole 13-character window fits inside the sequence: ``p > 6`` keeps the
    lowest index non-negative, and ``p + 5 < len(sequence)`` keeps the
    highest index in range. Positions failing either test are skipped
    entirely, so all offset lists always have the same length.

    NOTE(review): get_TIS marks positions as 1-based, yet ``p`` is used
    here directly as a 0-based string index, so offset -1 lands on the
    1-based position itself — confirm this labelling is intended.

    Args:
        tis_perge: dict mapping gene key to a list of integer positions.
        ge_seq: dict mapping gene key to its sequence string.

    Returns:
        dict mapping each offset in -7..5 to the list of characters seen
        at that offset.

    Raises:
        KeyError: If a gene in ``tis_perge`` has no entry in ``ge_seq``.
    """
    offsets = list(range(-7, 6))
    data_range = {}
    for i in offsets:
        data_range[i] = []

    for ge_i, tis_l in tis_perge.items():
        seq_g = ge_seq[ge_i]
        seq_len = len(seq_g)
        for tis_i in tis_l:
            # Require the full window [tis_i - 7, tis_i + 5] to be inside
            # the sequence; the upper bound was previously unchecked and
            # raised IndexError after partially filling the lists.
            if tis_i > 6 and tis_i + 5 < seq_len:
                for ii in offsets:
                    data_range[ii].append(seq_g[tis_i + ii])

    return data_range


def write_da(final_da, output):
    """Write per-offset base fractions as a small tab-delimited table.

    The text written is, in order: one "position_<k>" cell (each followed
    by a tab) for every offset k in -7..5, then four rows labelled
    "A_base", "T_base", "C_base" and "G_base", each starting on a new line
    and holding one fraction per offset, and finally a closing newline.

    A fraction is the count of that exact uppercase letter divided by the
    number of entries collected for the offset. When an offset has no
    entries at all the fraction is written as 0.0 instead of raising
    ZeroDivisionError, so an input with no usable positions yields an
    all-zero table rather than aborting the run.

    Args:
        final_da: dict mapping each offset in -7..5 to a list of
            single-character strings.
        output: Object with a ``write(str)`` method (e.g. an open file).

    Raises:
        KeyError: If ``final_da`` lacks one of the offsets -7..5.
    """
    positions = list(range(-7, 6))
    pieces = ["position_%s\t" % (str(i)) for i in positions]

    # Row order (A, T, C, G) is part of the output format.
    for base in ("A", "T", "C", "G"):
        pieces.append("\n%s_base\t" % base)
        for i in positions:
            da_all = final_da[i]
            num_all = len(da_all)
            if num_all:
                fraction = float(da_all.count(base)) / num_all
            else:
                # No observations for this offset: report 0.0.
                fraction = 0.0
            pieces.append("%s\t" % (fraction))

    pieces.append("\n")
    output.write("".join(pieces))


# Command line: <script> <sequence_table> <TIS_file> [<TIS_file> ...]
#   argv[1]  tab-delimited table read by get_sequence
#   argv[2:] one or more files read by get_TIS; the output path is the
#            first of these with ".TIS_flankingsequences.atcg" appended.
if len(sys.argv) < 3:
    sys.exit(
        "usage: python %s <sequence_table> <TIS_file> [<TIS_file> ...]"
        % sys.argv[0]
    )

with open(sys.argv[1], "r") as inp_coverage:
    gene_seq = get_sequence(inp_coverage)

tis_paths = sys.argv[2:]

# The context manager guarantees the output is flushed and closed even if
# one of the input files fails to parse.
with open(tis_paths[0] + ".TIS_flankingsequences.atcg", "w") as output:
    output.write("#python %s\n" % (" ".join(sys.argv)))
    # NOTE(review): this header names four columns, but write_da emits a
    # position header followed by one row per base — confirm the intended
    # layout before relying on this line when parsing the file.
    output.write("#file_name\tA_base\tT_base\tC_base\tG_base\n")

    # Single-argument parenthesised form prints the same text on Python 2
    # and Python 3.
    print("input file number: %d" % len(tis_paths))
    for tis_path in tis_paths:
        with open(tis_path, "r") as inp_coverage:
            tis_pergene = get_TIS(inp_coverage)

        result_tis = get_data(tis_pergene, gene_seq)

        # Label the block with the file's base name (text after the last "/").
        name = tis_path.split("/")[-1]
        output.write("%s\t" % (name))
        write_da(result_tis, output)


        
