import sys, time
def get_sequence(inp_coverage):
    """Map the first column to the second for every "Solyc" line.

    Args:
        inp_coverage: Iterable of text lines (e.g. an open file). Only lines
            beginning with "Solyc" are used; each is stripped of surrounding
            whitespace and split on tabs.

    Returns:
        dict mapping column 1 to column 2 (both kept as strings). If the
        same key appears more than once, the last occurrence wins.

    Raises:
        IndexError: If a selected line has fewer than two tab-separated
            columns.
    """
    selected_rows = (
        line.strip().split("\t")
        for line in inp_coverage
        if line.startswith("Solyc")
    )
    return {columns[0]: columns[1] for columns in selected_rows}

def get_location(inp_coverage):
    """Map the first column to the third for every "Solyc" line.

    Args:
        inp_coverage: Iterable of text lines (e.g. an open file). Lines that
            do not begin with "Solyc" are ignored.

    Returns:
        dict mapping column 1 to column 3. The value is kept as the raw
        string; converting it to a number is left to the caller.

    Raises:
        IndexError: If a selected line has fewer than three tab-separated
            columns.
    """
    locations = {}
    for raw_line in inp_coverage:
        if not raw_line.startswith("Solyc"):
            continue
        columns = raw_line.strip().split("\t")
        locations[columns[0]] = columns[2]
    return locations


def get_TIS(inp_coverage):
    """Group the positions found in column 1 by their identifier prefix.

    Column 1 of each data line is expected to look like ``<id>_<position>``;
    the text before the first underscore is the grouping key and the text
    between the first and second underscore is parsed as an integer
    (1-based, per the original author's note).

    Args:
        inp_coverage: Iterable of text lines (e.g. an open file). Lines
            starting with "#" and blank/whitespace-only lines are skipped.

    Returns:
        dict mapping each identifier to the list of its integer positions,
        in the order they appear in the input.

    Raises:
        IndexError: If column 1 of a data line contains no underscore.
        ValueError: If the position part is not an integer.
    """
    tis_ge = {}
    for line in inp_coverage:
        if line.startswith("#"):
            continue
        stripped = line.strip()
        if not stripped:
            # A blank line (commonly the last line of a file) carries no
            # record; without this guard it would raise IndexError below.
            continue
        id_parts = stripped.split("\t")[0].split("_")
        ge = id_parts[0]
        # 1-based
        po_in = int(id_parts[1])
        tis_ge.setdefault(ge, []).append(po_in)
    return tis_ge



# Command-line driver.
#   argv[1]   table read by get_sequence (column 1 -> column 2)
#   argv[2]   table read by get_location (column 1 -> column 3)
#   argv[3:]  one or more files read by get_TIS
# For each file in argv[3:], the A/T/C/G composition of the first `utr`
# characters of every listed gene's sequence is pooled and written as
# fractions to "TIS_flankingsequences.atcg.background".
with open(sys.argv[1], "r") as inp_coverage:
    gene_seq = get_sequence(inp_coverage)

with open(sys.argv[2], "r") as inp_coverage:
    gene_lo = get_location(inp_coverage)

# Using a context manager ensures the report is flushed and closed even if a
# later input file is missing or malformed.
with open("TIS_flankingsequences.atcg.background", "w") as output:
    # Record the exact invocation as the first line of the report.
    output.write("#python %s\n" %(" ".join(sys.argv)))

    for tis_path in sys.argv[3:]:
        with open(tis_path, "r") as inp_coverage:
            tis_pergene = get_TIS(inp_coverage)

        # Pooled base counts over all genes listed in this file. Counting is
        # case-sensitive, so only upper-case A/T/C/G contribute.
        data_range = {"A": 0, "T": 0, "C": 0, "G": 0}
        for ge_i in tis_pergene:
            # Only the first `utr` characters of the gene's sequence are
            # counted; a gene missing from either table raises KeyError.
            utr = int(gene_lo[ge_i])
            seq_g = gene_seq[ge_i][0:utr]
            for base in data_range:
                data_range[base] += seq_g.count(base)

        # float() keeps the division below a true division on Python 2 too.
        total = float(
            data_range["A"] + data_range["T"] + data_range["C"] + data_range["G"]
        )
        if total:
            fractions = tuple(data_range[base] / total for base in "ATCG")
        else:
            # No countable bases (e.g. an empty input file): report the
            # fractions as undefined instead of aborting the whole run with
            # ZeroDivisionError.
            fractions = (float("nan"),) * 4

        name = tis_path.split("/")[-1]
        output.write("#file_background\tA_base\tT_base\tC_base\tG_base\n")
        output.write("%s\t%s\t%s\t%s\t%s\n" % ((name,) + fractions))


        
