#!/usr/bin/env python

import argparse
import genome
import numpy

def _str2bool(value):
    """Convert a command-line string into a real boolean.

    ``type=bool`` cannot be used with argparse for this purpose: ``bool()``
    of any non-empty string (including "False" or "0") is ``True``, so the
    flag could never be switched off by spelling out a false value.

    The empty string is accepted as false because that was the only input
    that produced ``False`` under the previous ``type=bool`` behaviour.

    Raises:
        argparse.ArgumentTypeError: if ``value`` is not a recognised
            true/false spelling.
    """
    if isinstance(value, bool):
        return value
    lowered = value.strip().lower()
    if lowered in ("true", "t", "yes", "y", "1"):
        return True
    if lowered in ("false", "f", "no", "n", "0", ""):
        return False
    raise argparse.ArgumentTypeError("expected a boolean value, got %r" % (value,))


parser = argparse.ArgumentParser(description="Calculates AT content for upstream, CDS, introns, and downstream sequences of a gene in a gtf")

parser.add_argument('--gtf', help="gtf file")
parser.add_argument('--genome', help="genome in fasta format")
parser.add_argument('--truncate_names', default=True, type=_str2bool, help="whether or not to truncate names in genome fasta def lines")
parser.add_argument('--binsize', default=200, type=int, help='size for smoothing bins')

args = parser.parse_args()

# Load the genome FASTA, then attach the annotation from the GTF file.
genome_in = genome.Genome(args.genome, truncate_names=args.truncate_names)
genome_in.read_gff(args.gtf)
# Number of bins every feature sequence is divided into.
binsize = args.binsize

def chunkIt(seq, num):
    """Split ``seq`` into ``num`` consecutive slices of near-equal length.

    Slice boundaries are computed with exact integer arithmetic
    (``i * len(seq) // num``). Accumulating a float step instead lets
    rounding error produce an extra trailing chunk or shift an element
    into the wrong chunk (e.g. a one-element sequence split ten ways).

    Args:
        seq: any sliceable sequence with a length (str, list, ...).
        num: number of chunks to produce; must be a positive integer.

    Returns:
        A list of exactly ``num`` slices that concatenate back to ``seq``.
        Slices may be empty when ``len(seq) < num``. An empty ``seq``
        yields an empty list.

    Raises:
        ValueError: if ``num`` is less than 1.
    """
    if num < 1:
        raise ValueError("num must be a positive integer, got %r" % (num,))
    total = len(seq)
    if total == 0:
        # Callers test for an empty result to skip missing features.
        return []
    bounds = [i * total // num for i in range(num + 1)]
    return [seq[bounds[i]:bounds[i + 1]] for i in range(num)]

# Feature classes, in the order their profiles are printed.
FEATURES = ("upstream", "cds", "intron", "downstream")
# Number of flanking bases taken on each side of a gene.
FLANK_LENGTH = 1000

# Per-bin running totals over all genes: A/T base counts and all-base counts.
at_dict = {feature: numpy.zeros(binsize) for feature in FEATURES}
all_dict = {feature: numpy.zeros(binsize) for feature in FEATURES}


for gene_name in genome_in.annotations.gene:
    seqs = {"upstream": "", "downstream": "", "cds": "", "intron": ""}
    gene = genome_in.annotations.gene[gene_name]
    coords = gene.get_coords()
    contig_seq = genome_in.genome_sequence[gene.seqid]

    # Flanking sequence on each side of the gene, clipped at the contig ends.
    leftseq = contig_seq[max(coords[0] - FLANK_LENGTH, 0):coords[0]]
    rightseq = contig_seq[coords[1]:min(coords[1] + FLANK_LENGTH, len(contig_seq))]

    if gene.strand == "+":
        seqs["upstream"], seqs["downstream"] = leftseq, rightseq
    elif gene.strand == "-":
        # On the minus strand the flanks swap roles, and they must also be
        # reversed so that bin 0 is the 5'-most bin for every gene; otherwise
        # minus-strand genes contribute a mirrored positional profile.
        # Complementing is unnecessary: A+T counts are unchanged by it.
        seqs["upstream"], seqs["downstream"] = rightseq[::-1], leftseq[::-1]
    # Genes with any other strand value contribute no flanking sequence.

    # First line of the per-CDS output is dropped; remaining lines are CDS segments.
    cds_seq_list = gene.get_fasta(seperate_cds = True).split('\n')[1:]
    # NOTE(review): the genomic output is used whole, unlike the CDS output
    # above -- confirm get_fasta(genomic=True) returns no header line and
    # uses the same orientation as the CDS segments.
    intron_seq = gene.get_fasta(genomic = True)
    # Remove each CDS segment once from the genomic sequence; what is left is
    # treated as intron sequence.
    for sequence in cds_seq_list:
        intron_seq = intron_seq.replace(sequence, "", 1)
    seqs['cds'] = "".join(cds_seq_list)
    seqs['intron'] = intron_seq

    for feature in FEATURES:
        seq_chunks = chunkIt(seqs[feature], binsize)
        # An empty feature yields no chunks and is skipped; only the first
        # ``binsize`` chunks are ever accumulated.
        for i, chunk in enumerate(seq_chunks[:binsize]):
            at_dict[feature][i] += sum(chunk.count(base) for base in "ATat")
            all_dict[feature][i] += len(chunk)


# Emit one AT fraction per line: all bins of each feature, in FEATURES order.
for feature in FEATURES:
    for i in range(binsize):
        total = all_dict[feature][i]
        # Bins that received no bases print "nan" instead of dividing by zero.
        # print(...) with a single argument behaves the same on Python 2 and 3.
        print(at_dict[feature][i] * 1.0 / total if total else float("nan"))
