#!/usr/bin/env python
import os
import sys
import re
import argparse
import math
import collections



class Gene:
    """All annotation entries sharing one gene id, plus the span they cover.

    Attributes:
        geneid: identifier passed in by the caller.
        gtfs:   the entries of this gene; each must expose ``feat``,
                ``start`` and ``end``.
        start:  smallest ``start`` among the entries (never above 1000000000,
                which is also the value for an empty ``gtfs``).
        end:    largest ``end`` among the entries (never below 0, which is
                also the value for an empty ``gtfs``).
    """

    def __init__(self,geneid,gtfs):
        self.geneid=geneid
        self.gtfs=gtfs

        # Single pass over the entries, starting from the fixed bounds.
        lowest,highest=1000000000,0
        for entry in gtfs:
            lowest=min(lowest,entry.start)
            highest=max(highest,entry.end)
        self.start=lowest
        self.end=highest

    def get_feat(self,feat):
        """Return the entries whose feature type equals ``feat``, in input order."""
        return [entry for entry in self.gtfs if entry.feat==feat]

    def get_code(self):
        """Return a dict mapping every position start..end (inclusive) to a code.

        Every position begins as 1; positions covered by a feature are then
        overwritten in this order, so a later feature wins over an earlier one:
        exon -> 2, three_prime_UTR -> 3, five_prime_UTR -> 4, CDS -> 5.
        """
        codes=dict.fromkeys(range(self.start,self.end+1),1)

        paint_order=(("exon",2),("three_prime_UTR",3),("five_prime_UTR",4),("CDS",5))
        for feature,code in paint_order:
            for entry in self.get_feat(feature):
                for pos in range(entry.start,entry.end+1):
                    codes[pos]=code

        return codes
              
        
        



def get_genelist(gtfes):
    """Group annotation entries by ``geneid`` and build one Gene per group.

    gtfes: iterable of entries exposing a ``geneid`` attribute.
    Returns a list of Gene objects, one per distinct gene id; each Gene
    receives the list of entries that carried that id, in input order.
    """
    by_gene=collections.defaultdict(list)
    for entry in gtfes:
        by_gene[entry.geneid].append(entry)

    return [Gene(gid,members) for gid,members in by_gene.items()]



class GTFentry:
    """Plain record for one annotation line.

    The constructor stores its five arguments unchanged:
        chr    - sequence (chromosome) name
        feat   - feature type
        start  - start coordinate
        end    - end coordinate
        geneid - identifier of the gene the feature belongs to
    """

    def __init__(self,chr,feat,start,end,geneid):
        # where the feature lies
        self.chr,self.start,self.end=chr,start,end
        # what the feature is and which gene owns it
        self.feat,self.geneid=feat,geneid

    
def load_gtf(file,euchr):
    """Read a GTF file and return its entries grouped by chromosome.

    Example of the expected tab-separated input:

    3L	Cufflinks	exon	13569022	13571086	566	-	.	gene_id "FBgn0026376"; transcript_id "CUFF.6507.2"; exon_number "1"; parent_feature "mRNA";

    file:  path of the GTF file.
    euchr: container of chromosome names to keep; lines whose first column
           is not in it (this includes comment and blank lines) are skipped.

    Returns a defaultdict mapping chromosome name -> list of GTFentry, in
    file order. Columns 3, 4, 5 and 9 provide feature type, start, end and
    the attribute string the gene id is taken from.

    A kept line with fewer than nine columns raises IndexError and a
    non-integer coordinate raises ValueError.
    """
    # gene_id may appear anywhere in the attribute column, so look it up by
    # name instead of relying on it being the first attribute.
    gene_id_re=re.compile(r'(?:^|[;\s])gene_id\s+"([^"]*)"')

    gtfh=collections.defaultdict(list)
    # The with-block guarantees the handle is closed, also on parse errors.
    with open(file) as handle:
        for l in handle:
            a=l.rstrip("\n").split("\t")
            chr  =a[0]
            if chr not in euchr:
                continue
            feat =a[2]
            start=int(a[3])
            end  =int(a[4])
            attributes=a[8]
            m=gene_id_re.search(attributes)
            if m:
                geneid=m.group(1)
            else:
                # No quoted gene_id attribute found: keep the historical
                # behaviour (second space-separated token, minus its first
                # character and its last two).
                geneid=attributes.split(" ")[1][1:-2]
            gtfh[chr].append(GTFentry(chr,feat,start,end,geneid))
    return gtfh


# Command-line interface: the only option is the annotation file.
parser = argparse.ArgumentParser(description="""           
Description
-----------
Summary statistics
""",formatter_class=argparse.RawDescriptionHelpFormatter,
epilog="""
""")
parser.add_argument("--gtf", type=str, required=True, dest="gtf", default=None, help="annotation" )
args = parser.parse_args()

# Meaning of the codes produced by Gene.get_code(); code 0 (not listed) is
# used below for positions that no gene covers.
trans={1:"intron",2:"exon",3:"3p",4:"5p",5:"cds"}

# Chromosomes taken into account and their lengths in bases.
euchr={"X":20638010,"2L":21106324,"2R":18996377,"3L":22263419,"3R":26981232,"4":1107140}

gtfchrh=load_gtf(args.gtf,euchr)
# stat[c] = number of bases whose highest code is c (index 0 = no gene).
stat=[0,0,0,0,0,0]

for chrom in euchr:
    chrlen=euchr[chrom]
    genelist=get_genelist(gtfchrh[chrom])

    # Per position, keep the highest code over all genes covering it.
    tr={}
    for g in genelist:
        for k,v in g.get_code().items():
            if v>tr.get(k,0):
                tr[k]=v

    # Count the covered positions directly instead of walking every base of
    # the chromosome; positions outside 1..chrlen are ignored and everything
    # not covered is counted under code 0.
    annotated=0
    for k,v in tr.items():
        if 1<=k<=chrlen:
            stat[v]+=1
            annotated+=1
    stat[0]+=chrlen-annotated

# Last column is the total number of bases considered.
stat.append(sum(stat))
# Parenthesised single argument: same output on Python 2, valid on Python 3.
print("\t".join([str(i) for i in stat]))
            
        
    
    
    
    
    
    
    



