"""Module to compare GTF files and anotate each transcripts 
with the number of time it was found in the GTF"""

import tqdm
import re

# Pre-compiled patterns applied to the attribute field of a GTF line.
# Each one captures, non-greedily, the text between the double quotes that
# follow the attribute name.
gene_id_reg = re.compile('gene_id "(.*?)"')
transcript_id_reg = re.compile('transcript_id "(.*?)"')
exon_number_reg = re.compile('exon_number "(.*?)"')

class Exon():
    """One exon record parsed from a single tab-separated GTF line.

    Attributes:
        chrom: first field of the line.
        start, end: fourth and fifth fields, converted to int.
        strand: seventh field.
        gene_id, transcript_id: values captured from the ninth (attribute)
            field.
        exon_number: value captured from the attribute field, or None when
            no quoted ``exon_number`` attribute is present.
    """

    # Two exons are considered the same when both their starts and their
    # ends differ by strictly less than this many bases.
    MATCH_TOLERANCE = 100

    # Equality is a fuzzy, non-transitive match, so no hash can be consistent
    # with it; instances are deliberately unhashable.
    __hash__ = None

    def __init__(self, gtf_line):
        """Build an Exon from a GTF line.

        Args:
            gtf_line: one line of a GTF file (at least nine tab-separated
                fields).

        Raises:
            IndexError: if the line has fewer than nine fields.
            ValueError: if the start or end field is not an integer.
            AttributeError: if ``gene_id`` or ``transcript_id`` cannot be
                found in the attribute field.
        """
        sp = gtf_line.split('\t')
        attr = sp[8]
        self.gene_id = gene_id_reg.search(attr).group(1)
        self.transcript_id = transcript_id_reg.search(attr).group(1)
        # exon_number is tolerated as missing rather than crashing the parse.
        exon_number_match = exon_number_reg.search(attr)
        self.exon_number = (exon_number_match.group(1)
                            if exon_number_match else None)
        self.chrom = sp[0]
        self.start = int(sp[3])
        self.end = int(sp[4])
        self.strand = sp[6]

    def __str__(self):
        """Return ``chrom:start-end(strand)`` followed by the gene id,
        transcript id and exon number, tab-separated."""
        p = "{c}:{s}-{e}({strand})\t{gid}\t{tid}\t{exon_number}".format(
            c=self.chrom,
            s=self.start,
            e=self.end,
            strand=self.strand,
            gid=self.gene_id,
            tid=self.transcript_id,
            exon_number=self.exon_number)
        return p

    def __repr__(self):
        """Same as ``str()`` but with spaces instead of tabs."""
        return self.__str__().replace('\t', ' ')

    def __eq__(self, other):
        """Fuzzy comparison: same chromosome, and both start and end within
        ``MATCH_TOLERANCE`` bases. Strand and ids are not compared.

        Returns NotImplemented for non-Exon operands so that Python falls
        back to its default comparison instead of raising AttributeError.
        """
        if not isinstance(other, Exon):
            return NotImplemented
        tolerance = self.MATCH_TOLERANCE
        return (self.chrom == other.chrom
                and abs(self.start - other.start) < tolerance  # start within tolerance
                and abs(self.end - other.end) < tolerance)  # end within tolerance
        

class Transcript():
    """A transcript, represented as the list of exons added to it.

    Attributes:
        exons: exons in the order they were added.
        transcript_id: id copied from the first exon ('' until then).
        n: number of exons.
        found: occurrence counter, initialised to 1 (count_transcripts
            overwrites it).
        chrom, start, end, strand, gene_id: only exist once the first exon
            has been added; start/end span all exons added so far.
    """

    # Equality relies on the fuzzy Exon comparison, so instances are
    # deliberately unhashable.
    __hash__ = None

    def __init__(self):
        self.exons = []
        self.transcript_id = ''
        self.n = len(self.exons)
        self.found = 1

    def __str__(self):
        """Return a header line followed by one line per exon."""
        tr_p = "Transcript {tid} with {n} exons:\n".format(
            tid=self.transcript_id,
            n=self.n)
        exons_p = "\n".join([str(ex) for ex in self.exons])
        return tr_p + exons_p

    def __repr__(self):
        """Return ``transcript_id|exon_count``."""
        return "{tid}|{n}".format(tid=self.transcript_id,
                                  n=self.n)

    def __getitem__(self, key):
        """Index or slice the exon list."""
        return self.exons[key]

    def __iadd__(self, exon):
        """Add an exon to the Transcript (``tr += exon``).

        The first exon defines chrom, strand, gene_id and transcript_id;
        every later exon only widens the [start, end] span. Returns self.
        """
        if not self.exons:
            self.chrom = exon.chrom
            self.start = exon.start
            self.end = exon.end
            self.strand = exon.strand
            self.gene_id = exon.gene_id
            self.transcript_id = exon.transcript_id
        else:
            # Exons may arrive in descending coordinate order, so both
            # bounds have to be tracked, not just the end.
            self.start = min(self.start, exon.start)
            self.end = max(self.end, exon.end)
        self.exons.append(exon)
        self.n = len(self.exons)
        return self

    def __eq__(self, other):
        """Two transcripts match when they have the same number of exons and
        every pair of exons at the same position compares equal.

        Returns NotImplemented for non-Transcript operands.
        """
        if not isinstance(other, Transcript):
            return NotImplemented
        return (self.n == other.n
                and all(ex_s == ex_o
                        for (ex_s, ex_o) in zip(self.exons, other.exons)))

    def to_bed(self):
        """Return a tab-separated line (no trailing newline) with chrom,
        start, end, transcript_id, exon count and strand.

        Raises:
            AttributeError: if no exon has been added yet.
        """
        # NOTE(review): coordinates are written exactly as stored. BED is
        # conventionally 0-based half-open while GTF is 1-based inclusive;
        # confirm whether start should be shifted by one here.
        return "\t".join([self.chrom, str(self.start), str(self.end),
                          self.transcript_id, str(self.n), self.strand])


def get_transcr_from_gtf(filename, max_transcripts=100):
    """Parse a GTF file into a list of Transcript objects.

    A line whose third field is ``transcript`` starts a new Transcript;
    every following line whose third field is ``exon`` is added to it.
    Comment lines (starting with ``#``), blank lines and other feature types
    are ignored.

    Args:
        filename: path of the GTF file to read.
        max_transcripts: stop reading once this many transcripts have been
            collected. Pass None to read the whole file.

    Returns:
        List of Transcript objects in file order; every returned transcript
        carries all of its exons (reading stops before a new one is started).

    Raises:
        ValueError: if an exon line appears before any transcript line.
    """
    my_transcripts = []
    current = None
    # The context manager guarantees the file handle is closed, including
    # when the cap is hit or a malformed line raises.
    with open(filename) as gtf:
        for line_no, line in enumerate(gtf, start=1):
            if line.startswith('#') or not line.strip():
                continue
            feature = line.split('\t')[2]
            if feature == 'transcript':
                # Check the cap before creating the next transcript so the
                # result never ends with an empty, exon-less Transcript.
                if (max_transcripts is not None
                        and len(my_transcripts) >= max_transcripts):
                    break
                current = Transcript()
                my_transcripts.append(current)
            elif feature == 'exon':
                if current is None:
                    raise ValueError(
                        "{f}, line {n}: exon record found before any "
                        "transcript record".format(f=filename, n=line_no))
                current += Exon(line)
    return my_transcripts


def count_transcripts(transcripts_ls):
    """Store on each transcript how many entries of the list equal it.

    Every element is compared against the whole list (itself included) with
    ``==`` and the tally is written to its ``found`` attribute. Progress is
    reported through tqdm. Nothing is returned.
    """
    for candidate in tqdm.tqdm(transcripts_ls):
        candidate.found = sum(
            1 for other in transcripts_ls if candidate == other)

        
def main(gtf_path='/home/jduc/Desktop/laia/transpoc/all_chim.gtf'):
    """Parse a GTF file, count duplicated transcripts and print a preview.

    Args:
        gtf_path: GTF file to process; defaults to the path that was
            previously hard-coded.

    Prints the first five transcripts, then the ``found`` count of each of
    them (fewer when the file yields fewer than five transcripts).
    """
    transcripts = get_transcr_from_gtf(gtf_path)
    print(transcripts[:5])
    count_transcripts(transcripts)
    # Slicing instead of indexing 0..4 avoids an IndexError on short inputs.
    for tr in transcripts[:5]:
        print(tr.found)
 
# Call main() only when the file is executed as a script, not on import.
if __name__ == "__main__":
    main()

