#!/usr/bin/env python
import pandas as pd
from GGR import utils
from sys import argv

# Usage: <script> GTF GENE_ID_TO_GENE_TYPE_TSV OUT_PREFIX
#
# Reads a GTF file and, for every record whose annotation type is 'gene',
# writes a 1-bp, 4-column BED record (chrom, start, end, gene_id) for the
# gene's TSS and for its TES.  Each record is written both to a per-gene-type
# file ('<OUT_PREFIX><gene_type>.TSS.bed' / '.TES.bed') and to a combined
# file ('<OUT_PREFIX>TSS.bed' / '<OUT_PREFIX>TES.bed').
gtf = argv[1]
gene_id_to_gene_type_df = argv[2]
out_prefix = argv[3]

# Tab-separated table that must provide 'gene_id' and 'gene_type' columns.
gene_id_to_gene_type_df = pd.read_csv(gene_id_to_gene_type_df, sep='\t')
gene_id_to_gene_type = dict(zip(gene_id_to_gene_type_df.gene_id,
                                gene_id_to_gene_type_df.gene_type))

# One pair of output files is created per distinct gene type in the table,
# even if no gene of that type is ever seen in the GTF.
gene_types = sorted(set(gene_id_to_gene_type.values()))

# Every output handle is registered here so that all of them are closed (and
# therefore flushed) even when parsing fails part-way through the GTF.
open_handles = []


def _open_out(path):
    """Open *path* for writing and register the handle for final cleanup."""
    fh = open(path, 'w')
    open_handles.append(fh)
    return fh


def _bed_line(chrom, start, gene_id):
    """Return a 1-bp BED record covering [start, start + 1) named *gene_id*."""
    return '\t'.join([chrom, str(start), str(start + 1), gene_id]) + '\n'


try:
    fh_TSS_by_gene_type = {gene_type: _open_out('%s%s.TSS.bed' % (out_prefix, gene_type))
                           for gene_type in gene_types}
    fh_TES_by_gene_type = {gene_type: _open_out('%s%s.TES.bed' % (out_prefix, gene_type))
                           for gene_type in gene_types}

    fh_TSS_all = _open_out('%sTSS.bed' % (out_prefix))
    fh_TES_all = _open_out('%sTES.bed' % (out_prefix))

    with open(gtf, 'r') as f:
        for i, line in enumerate(f):
            # Progress indicator; print(i) behaves the same on Python 2 and 3.
            if i % 10000 == 0:
                print(i)
            # Skip header/comment lines only.  Testing for '#' anywhere in the
            # line would also discard real records that merely contain '#'.
            if line.startswith('#'):
                continue
            parsed = utils.gencode_gtf_line_parser(line)
            if parsed['annotation_type'] != 'gene':
                continue

            # NOTE(review): the arithmetic below assumes the parser returns
            # BED-style numeric chromStart/chromEnd -- confirm against
            # utils.gencode_gtf_line_parser.
            strand = parsed['strand']
            if strand == '+':
                tss_start = parsed['chromStart']
                tes_start = parsed['chromEnd']
            elif strand == '-':
                tss_start = parsed['chromEnd'] - 1
                tes_start = parsed['chromStart'] - 1
            else:
                # Records with any other strand value produce no output.
                continue

            tss_record = _bed_line(parsed['chrom'], tss_start, parsed['gene_id'])
            tes_record = _bed_line(parsed['chrom'], tes_start, parsed['gene_id'])

            # A gene type that is absent from the mapping table raises
            # KeyError here; the finally clause still closes all outputs.
            gene_type = parsed['gene_type']
            fh_TSS_by_gene_type[gene_type].write(tss_record)
            fh_TSS_all.write(tss_record)
            fh_TES_by_gene_type[gene_type].write(tes_record)
            fh_TES_all.write(tes_record)
finally:
    for fh in open_handles:
        fh.close()