#!/usr/bin/env python
from GGR import utils
from collections import Counter 
import json
import pandas as pd
from sys import argv

# Default JSON destinations. The script historically always wrote its JSON
# output to these fixed locations; they are kept as defaults so existing
# invocations behave exactly as before, but can now be overridden on the
# command line to avoid clobbering the v22 files when another GTF is parsed.
DEFAULT_GENE_JSON = '/data/reddylab/Reference_Data/Gencode/v22/gencode.v22.gene_id_to_gene_type.json'
DEFAULT_TRANSCRIPT_JSON = '/data/reddylab/Reference_Data/Gencode/v22/gencode.v22.transcript_id_to_transcript_type.json'

USAGE = ('usage: %s <gencode.gtf> <gene_id_to_gene_type.tsv> '
         '<transcript_id_to_transcript_type.tsv> '
         '[gene_id_to_gene_type.json] [transcript_id_to_transcript_type.json]')


def json_output_paths(args):
    """Return the (gene JSON path, transcript JSON path) pair for *args*.

    *args* is an argv-style list. The optional 5th and 6th entries
    (``args[4]`` and ``args[5]``) override the gene and transcript JSON
    destinations respectively; when absent the historical default paths
    are used.
    """
    gene_json = args[4] if len(args) > 4 else DEFAULT_GENE_JSON
    transcript_json = args[5] if len(args) > 5 else DEFAULT_TRANSCRIPT_JSON
    return gene_json, transcript_json


def parse_gtf_types(lines):
    """Collect gene and transcript type mappings from GTF lines.

    *lines* is any iterable of text lines (e.g. an open file). Lines that
    start with ``#`` and blank lines are skipped. Every other line is handed
    to ``utils.gencode_gtf_line_parser``; its result is indexed by
    'annotation_type', and for 'gene' / 'transcript' records by the
    corresponding '<kind>_id' and '<kind>_type' keys. All other annotation
    types are ignored.

    Returns a ``(gene_id -> gene_type, transcript_id -> transcript_type)``
    pair of dicts. If an id occurs more than once, the last record wins.
    """
    gene_types = {}
    transcript_types = {}
    for line in lines:
        # NOTE(review): how the parser reacts to an empty line is not visible
        # from this file, so blank lines are skipped here rather than parsed.
        if line.startswith('#') or not line.strip():
            continue
        parsed = utils.gencode_gtf_line_parser(line)
        annotation_type = parsed['annotation_type']
        if annotation_type == 'gene':
            gene_types[parsed['gene_id']] = parsed['gene_type']
        elif annotation_type == 'transcript':
            transcript_types[parsed['transcript_id']] = parsed['transcript_type']
    return gene_types, transcript_types


def write_json(mapping, path):
    """Write *mapping* to *path* as JSON indented by four spaces."""
    with open(path, 'w') as f:
        json.dump(mapping, f, indent=4)


def mapping_to_frame(mapping, id_column, type_column):
    """Return a two-column DataFrame with one row per item of *mapping*.

    Keys go to *id_column* and values to *type_column*. Building the frame
    from ``items()`` keeps each key paired with its value in a single step
    and yields a frame with the proper columns even when *mapping* is empty.
    """
    return pd.DataFrame(list(mapping.items()), columns=[id_column, type_column])


def main(args):
    """Parse the GTF named in *args* and write the JSON and TSV mappings.

    *args* is an argv-style list: program name, input GTF, gene TSV output,
    transcript TSV output, then optionally the gene and transcript JSON
    outputs. Exits with a usage message when fewer than three positional
    arguments are supplied.
    """
    if len(args) < 4:
        prog = args[0] if args else 'script'
        raise SystemExit(USAGE % prog)

    gtf, gene_tsv, transcript_tsv = args[1:4]
    gene_json, transcript_json = json_output_paths(args)

    with open(gtf, 'r') as f:
        gene_types, transcript_types = parse_gtf_types(f)

    # Same output order as before: both JSON files first, then both TSVs.
    write_json(gene_types, gene_json)
    write_json(transcript_types, transcript_json)

    gene_frame = mapping_to_frame(gene_types, 'gene_id', 'gene_type')
    transcript_frame = mapping_to_frame(
        transcript_types, 'transcript_id', 'transcript_type')
    gene_frame.to_csv(gene_tsv, sep='\t', index=False)
    transcript_frame.to_csv(transcript_tsv, sep='\t', index=False)


if __name__ == '__main__':
    main(argv)