# Reorder a GTF file: each gene line is followed by its transcripts and their features.

import sys

# Read the input GTF and group its lines by gene_id.
def _gene_id_of(attributes):
    """Return the gene_id value from a GTF attributes column, or None.

    The column is split on ';' (not '; ') so that a gene_id which is the last
    attribute on the line is parsed without its trailing semicolon and line
    ending. Surrounding double quotes are removed from the value.
    """
    for attribute in attributes.split(";"):
        key, _, value = attribute.strip().partition(" ")
        if key == "gene_id":
            return value.strip().strip('"')
    return None


def read_gtf():
    """Read the GTF named by sys.argv[1] and group its records by gene_id.

    Blank lines and lines starting with '#' are skipped. Every other line is
    split on tabs; the resulting field lists are returned unchanged, so the
    last field of each record still carries the original line ending.

    Returns:
        A tuple (gene_list, gene_feature_dict, other_features_dict) where
        gene_list holds the gene_id of every "gene" record in file order,
        gene_feature_dict maps gene_id to the fields of its "gene" record,
        and other_features_dict maps gene_id to the list of field lists of
        all its non-gene records, in file order.

    Raises:
        ValueError: if a record has fewer than nine tab-separated columns or
            its attributes column contains no gene_id.
    """
    gtf_file = sys.argv[1]
    gene_list = []
    gene_feature_dict = {}
    other_features_dict = {}
    with open(gtf_file, 'r') as gtf:
        for line_number, line in enumerate(gtf, start=1):
            stripped = line.strip()
            # Header/comment lines and blank lines carry no feature record.
            if not stripped or stripped.startswith("#"):
                continue
            fields = line.split("\t")
            if len(fields) < 9:
                raise ValueError(
                    "%s line %d: expected 9 tab-separated columns, found %d"
                    % (gtf_file, line_number, len(fields))
                )
            # Parsed fresh for every record so an ID from an earlier line can
            # never be reused by accident.
            gene_id = _gene_id_of(fields[8])
            if gene_id is None:
                raise ValueError(
                    "%s line %d: no gene_id attribute found"
                    % (gtf_file, line_number)
                )
            if fields[2] == "gene":
                gene_feature_dict[gene_id] = fields
                gene_list.append(gene_id)
            else:
                other_features_dict.setdefault(gene_id, []).append(fields)
    return gene_list, gene_feature_dict, other_features_dict


# Write each gene followed by its transcripts, with every transcript's features in a fixed order.
# Feature types written for each transcript, mapped to their output position.
_FEATURE_RANK = {
    "transcript": 0,
    "exon": 1,
    "CDS": 2,
    "start_codon": 3,
    "stop_codon": 4,
    "five_prime_utr": 5,
    "three_prime_utr": 6,
}


def _transcript_id_of(attributes):
    """Return the transcript_id value from a GTF attributes column, or None.

    Splitting on ';' (not '; ') keeps a transcript_id that is the last
    attribute on the line free of its trailing semicolon and line ending.
    """
    for attribute in attributes.split(";"):
        key, _, value = attribute.strip().partition(" ")
        if key == "transcript_id":
            return value.strip().strip('"')
    return None


def _transcript_sort_key(transcript_id):
    """Build a key that orders dotted IDs numerically, part by part.

    Numeric parts compare as integers, so PB.1.2 sorts before PB.1.10;
    non-numeric parts (such as the "PB" prefix) compare as text and sort
    after numeric parts in the same position.
    """
    return [
        (0, int(part), "") if part.isdecimal() else (1, 0, part)
        for part in transcript_id.split(".")
    ]


def _ordered_transcript_records(records):
    """Return one transcript's records arranged by _FEATURE_RANK.

    Records of the same feature type keep their input order. If several
    "transcript" records are present only the last one is kept. Records whose
    feature type is not listed in _FEATURE_RANK are left out.
    """
    buckets = {}
    for record in records:
        rank = _FEATURE_RANK.get(record[2])
        if rank is None:
            continue
        if rank == 0:
            buckets[rank] = [record]
        else:
            buckets.setdefault(rank, []).append(record)
    return [record for rank in sorted(buckets) for record in buckets[rank]]


def _format_record(fields):
    """Join tab-split fields back into one newline-terminated line."""
    line = "\t".join(fields)
    # A final input line without a line ending must not fuse with the next
    # record that is written after it.
    return line if line.endswith("\n") else line + "\n"


def sort_isoforms():
    """Write the GTF returned by read_gtf() to sys.argv[2] in gene order.

    For every gene in gene_list the "gene" record is written first, followed
    by that gene's transcripts in ascending numeric ID order. Within each
    transcript the records are written as: transcript, exon, CDS, start_codon,
    stop_codon, five_prime_utr, three_prime_utr.

    NOTE: the output file is opened in append mode, so existing content in
    it is kept and new records are added after it.
    NOTE: only genes present in gene_list are written; records of any other
    feature type than the seven listed above are not written.

    Raises:
        ValueError: if a non-gene record has no transcript_id attribute.
    """
    gene_list, gene_feature_dict, other_features_dict = read_gtf()
    output = sys.argv[2]
    with open(output, 'a') as out:
        for gene in gene_list:
            out.write(_format_record(gene_feature_dict[gene]))
            # Group this gene's records by transcript; a gene without any
            # child records simply yields no transcripts.
            transcripts = {}
            for record in other_features_dict.get(gene, []):
                transcript_id = _transcript_id_of(record[8])
                if transcript_id is None:
                    raise ValueError(
                        "gene %s: %s record has no transcript_id attribute"
                        % (gene, record[2])
                    )
                transcripts.setdefault(transcript_id, []).append(record)
            # Sorting the dict keys gives unique IDs in a deterministic,
            # numerically correct order.
            for transcript_id in sorted(transcripts, key=_transcript_sort_key):
                for record in _ordered_transcript_records(transcripts[transcript_id]):
                    out.write(_format_record(record))
# Run only when executed as a script, so importing this module does not read
# sys.argv or write any file.
if __name__ == "__main__":
    sort_isoforms()
