#creating partial gtf for isoseq data
#this will only include gene and transcript features
#gtf format will have 9 columns = chr num, source, feature, start, end, score, strand, frame, attributes
#source will be PacBio, attributes will include (when applicable): gene id, transcript id, gene source, transcript source, gene biotype, transcript biotype, exon number, gene name
#if a value is not available, this will show up as "." if only value in column or be left blank if in the attribute section
#there are 150 isoforms that are a bit weird and will be handled manually; will remove these from this script
#will format each feature separately
#to run script: python3 Create.Partial.Isoseq.GTF.py <source as string; in this case all sources will be PacBio> <classification file for combined analyses> <bed file with transcript start and end positions> <full Ensembl GTF> <weird isoforms text file> <output gtf>
#Author: Alice Naftaly, June 2020

import sys

#read classification file
#returns dictionary with key = isoform and value = [isoform, chr_num, strand, gene id, transcript id, protein coding]
def read_class():
    """Load the classification table whose path is given in sys.argv[2].

    Only rows that begin with "PB" are used; every other row is ignored.
    Rows are split on tabs and the 0-based columns 0, 1, 2, 6, 7 and 27 are
    kept, in that order.

    Returns:
        dict mapping isoform id -> [isoform, chr_num, strand, gene_id,
        transcript_id, protein_coding]. If an isoform id occurs more than
        once, the last row wins.
    """
    #0-based columns: isoform, chr num, strand, gene id, transcript id, coding status
    wanted_columns = (0, 1, 2, 6, 7, 27)
    records = {}
    with open(sys.argv[2], 'r') as handle:
        for row in handle:
            if not row.startswith("PB"):
                continue
            fields = row.split("\t")
            entry = [fields[position] for position in wanted_columns]
            records[entry[0]] = entry
    return records

#read bed file with all transcript start and stop positions
#returns dictionary with key == isoform and value == [isoform, chr_num, start_pos, end_pos, strand]
def read_bed():
    """Load the transcript position file whose path is given in sys.argv[3].

    Each line is split on whitespace; columns 0, 1, 2, 3 and 5 (0-based) are
    read as chr num, start, end, isoform id and strand. Positions are kept as
    the strings found in the file.

    Returns:
        dict mapping isoform id -> [isoform, chr_num, start_pos, end_pos,
        strand]. If an isoform id occurs more than once, the last line wins.
    """
    intervals = {}
    with open(sys.argv[3], 'r') as handle:
        for row in handle:
            cols = row.split()
            chrom, begin, finish, name = cols[0], cols[1], cols[2], cols[3]
            orientation = cols[5]
            intervals[name] = [name, chrom, begin, finish, orientation]
    return intervals

#read in ensembl GTF
#will read in the ensembl gtf in a few ways
#this returns chr num, gene id, and gene name if available
def read_ensembl_gtf_genes():
    """Collect chr num, gene id and gene name for every "gene" record in the GTF.

    The GTF path is taken from sys.argv[4]. Header/comment lines (starting
    with "#") and lines with fewer than nine tab-separated columns are
    skipped instead of raising IndexError.

    Returns:
        dict mapping gene id -> list of [chr_num, gene_id, gene_name], one
        entry per gene record with that id. gene_name is "." when the record
        has no gene_name attribute.
    """
    gtf_file = sys.argv[4]
    gene_gtf_dict = {}
    with open(gtf_file, 'r') as gtf:
        for line in gtf:
            if line.startswith("#"):
                continue
            new_line = line.rstrip("\n").split("\t")
            if len(new_line) < 9 or new_line[2] != "gene":
                continue
            chr_num = new_line[0]
            #reset for every record so a gene without a gene_name attribute
            #does not inherit the name parsed from an earlier gene
            gene_id = None
            gene_name = ""
            for value in new_line[8].split(";"):
                #attributes look like: key "value"; leading spaces come from the "; " separator
                key, _, raw_value = value.strip().partition(" ")
                if key == "gene_id":
                    gene_id = raw_value.strip().strip("\"")
                elif key == "gene_name":
                    gene_name = raw_value.strip().strip("\"")
            if gene_id is None:
                #nothing to key the record on
                continue
            dict_value = [chr_num, gene_id, gene_name if gene_name else "."]
            gene_gtf_dict.setdefault(gene_id, []).append(dict_value)
    return gene_gtf_dict

#this returns chr num, gene id, transcript id, gene name if available
def read_ensembl_gtf_transcripts():
    """Collect chr num, gene id, transcript id and gene name for every "transcript" record in the GTF.

    The GTF path is taken from sys.argv[4]. Header/comment lines (starting
    with "#") and lines with fewer than nine tab-separated columns are
    skipped instead of raising IndexError.

    Returns:
        dict mapping transcript id -> list of [chr_num, gene_id,
        transcript_id, gene_name], one entry per transcript record with that
        id. gene_name is "." when the record has no gene_name attribute.
    """
    gtf_file = sys.argv[4]
    transcripts_gtf_dict = {}
    with open(gtf_file, 'r') as gtf:
        for line in gtf:
            if line.startswith("#"):
                continue
            new_line = line.rstrip("\n").split("\t")
            if len(new_line) < 9 or new_line[2] != "transcript":
                continue
            chr_num = new_line[0]
            #reset for every record so values parsed from an earlier
            #transcript can never leak into this one
            gene_id = None
            transcript_id = None
            gene_name = ""
            for value in new_line[8].split(";"):
                #attributes look like: key "value"; leading spaces come from the "; " separator
                key, _, raw_value = value.strip().partition(" ")
                if key == "gene_id":
                    gene_id = raw_value.strip().strip("\"")
                elif key == "gene_name":
                    gene_name = raw_value.strip().strip("\"")
                elif key == "transcript_id":
                    transcript_id = raw_value.strip().strip("\"")
            if gene_id is None or transcript_id is None:
                #nothing to key the record on
                continue
            dict_value = [chr_num, gene_id, transcript_id, gene_name if gene_name else "."]
            transcripts_gtf_dict.setdefault(transcript_id, []).append(dict_value)
    return transcripts_gtf_dict


#weird isoforms to be handled manually
#returns list of isoforms to remove from other files
def pull_weird_isoforms():
    """Read the isoform ids listed, one per line, in the file named by sys.argv[5].

    Only newline characters are stripped from each line; blank lines are kept
    as empty strings.

    Returns:
        list of isoform ids in file order.
    """
    with open(sys.argv[5], 'r') as handle:
        return [entry.strip("\n") for entry in handle]

#remove weird isoforms from classification, bed, gtf, fasta, and faa files
def filtered_class():
    """Return the classification dictionary without the isoforms to be handled manually.

    Isoform ids come from pull_weird_isoforms(); ids that are not present in
    the classification dictionary are ignored.
    """
    excluded = pull_weird_isoforms()
    remaining = read_class()
    for name in excluded:
        #pop with a default: absent ids are simply skipped
        remaining.pop(name, None)
    return remaining

def filtered_bed():
    """Return the bed dictionary without the isoforms to be handled manually.

    Isoform ids come from pull_weird_isoforms(); ids that are not present in
    the bed dictionary are ignored.
    """
    excluded = pull_weird_isoforms()
    remaining = read_bed()
    for name in excluded:
        #pop with a default: absent ids are simply skipped
        remaining.pop(name, None)
    return remaining

#create gene feature
#gene feature should have the following values:
#chr num, source=PacBio, feature=gene, start, end, score=".", strand, frame = ".", attributes = gene id, gene source, gene biotype, gene name if available
#this function pulls the gene position by taking the lowest "start" and the highest "end"; start and end are relative, given that everything is in increasing order (- strand goes the opposite direction)
#returns dictionary with key == gene and value == [chr num, strand, protein coding ability, start pos, end pos]
def pull_gene_positions():
    """Derive one position record per gene from its isoforms.

    Isoforms present in both the filtered classification and filtered bed
    dictionaries are grouped by gene id. A gene with a single isoform keeps
    that isoform's record unchanged (positions stay strings). A gene with
    several isoforms gets the smallest start and largest end as ints, with
    chr num, strand and coding status taken from its last isoform.

    Returns:
        dict mapping gene id -> [chr_num, strand, protein_coding, start_pos,
        end_pos].
    """
    #NOTE(review): positions are passed through exactly as read from the bed
    #dictionary; if that file is a 0-based BED, GTF output would normally need
    #start + 1 - confirm the coordinate convention of the input
    class_records = filtered_class()
    bed_records = filtered_bed()
    per_gene = {}
    for name, class_entry in class_records.items():
        if name not in bed_records:
            continue
        bed_entry = bed_records[name]
        summary = [bed_entry[1], class_entry[2], class_entry[5], bed_entry[2], bed_entry[3]]
        per_gene.setdefault(class_entry[3], []).append(summary)
    merged = {}
    for gene_id, members in per_gene.items():
        if len(members) == 1:
            merged[gene_id] = members[0]
            continue
        lowest = min(int(member[3]) for member in members)
        highest = max(int(member[4]) for member in members)
        #descriptive fields come from the last isoform seen for this gene
        chrom, orientation, coding = members[-1][:3]
        merged[gene_id] = [chrom, orientation, coding, lowest, highest]
    return merged

#creates final gene feature
def create_gene_feature():
    """Build the GTF "gene" record for every gene returned by pull_gene_positions().

    The source column and the gene_source attribute both come from
    sys.argv[1]. A gene_name attribute is added only when the gene id is
    found in the Ensembl GTF with a gene name other than ".".

    Returns:
        dict mapping gene id -> list of nine strings: [chr_num, source,
        "gene", start, end, ".", strand, ".", attributes].
    """
    gene_positions = pull_gene_positions()
    source = sys.argv[1]
    ensembl_genes = read_ensembl_gtf_genes()
    biotype_labels = {"coding": "protein_coding", "non_coding": "nonprotein_coding"}
    gene_feature_dict = {}
    for gene, single_gene_position in gene_positions.items():
        chr_num, strand, coding_potential, start_pos, end_pos = single_gene_position
        #always computed for the current gene; a value other than
        #"coding"/"non_coding" is passed through unchanged rather than
        #reusing the biotype of the previous gene
        final_potential = biotype_labels.get(coding_potential, coding_potential)
        gene_attributes = "gene_id %s; gene_source %s; gene_biotype %s;" % (str(gene), str(source), str(final_potential))
        #first Ensembl record for this gene id supplies the name; "." means no name
        gene_name = ensembl_genes[gene][0][2] if gene in ensembl_genes else "."
        if gene_name != ".":
            gene_attributes += " gene_name %s;" % (str(gene_name),)
        gene_feature = [str(chr_num), str(source), "gene", str(start_pos), str(end_pos), ".", str(strand), ".", gene_attributes]
        gene_feature_dict[gene] = gene_feature
    return gene_feature_dict


#create transcript feature
#this will include:
#chr num, source=PacBio, feature=transcript, start, end, score=".", strand, frame = ".", attributes = gene id, gene source, gene biotype, gene name if available, transcript id, transcript source, transcript biotype; isoform id
#difference between transcript and isoform id: transcript id will be either ENSGACT or novel and isoform id is PB.XXXX.X
def create_transcript_feature():
    """Build the GTF "transcript" record for every isoform found in both the classification and bed data.

    The source column and the gene_source/transcript_source attributes come
    from sys.argv[1]. A gene_name attribute is added only when the transcript
    id is found in the Ensembl GTF with a gene name other than ".".

    Returns:
        dict mapping gene id -> list of transcript records, each a list of
        nine strings: [chr_num, source, "transcript", start, end, ".",
        strand, ".", attributes].
    """
    class_dict = filtered_class()
    bed_dict = filtered_bed()
    ensembl_dict = read_ensembl_gtf_transcripts()
    source = sys.argv[1]
    biotype_labels = {"coding": "protein_coding", "non_coding": "nonprotein_coding"}
    transcript_feature_dict = {}
    for isoform, single_isoform_class in class_dict.items():
        if isoform not in bed_dict:
            continue
        single_bed_isoform = bed_dict[isoform]
        chr_num = single_bed_isoform[1]
        isoform_start = single_bed_isoform[2]
        isoform_end = single_bed_isoform[3]
        strand = single_isoform_class[2]
        gene_id = single_isoform_class[3]
        transcript_id = single_isoform_class[4]
        coding_potential = single_isoform_class[5]
        #always computed for the current isoform; a value other than
        #"coding"/"non_coding" is passed through unchanged rather than
        #reusing the biotype of the previous isoform
        final_potential = biotype_labels.get(coding_potential, coding_potential)
        #first Ensembl record for this transcript id supplies the name; "." means no name
        gene_name = ensembl_dict[transcript_id][0][3] if transcript_id in ensembl_dict else "."
        attribute_parts = [
            "gene_id %s;" % (str(gene_id),),
            "gene_source %s;" % (str(source),),
            "gene_biotype %s;" % (str(final_potential),),
        ]
        if gene_name != ".":
            attribute_parts.append("gene_name %s;" % (str(gene_name),))
        attribute_parts.extend([
            "transcript_id %s;" % (str(transcript_id),),
            "transcript_source %s;" % (str(source),),
            "transcript_biotype %s;" % (str(final_potential),),
            "isoform_id %s;" % (str(isoform),),
        ])
        transcript_attributes = " ".join(attribute_parts)
        transcript_feature = [str(chr_num), str(source), "transcript", str(isoform_start), str(isoform_end), ".", str(strand), ".", transcript_attributes]
        transcript_feature_dict.setdefault(gene_id, []).append(transcript_feature)
    return transcript_feature_dict

#write partial gtf_file
def write():
    """Append the gene and transcript records to the output file named by sys.argv[6].

    For each gene, its gene record is written first and its transcript
    records follow. Every record is written as one tab-separated line. The
    file is opened in append mode, so existing content is kept.
    """
    gene_rows = create_gene_feature()
    transcript_rows = create_transcript_feature()
    destination = sys.argv[6]
    with open(destination, 'a') as handle:
        for gene_id, gene_fields in gene_rows.items():
            #gene line first, then every transcript that belongs to it
            records = [gene_fields] + transcript_rows[gene_id]
            for fields in records:
                handle.write("\t".join(fields) + "\n")

#module-level entry point: builds the gene and transcript features and appends them to the output file
#all inputs and the output path are read from sys.argv; this call runs whenever the file is executed or imported
write()
