import sys

def read_class(class_file=None):
    """Read the classification table and index its isoform rows.

    Only lines that start with ``PB`` are used; every other line (for
    example a column-header line) is ignored.

    Args:
        class_file: Path to the tab-separated classification file. When
            omitted, ``sys.argv[1]`` is used.

    Returns:
        dict mapping isoform id -> ``[isoform, chr_num, strand]``, taken
        from the first three tab-separated columns (all kept as strings).
        If an isoform id occurs more than once, the last row wins.

    Raises:
        IndexError: if a ``PB`` row has fewer than three columns.
    """
    if class_file is None:
        class_file = sys.argv[1]
    class_dict = {}
    with open(class_file, 'r') as class_info:
        for line in class_info:
            if not line.startswith("PB"):
                continue
            # Remove only the line terminator (not all whitespace) so that
            # empty tab-separated fields keep their column positions.
            fields = line.rstrip("\r\n").split("\t")
            # Only the first three columns are part of the result; no other
            # column is touched, so shorter rows are still accepted.
            isoform, chr_num, strand = fields[0], fields[1], fields[2]
            class_dict[isoform] = [isoform, chr_num, strand]
    return class_dict

def read_bed(bed_file=None):
    """Read the BED file holding transcript start and end positions.

    Blank lines, ``#`` comment lines and ``track`` / ``browser`` header
    lines are skipped. Every remaining line is split on whitespace.

    Args:
        bed_file: Path to the BED file. When omitted, ``sys.argv[2]`` is
            used.

    Returns:
        dict mapping isoform id (4th column) ->
        ``[isoform, chr_num, start_pos, end_pos, strand]`` built from
        columns 4, 1, 2, 3 and 6. All values are kept as strings. If an
        isoform id occurs more than once, the last row wins.

    Raises:
        IndexError: if a data line has fewer than six columns.
    """
    if bed_file is None:
        bed_file = sys.argv[2]
    bed_dict = {}
    with open(bed_file, 'r') as bed_info:
        for line in bed_info:
            fields = line.split()
            # An empty line yields no fields, and header/comment lines are
            # not records; neither should be indexed as a transcript.
            if not fields:
                continue
            if fields[0] in ("track", "browser") or fields[0].startswith("#"):
                continue
            chr_num = fields[0]
            start_pos = fields[1]
            end_pos = fields[2]
            isoform = fields[3]
            strand = fields[5]
            bed_dict[isoform] = [isoform, chr_num, start_pos, end_pos, strand]
    return bed_dict


def read_isoseq_gtf(gtf_file=None):
    """Read the IsoSeq GTF and collect the coordinate rows of each isoform.

    Blank lines and ``#`` comment lines are skipped. Every remaining line
    is split on whitespace; the isoform id is the 10th token with the
    trailing ``;`` and the surrounding double quotes removed.

    Args:
        gtf_file: Path to the GTF file. When omitted, ``sys.argv[3]`` is
            used.

    Returns:
        dict mapping isoform id -> list of ``[strand, start, end]`` rows
        (all strings). Rows are in file order, except that an isoform
        with more than one row whose first row is on the ``-`` strand has
        its rows reversed. Isoforms with any other strand value are kept
        in file order.

    Raises:
        IndexError: if a data line has fewer than ten whitespace tokens.
    """
    if gtf_file is None:
        gtf_file = sys.argv[3]
    exon_dict = {}
    with open(gtf_file, 'r') as gtf:
        for line in gtf:
            fields = line.split()
            # Empty and comment lines carry no feature and must not be
            # indexed by position.
            if not fields or fields[0].startswith("#"):
                continue
            exon_start = fields[3]
            exon_end = fields[4]
            strand = fields[6]
            # NOTE(review): assumes the first attribute value (10th token)
            # is the isoform/transcript id -- confirm against the GTF writer.
            isoform_id = fields[9].strip(";").strip("\"")
            # NOTE(review): every feature row is recorded, whatever the
            # feature type in column 3 -- confirm the input holds exon rows
            # only if a pure exon list is required.
            exon_dict.setdefault(isoform_id, []).append(
                [strand, exon_start, exon_end]
            )
    for rows in exon_dict.values():
        # The strand of the first row decides the orientation of the whole
        # isoform; a single row needs no reordering.
        if len(rows) > 1 and rows[0][0] == "-":
            rows.reverse()
    return exon_dict


def compare():
    """Cross-check chromosome and transcript start across the three inputs.

    For every isoform in the classification table, the chromosome/scaffold
    is compared with the one in the BED file. When they agree, the BED
    start is compared with a start taken from the GTF rows: the first row
    for ``+`` strand isoforms, the last row for ``-`` strand isoforms. A
    start counts as wrong when its absolute difference from the BED start
    is not exactly 1. Isoforms whose BED strand is neither ``+`` nor ``-``
    are only checked for chromosome agreement.

    For each ``+`` strand mismatch the isoform id, the difference, the GTF
    start and the BED start are printed, one per line. Nothing is printed
    for chromosome or ``-`` strand mismatches.

    Returns:
        tuple ``(wrong_chr, plus_strand_wrong, minus_strand_wrong)`` with
        the number of isoforms in each mismatch category.

    Raises:
        KeyError: if an isoform from the classification table is missing
            from the BED or GTF data.
    """
    bed_dict = read_bed()
    class_dict = read_class()
    gtf_dict = read_isoseq_gtf()
    wrong_chr = 0
    plus_strand_wrong = 0
    minus_strand_wrong = 0
    for isoform, single_class in class_dict.items():
        class_chr_num = single_class[1]
        single_bed = bed_dict[isoform]
        bed_chr_num = single_bed[1]
        bed_start_pos = int(single_bed[2])
        bed_strand = single_bed[4]
        single_exon_list = gtf_dict[isoform]
        if class_chr_num != bed_chr_num:
            wrong_chr += 1
            continue
        # NOTE(review): the expected offset of 1 is presumably the gap
        # between 0-based BED starts and 1-based GTF starts; abs() also
        # accepts an offset of -1 -- confirm that is intended.
        if bed_strand == "+":
            exon_start = int(single_exon_list[0][1])
            diff = abs(exon_start - bed_start_pos)
            if diff != 1:
                print(isoform)
                print(diff)
                print(exon_start)
                print(bed_start_pos)
                plus_strand_wrong += 1
        elif bed_strand == "-":
            # The last row is used for the minus strand.
            exon_start = int(single_exon_list[-1][1])
            diff = abs(exon_start - bed_start_pos)
            if diff != 1:
                minus_strand_wrong += 1
    return wrong_chr, plus_strand_wrong, minus_strand_wrong

# Script entry point: runs the comparison with the input paths read from
# sys.argv[1] (classification table), sys.argv[2] (BED file) and
# sys.argv[3] (GTF file). There is no __main__ guard, so this call also
# runs whenever the module is imported.
compare()
