#need to count overlap between maker isoforms from Iso-Seq to ensembl annotations
#will read in Y chromosome bed file with positions of all isoforms on V5 assembly (Y chr)
#then will read in Mike's csv file to count overlap between isoforms and ensembl transcripts
#basically will assign isoforms as novel or overlapping ensembl annotation
#to run script: python3 Classify.Y.annotations.py <bed file with isoform positions> <csv file with positions of ensembl transcripts> <output file>
#Author: Alice Naftaly, March 2021

import sys

#read bed file:
#returns dictionary with key == isoform and value == start pos, end pos (does not take into account strand)
def read_bed():
    """Read isoform coordinates from the BED file named in ``sys.argv[1]``.

    Each data line is split on whitespace: the second and third columns are
    parsed as the integer start and end positions and the fourth column is
    used as the isoform name.  The first column and any columns after the
    name (strand included) are ignored.  Blank lines, ``#`` comment lines and
    ``track``/``browser`` header lines are skipped rather than crashing the
    parser.

    Returns:
        dict: isoform name -> ``[start_pos, end_pos]``.  If the same name
        appears on several lines, the last line wins.

    Raises:
        IndexError: if a data line has fewer than four columns.
        ValueError: if the start or end column is not an integer.
    """
    bed_file = sys.argv[1]
    bed_dict = {}
    with open(bed_file, 'r') as bed:
        for line in bed:
            fields = line.split()
            #blank lines and BED header/comment lines carry no coordinates
            if not fields or fields[0] in ("track", "browser") or fields[0].startswith("#"):
                continue
            bed_dict[fields[3]] = [int(fields[1]), int(fields[2])]
    return bed_dict


#read csv
#returns dictionary with key == Ensembl transcript id and value == start pos, end pos
def read_csv():
    """Read Ensembl transcript coordinates from the CSV named in ``sys.argv[2]``.

    Only lines that begin with ``ENSGACT`` are parsed; every other line
    (e.g. a header row) is ignored.  The first comma-separated field is the
    transcript id and the third and fourth fields are the two integer
    positions.  The positions may be given in either order, so they are
    normalised so that start <= end.

    Returns:
        dict: transcript id -> ``[start_pos, end_pos]``.

    Raises:
        IndexError: if a transcript line has fewer than four fields.
        ValueError: if a position field is not an integer.
    """
    csv_file = sys.argv[2]
    csv_dict = {}
    with open(csv_file, 'r') as handle:
        for line in handle:
            if not line.startswith("ENSGACT"):
                continue
            fields = line.split(",")
            pos_1 = int(fields[2])
            pos_2 = int(fields[3])
            #min/max also covers pos_1 == pos_2; the old if/elif left start/end
            #unassigned in that case and reused the previous row's coordinates
            csv_dict[fields[0]] = [min(pos_1, pos_2), max(pos_1, pos_2)]
    return csv_dict

#compare transcripts:
def compare():
    """Classify every isoform as novel or as overlapping Ensembl transcripts.

    Loads the isoform intervals with ``read_bed()`` and the transcript
    intervals with ``read_csv()``, then tests every isoform against every
    transcript.  Two intervals overlap when each one starts before the other
    ends (``y_start < ens_end and y_end > ens_start``).  Intervals that only
    touch at an endpoint are not counted as overlapping.  Only positions are
    compared; strand and chromosome are not consulted.

    NOTE(review): BED coordinates are normally 0-based half-open while
    Ensembl coordinates are normally 1-based inclusive; confirm whether
    intervals that touch at one endpoint should count as overlapping.

    Returns:
        dict: isoform name -> list of overlapping transcript ids, or
        ``["novel"]`` when no transcript overlaps the isoform.
    """
    y_isoforms = read_bed()
    ensembl_transcripts = read_csv()
    y_iso_summary = {}
    for iso, (y_start, y_end) in y_isoforms.items():
        #single interval test; the previous four-branch chain of strict
        #comparisons skipped any pair sharing a start or an end coordinate,
        #so an isoform identical to a transcript was reported as novel
        overlap = [
            transcript
            for transcript, (ens_start, ens_end) in ensembl_transcripts.items()
            if y_start < ens_end and y_end > ens_start
        ]
        y_iso_summary[iso] = overlap if overlap else ["novel"]
    return y_iso_summary

#write output
def write():
    """Append the classification of every isoform to the file in ``sys.argv[3]``.

    Runs ``compare()`` and writes one line per isoform: the isoform name, a
    tab, then the comma-joined classification (overlapping transcript ids or
    ``novel``).  The file is opened in append mode, so existing content is
    kept and new lines are added after it.
    """
    summary = compare()
    destination = sys.argv[3]
    with open(destination, 'a') as handle:
        for isoform_name, labels in summary.items():
            joined = ",".join(labels)
            handle.write("{}\t{}\n".format(str(isoform_name), str(joined)))


#script entry point: runs at module load, reads both input files, classifies the isoforms and appends the results to the output file
write()
