#sorting blast results from blasting female isoforms on the X chromosome to genome to see if there are strong hits elsewhere
#to run script: python3 Pull.nonchrXIX.matches.py <BLAST_output_file> <isoseq_gtf_file>


import sys

#need to pull BLAST information
#used output format 9 = where there are 4 comment lines before each query where fourth line specifies labels for query lines
#order = query id, subject id (id in combined sexes), % identity, alignment length, mismatches, gap openings, q. start, q. end, s. start, s. end, e-value, bit score
#need to pull query id, subject id, alignment length, q. start, q. end, s. start, s.end
#will first create dictionary with key = query id and value = each line that matches to query id
def pull_BLAST_results(blast_output=None):
    """Group the hit lines of a tabular BLAST results file by query id.

    Comment lines (starting with "#") and blank lines are skipped; every
    other line is split on whitespace and filed under its first field.

    Args:
        blast_output: path to the BLAST results file. When omitted, the
            path is read from the first command-line argument
            (sys.argv[1]).

    Returns:
        dict mapping each query id to the list of its hit lines, in file
        order. Each hit line is the list of its whitespace-split fields,
        all kept as strings.
    """
    if blast_output is None:
        blast_output = sys.argv[1]
    blast_dict = {}
    with open(blast_output, 'r') as blast_results:
        for line in blast_results:
            if line.startswith("#"):
                continue
            fields = line.split()
            # a blank (or whitespace-only) line has no query id to index
            if not fields:
                continue
            blast_dict.setdefault(fields[0], []).append(fields)
    return blast_dict

#pull non chrXIX matches that have alignment length greater than 50 bp and percent identity greater than 95
def pull_nonchrXIX_matches(blast_dict=None, excluded_chr="chrXIX",
                           length_cutoff=50, identity_cutoff=95):
    """Keep long, high-identity BLAST hits that fall outside one chromosome.

    A hit is kept when its subject id (field 1) differs from
    ``excluded_chr``, its alignment length (field 3) is strictly greater
    than ``length_cutoff`` and its percent identity (field 2) is strictly
    greater than ``identity_cutoff``.

    Args:
        blast_dict: dict of query id -> list of hit field lists. When
            omitted, it is built by calling pull_BLAST_results().
        excluded_chr: subject id whose hits are discarded.
        length_cutoff: exclusive lower bound on alignment length.
        identity_cutoff: exclusive lower bound on percent identity.

    Returns:
        dict of query id -> list of kept hits. Query ids with no kept hit
        are absent from the result.
    """
    if blast_dict is None:
        blast_dict = pull_BLAST_results()
    filtered_matches = {}
    for query_id, hits in blast_dict.items():
        for hit in hits:
            subject_id = hit[1]
            percent_identity = float(hit[2])
            alignment_length = int(hit[3])
            if (subject_id != excluded_chr
                    and alignment_length > length_cutoff
                    and percent_identity > identity_cutoff):
                filtered_matches.setdefault(query_id, []).append(hit)
    return filtered_matches


#read in isoseq gtf with exon sizes
def read_isoseq_gtf(gtf_file=None):
    """Collect feature lengths per isoform id from a GTF file.

    Each data line is split on whitespace; fields 3 and 4 are read as the
    start and end coordinates, and field 9 (with the trailing ";" and the
    surrounding double quotes removed) is used as the isoform id.
    NOTE(review): field 9 is the value of the first attribute on the line;
    presumably that is the isoform/transcript id in this GTF - confirm.
    No filtering on the feature-type column is done, so every data line
    contributes a length.

    Args:
        gtf_file: path to the GTF file. When omitted, the path is read
            from the second command-line argument (sys.argv[2]).

    Returns:
        dict mapping each isoform id to the list of lengths found for it,
        in file order.
    """
    if gtf_file is None:
        gtf_file = sys.argv[2]
    exon_dict = {}
    with open(gtf_file, 'r') as gtf:
        for line in gtf:
            # header/comment lines carry no coordinates
            if line.startswith("#"):
                continue
            fields = line.split()
            if not fields:
                continue
            exon_start = int(fields[3])
            exon_end = int(fields[4])
            # GTF coordinates are 1-based and inclusive at both ends, so a
            # feature with start == end is 1 bp long
            exon_length = abs(exon_start - exon_end) + 1
            isoform_id_full = fields[9].strip(";")
            isoform_id = isoform_id_full.strip("\"")
            exon_dict.setdefault(isoform_id, []).append(exon_length)
    return exon_dict

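#further filter X isoforms: keep only BLAST matches whose alignment length is greater than one third of at least one exon of that isoform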
def filter_matches_with_exons(gtf_dict=None, filtered_blast_matches=None):
    """Keep BLAST hits that are long relative to at least one exon.

    For every query id present in both inputs, a hit is kept when its
    alignment length (field 3) is strictly greater than one third
    (truncated to an integer) of at least one of that isoform's exon
    lengths.

    Args:
        gtf_dict: dict of isoform id -> list of exon lengths. When
            omitted, it is built by calling read_isoseq_gtf().
        filtered_blast_matches: dict of query id -> list of hit field
            lists. When omitted, it is built by calling
            pull_nonchrXIX_matches().

    Returns:
        dict of query id -> list of kept hits. Query ids missing from
        ``gtf_dict`` or with no kept hit are absent from the result.
    """
    if gtf_dict is None:
        gtf_dict = read_isoseq_gtf()
    if filtered_blast_matches is None:
        filtered_blast_matches = pull_nonchrXIX_matches()
    final_filtered_blast_matches = {}
    for query_id, hits in filtered_blast_matches.items():
        if query_id not in gtf_dict:
            continue
        exon_lengths = gtf_dict[query_id]
        for hit in hits:
            alignment_length = int(hit[3])
            # one sufficiently short exon is enough for the hit to qualify
            if any(alignment_length > int(exon_length / 3)
                   for exon_length in exon_lengths):
                final_filtered_blast_matches.setdefault(query_id, []).append(hit)
    return final_filtered_blast_matches

#examine matches:
def examine():
    """Print the final filtered BLAST matches.

    Prints the number of query ids that still have matches, then, for
    each query id, the id itself followed by each of its matches (one
    list of BLAST fields per line).
    """
    matches_by_query = filter_matches_with_exons()
    print(len(matches_by_query))
    for query_id, hits in matches_by_query.items():
        print(query_id)
        for hit in hits:
            print(hit)


# run the analysis only when this file is executed as a script, so that
# importing the module does not read the input files or print anything
if __name__ == "__main__":
    examine()
