#!/usr/bin/env python3

import genutils3
import subprocess
import os
import gzip
import sys
from optparse import OptionParser 
import extract_for_assem
import matplotlib.pyplot as plt
import statistics
import time
import signal
import psutil
def create_output_file_name_and_canu_genome_length(region):
    """Derive the output prefix, canu genome size and 1-based region string.

    Args:
        region: Region string of the form ``"chrom:start-end"``.

    Returns:
        A tuple ``(file_name, canu_genome_length, one_based)`` where
        ``file_name`` is ``"chrom_start_end"``, ``canu_genome_length`` is
        ``end - start + 30001`` and ``one_based`` is the same region with the
        start coordinate incremented by one.
    """
    pieces = region.split(":")
    chromosome = pieces[0]
    bounds = pieces[1].split("-")
    start_text, end_text = bounds[0], bounds[1]

    first_base = int(start_text)
    last_base = int(end_text)

    file_name = f"{chromosome}_{start_text}_{end_text}"
    one_based = f"{chromosome}:{first_base + 1}-{last_base}"
    canu_genome_length = last_base - first_base + 30001
    return (file_name, canu_genome_length, one_based)

def extend_region_range(region, extension_length, return_start = False):
    """Widen a ``"chrom:start-end"`` region by ``extension_length`` on both sides.

    The extended start is clamped to 1 so the region never reaches before the
    first base of the chromosome.

    Args:
        region: Region string of the form ``"chrom:start-end"``.
        extension_length: Number of bases to add on each side.
        return_start: When true, also return the extended start and chromosome.

    Returns:
        The extended region string, or the tuple
        ``(extended_region, start, chromosome)`` when ``return_start`` is true.
    """
    split_region = region.split(":")
    chromosome = split_region[0]
    bounds = split_region[1].split("-")
    # Clamp at 1: subtracting the extension near the chromosome start would
    # otherwise yield zero or negative coordinates.
    start = max(int(bounds[0]) - extension_length, 1)
    end = int(bounds[1]) + extension_length
    extended_region = f"{chromosome}:{start}-{end}"
    if return_start:
        return extended_region, start, chromosome
    return extended_region

def filtering_samfile(sam, retention_value, fastq, file_name):
    """Select reads with genome and LINE-1 support and write them to a gzipped FASTQ.

    The SAM file is scanned once; all alignments of a read are expected on
    consecutive lines.  For each read three values are tracked: the largest
    summed ``M`` length over its alignments to ``L1_Cf``, the largest summed
    ``M`` length over its alignments to any other reference, and the longest
    single ``I`` run in its non-``L1_Cf`` alignments.  A read passes when both
    match lengths exceed 50 or when the insertion exceeds 50.

    Passing reads are copied from ``fastq`` into
    ``<sam prefix before "Local">Passed.fq.gz``.

    Args:
        sam: Path to the SAM file; the output name is derived from the part of
            this path that precedes ``"Local"``.
        retention_value: Minimum number of passing reads required to continue.
        fastq: Path to the gzipped FASTQ (four lines per record) holding the reads.
        file_name: Prefix for the ``<file_name>_Finished.txt`` marker that is
            written when too few reads pass.

    Returns:
        The string ``"Filtering_Step_Completed"``.

    Raises:
        SystemExit: When fewer than ``retention_value`` reads pass.
    """
    min_support = 50
    cigar_ops = ("S", "M", "I", "D", "X", "H")

    def op_lengths(CIGAR, wanted):
        """Yield the run length of every ``wanted`` operation in ``CIGAR``."""
        digits = ""
        for char in CIGAR:
            if char not in cigar_ops:
                digits += char
                continue
            if char == wanted:
                yield int(digits)
            digits = ""

    def length_of_Match(CIGAR):
        """Return the summed length of all ``M`` runs."""
        return sum(op_lengths(CIGAR, "M"))

    def Max_Ins(CIGAR):
        """Return the longest ``I`` run, or 0 when there is none."""
        return max(op_lengths(CIGAR, "I"), default=0)

    passed_reads = []
    total = 0
    Qualified_reads = 0
    Failed_reads = 0

    def classify(info):
        """Count a finished read as passed or failed and report it."""
        nonlocal Qualified_reads, Failed_reads
        if info[0] == "":
            # Placeholder used before the first alignment; not a real read.
            return
        support = info[1]
        if (support["LINE-1"] > min_support and support["Genome"] > min_support) or info[2] > min_support:
            Qualified_reads += 1
            print(f"This read passed. {info}")
            passed_reads.append(info[0])
        else:
            Failed_reads += 1
            print(f"This read failed. {info}")

    Readinfo = ["", {"Genome": 0, "LINE-1": 0}, 0]
    with open(sam) as samfile:
        for line in samfile:
            if line.startswith("@"):
                continue
            fields = line.split()
            if len(fields) < 6:
                continue
            # Bit 0x4 of the FLAG marks an unmapped read, whatever other bits are set.
            if int(fields[1]) & 0x4:
                continue
            total += 1
            if fields[0] != Readinfo[0]:
                classify(Readinfo)
                Readinfo = [fields[0], {"Genome": 0, "LINE-1": 0}, 0]
            if fields[2] == "L1_Cf":
                Readinfo[1]["LINE-1"] = max(length_of_Match(fields[5]), Readinfo[1]["LINE-1"])
            else:
                Readinfo[1]["Genome"] = max(length_of_Match(fields[5]), Readinfo[1]["Genome"])
                Readinfo[2] = max(Max_Ins(fields[5]), Readinfo[2])
    # The last read in the file has no following record to trigger its evaluation.
    classify(Readinfo)

    print(
        "The total number of alignments is %s.\n"
        "Of them %s have no match in the alignment genome.\n"
        "The number of alignments which qualify is %s and the number that failed is %s. \n"
        "The minimum number of qualified reads is %s."
        % (total, total - Qualified_reads - Failed_reads, Qualified_reads, Failed_reads, retention_value)
    )
    if Qualified_reads < retention_value:
        print("Region Analysis Terminated due to insufficient number of reads containing alignment to reference genome and Line 1 sequence")
        with open(f"{file_name}_Finished.txt", "w") as finished:
            finished.write("Not Enough Qualified Reads\n")
        sys.exit()
    print("Minimum read filtering value met or exceeded. Continuing analysis")

    print(f"The list of reads which passed filtering is {passed_reads}.")

    passed_fastq = sam.split("Local")[0] + "Passed.fq"
    print(passed_fastq)

    wanted_names = set(passed_reads)
    with gzip.open(fastq, "rb") as original_fq, gzip.open(passed_fastq + ".gz", "wb") as passed_fastq_file:
        fq_lines = iter(original_fq)
        for header in fq_lines:
            if not header.strip():
                continue
            # Consume the record as a unit of four lines.  Quality strings may
            # themselves begin with "@", so scanning line by line for headers
            # can mistake a quality line for the start of a record.
            record = [header, next(fq_lines, b""), next(fq_lines, b""), next(fq_lines, b"")]
            tokens = header.decode()[1:].split()
            # SAM QNAME has no whitespace, so compare against the first header token.
            name = tokens[0] if tokens else ""
            if header.startswith(b"@") and name in wanted_names:
                passed_fastq_file.writelines(record)

    return "Filtering_Step_Completed"
def length_of_insertion(CIGAR):
    """Locate the longest insertion (``I`` run) in a CIGAR string.

    Every operation length, including ``D`` runs, advances the running offset
    used to report where the longest insertion begins.

    Args:
        CIGAR: CIGAR string made of ``S``, ``M``, ``I``, ``D``, ``X`` and ``H`` runs.

    Returns:
        A tuple ``(length, longest_insertion, longest_insertion_start)``.
        ``length`` is always 0 because nothing in the function updates it.
    """
    cigar_ops = ("S", "M", "I", "D", "X", "H")
    length = 0
    best_size = 0
    best_offset = 0
    consumed = 0
    digits = ""
    for symbol in CIGAR:
        if symbol not in cigar_ops:
            digits += str(symbol)
            continue
        run = int(digits)
        digits = ""
        if symbol == "I" and best_size < run:
            best_size = run
            best_offset = consumed
        consumed += run
    return length, best_size, best_offset

def extracting_putative_insertions(sam, contigs):
    """Collect candidate insertions for each contig from its alignments in a SAM file.

    Two kinds of candidates are reported per contig: ``I`` runs longer than 50
    found in an alignment's CIGAR string, and stretches of the contig that no
    alignment covers and that span at least 50 positions.

    Args:
        sam: Path to the SAM file of contig-to-reference alignments.
        contigs: Iterable of contig names; SAM lines whose text starts with a
            name are treated as alignments of that contig.

    Returns:
        A dict mapping each contig name to a list.  The first element is a
        header string describing the columns; every following element is a
        four-item list ``[start in contig, end in contig, reference position
        before, reference position after]``.
    """
    def length_and_position_of_insertion(CIGAR, start):
        """Walk one CIGAR string and report its large insertions.

        Returns ``(putative_insertions, reference_end, clips, read_up_to)``
        where ``reference_end`` is ``start`` plus the summed ``M``/``X``/``D``
        lengths, ``clips`` is ``[0, end_in_contig]`` and ``read_up_to`` is the
        summed ``S``/``H``/``M``/``X``/``I`` lengths.
        """
        letters = ["S", "M", "I", "D", "X", "H"]
        match_length = 0
        read_up_to = 0
        new_appending = ""
        putative_insertions = []
        end_in_contig = 0

        for i in range(len(CIGAR)):
            char = CIGAR[i]
            if char not in letters:
                new_appending = new_appending + str(char)
                continue
            is_last = i + 1 == len(CIGAR)
            run = int(new_appending)
            new_appending = ""
            if char == "S" or char == "H":
                if is_last:
                    # A trailing clip is excluded from the aligned end position.
                    end_in_contig = read_up_to
                read_up_to = read_up_to + run
            elif char == "D":
                match_length = match_length + run
            elif char == "M" or char == "X":
                match_length = match_length + run
                read_up_to = read_up_to + run
                if is_last:
                    end_in_contig = read_up_to
            elif char == "I":
                if run > 50:
                    genome_position = match_length + start
                    putative_insertions.append([read_up_to, read_up_to + run, genome_position, genome_position + 1])
                read_up_to = read_up_to + run
                if is_last:
                    end_in_contig = read_up_to
        # Leading clips are not tracked, so the covered span always starts at 0.
        clips = [0, end_in_contig]
        return putative_insertions, int(match_length) + start, clips, read_up_to

    total_insertions_all_contigs = {}
    for contig_name in contigs:
        print(f"Detecting insertions in contig {contig_name}.")
        with open(sam) as samfile:
            print(f'samfile with name {sam} opened')
            regions = []
            total_insertions = []
            for line in samfile:
                # NOTE(review): prefix match; a contig name that is a prefix of
                # another contig's name would also match that contig's lines.
                if line.startswith(contig_name):
                    line = line.split()
                    insertions, end_position, clips_positions, alignment_length = length_and_position_of_insertion(line[5], int(line[3]))

                    for item in insertions:
                        print("New potential insertion detected with contig and reference coordinates")
                        print(item)
                        total_insertions.append(item)

                    # Region layout: reference name, alignment start, alignment end,
                    # flag, contig name, [contig start, contig end].
                    regions.append([line[2], int(line[3]),end_position,line[1],line[0],clips_positions])

        contig_positions = []
        for region in regions:
            contig_positions.append(region[-1])
        print(f"The positions in the contig which are covered by at least 1 alignment are {contig_positions}")
        if not regions == []:
            bp_hit = set()
            for region in regions:
                bp_hit.update(range(region[-1][0], region[-1][1] + 1))

            # NOTE(review): alignment_length comes from the last matching SAM
            # line only; confirm that is the intended contig length.
            not_counted = [i for i in range(0, alignment_length + 1) if i not in bp_hit]

            # Collapse consecutive uncovered positions into [first, last] pairs.
            bp_regions = []
            for i in range(len(not_counted)):
                if bp_regions ==[]:
                    bp_regions.append([not_counted[i]])
                    continue
                if i < len(not_counted) -1 and not_counted[i-1] == not_counted[i] - 1:
                    continue
                elif i < len(not_counted) -1 and not_counted[i-1] != not_counted[i] -1:
                    bp_regions[-1].append(not_counted[i-1])
                    bp_regions.append([not_counted[i]])
                else:
                    bp_regions[-1].append(not_counted[i])
            print(f"The regions of the contig which are not covered by an alignment are:\n{bp_regions}")


            reference_positions = {}
            for region in regions:
                reference_positions[region[-1][0]] = region[1]
                reference_positions[region[-1][1]] = region[2]
            print(f"The dictionary which maps contig positions to the aligned reference is {reference_positions}")


            # Filter uncovered regions that sit in the soft clips at the very
            # start or end of the contig; the insertion is expected near the
            # middle of the region because of the earlier filtering criteria.
            # NOTE(review): the branches test len(bp_regions) (the number of
            # regions) while the messages talk about a single region's
            # coordinates; len(bp_regions[i]) may have been intended. Left
            # unchanged pending confirmation.
            indexes_to_remove = []
            for i in range(len(bp_regions)):
                if len(bp_regions) == 2:
                    if bp_regions[i][0] > max(reference_positions.keys()):
                        print(f"Tail end of contig does not align to reference.  Removing {bp_regions[i]} from list of regions to analyze")
                        indexes_to_remove.append(i)
                        continue
                    elif bp_regions[i][1] < min(reference_positions.keys()):
                        print(f"Head of contig does not align to reference.  Removing {bp_regions[i]} from list of regions to analyze")
                        indexes_to_remove.append(i)
                        continue
                elif len(bp_regions) == 1:
                    print(f"bp_region has only one coordinate {bp_regions[i][0]}. Removing from subsequent analysis")
                    indexes_to_remove.append(i)

            # Pop from the back so earlier indexes stay valid.
            for index in reversed(indexes_to_remove):
                bp_regions.pop(index)

            # Attach the reference coordinates flanking each remaining gap.
            for i in range(len(bp_regions)):
                print(bp_regions)
                print(bp_regions[i][0])
                print(bp_regions[i][0]-1)
                print(reference_positions)
                append_value = reference_positions[bp_regions[i][0]-1]
                bp_regions[i].append(append_value)
                append_value = reference_positions[bp_regions[i][1]+1]
                bp_regions[i].append(append_value)


            print(f"The remaining bp regions which do not have an alignment to the reference are: {bp_regions}")

            for locus in bp_regions:
                if locus[1] - locus[0] < 50:
                    continue
                else:
                    total_insertions.append(locus)
        total_insertions.insert(0, "Start of Gap/Insertion, End of Gap/Insertion, Last base in previous alignment, First base in next alignment")
        total_insertions_all_contigs[contig_name] = total_insertions

    # Report the full per-contig dictionary; the per-contig list does not exist
    # when no contigs were supplied.
    print(f"The dictionary containing all potential insertions is:\n{total_insertions_all_contigs}")
    return total_insertions_all_contigs

def Determine_Contig_Orientation(sam, contig, dest_dir, locus):
    """Write each contig to its own FASTA file in the orientation of its longest alignment.

    For every contig in the SAM file the alignment with the longest ``seq``
    field decides the orientation.  Contigs whose deciding alignment is on the
    reverse strand are passed through ``genutils3.revcomp`` before writing.

    Args:
        sam: Path to the SAM file of contig alignments.
        contig: Path to the FASTA file holding the contig sequences.
        dest_dir: Unused; kept for interface compatibility.
        locus: Prefix for the output files, which are named
            ``<locus>_proper_contig_orientation<N>.fasta``.

    Returns:
        A dict mapping each output file number ``N`` to the contig name
        written to that file.

    Raises:
        KeyError: When a contig in the FASTA file has no alignment in ``sam``.
    """
    # contig name -> [length of the longest aligned seq, "Forward" | "Reverse"]
    contigs = {}
    with open(sam, 'rt') as sam_file:
        for line in sam_file:
            if line.startswith("@"):
                continue
            parsed_line = genutils3.parse_sam_line(line.rstrip().split())
            length = len(parsed_line['seq'])
            if parsed_line['reverseStrand'] is False:
                direction = "Forward"
            else:
                print('Contig in Reverse Orientation')
                direction = "Reverse"
            seq_name = parsed_line['seqName']
            if seq_name not in contigs or length > contigs[seq_name][0]:
                contigs[seq_name] = [length, direction]

    # header line (with newline) -> list of sequence chunks
    sequence_parts = {}
    with open(contig, 'rt') as contig_file:
        for line in contig_file:
            if line.startswith(">"):
                contig_key = line
                sequence_parts[contig_key] = []
            else:
                sequence_parts[contig_key].append(line.rstrip())

    name_number_dict = {}
    print(f"The contigs, their lengths, and original orientations are as follows:\n{contigs}")
    for contig_number, (key, parts) in enumerate(sequence_parts.items()):
        output_name = locus + "_proper_contig_orientation" + str(contig_number) + ".fasta"
        contig_name = key.split()[0][1::]
        name_number_dict[contig_number] = contig_name
        sequence = "".join(parts)
        direction = contigs[contig_name][1]
        if direction == 'Forward':
            oriented = sequence
        elif direction == 'Reverse':
            oriented = genutils3.revcomp(sequence)
        else:
            raise Exception("Contig Direction Not Properly Specified")
        with open(output_name, 'wt') as output_file:
            output_file.write(key)
            output_file.write(oriented)
        print(f'{direction} contig {key} written to file {output_name}.')
    return name_number_dict

def length_of_alignment_from_CIGAR_string(cigar_string):
    """Return the reference span covered by an alignment's CIGAR string.

    Runs of ``M``, ``X`` and ``D`` add to the span.  Clipped (``S``, ``H``)
    and inserted (``I``) runs are ignored, so the result is a length in
    reference coordinates.
    """
    spanning_ops = ("D", "M", "X")
    skipped_ops = ("S", "H", "I")
    span = 0
    digits = []
    for symbol in cigar_string:
        if symbol in spanning_ops:
            span += int("".join(digits))
            digits = []
        elif symbol in skipped_ops:
            digits = []
        else:
            digits.append(str(symbol))
    return span

def insertion_location(age_file):
    """Extract insertion, excision and breakpoint-identity coordinates from an AGE report.

    Only the first 31 lines of the file are read and fields are taken from
    fixed line positions: line 13 holds the second sequence's alignment,
    lines 16 and 17 the excised regions, and line 20 the first sequence's
    identity at the breakpoints.  When line 18 starts with
    ``"ALTERNATIVE REGION"`` the three alternative-region lines are removed
    before line 20 is read.

    Args:
        age_file: Path to the AGE output file.

    Returns:
        ``(first_seq, excision_ref_coordinants, TSD_boundaries)`` where
        ``first_seq`` is ``[length, [start, end]]`` of the region excised from
        the first sequence (as ints), ``excision_ref_coordinants`` is a pair of
        coordinate strings taken from the second sequence's alignment line, and
        ``TSD_boundaries`` pairs the breakpoint identity length (a string) with
        its two boundary coordinates (``[None, None]`` when that length is
        ``"0"``).  Returns ``("", "", "")`` when the first sequence has no
        excised region.
    """
    with open(age_file, 'rt') as age_handle:
        excised_regions = age_handle.readlines()[0:31]

    alignment_second = excised_regions[13]
    excised_first = excised_regions[16]

    # Drop the optional alternative-region lines so the later fixed indexes line up.
    if excised_regions[18].startswith("ALTERNATIVE REGION"):
        excised_regions = excised_regions[0:18] + excised_regions[21::]

    id_at_breaks_first = excised_regions[20]

    # Identity at the breakpoints (the estimated target site duplication).
    TSDs = id_at_breaks_first.split()
    if TSDs[3] == "0":
        TSD_boundaries = [TSDs[3], [None,None]]
    else:
        TSD_boundaries = TSDs[3], [TSDs[-3].strip("][").split(",")[1], TSDs[-1].strip("][").split(",")[0]]
    print(f"The estimated target sight duplication length and boundaries are {TSD_boundaries}.")

    # Length and contig coordinates of the insertion.
    excised_fields = excised_first.split()
    if excised_fields[3] == "0":
        print(f"No excised regions in age file {age_file}.")
        # Without an excised region in the contig there is nothing to report.
        return "", "", ""
    span_start, span_end = excised_fields[5].split(",")[0:2]
    first_seq = [int(excised_fields[3]), [int(span_start[1::]), int(span_end[:-1])]]
    print(f"The length of the insertion and contig coordinates are {first_seq}.")

    # Reference coordinates flanking the excision: the end of the first
    # bracketed interval and the start of the second one.
    bracketed = alignment_second.split("[")
    excision_ref_coordinants = [
        bracketed[1].split("]")[0].split(',')[1],
        bracketed[2].split("]")[0].split(',')[0],
    ]
    print(f"The coordinates excised from the reference are: {excision_ref_coordinants}")

    return first_seq, excision_ref_coordinants, TSD_boundaries

def Normalize_Fasta_File(file_name):
    """Rewrite a FASTA file with sequence lines wrapped at 50 characters.

    Header lines are copied unchanged.  Sequence text is re-wrapped per
    record, and the final partial line of each record is written as well so
    that no bases are lost.

    Args:
        file_name: Path to the input FASTA file.  The output is written next
            to it with ``.fasta`` replaced by ``.normalized.fasta``.

    Returns:
        A status string naming the generated file.
    """
    line_width = 50
    outfile_name = file_name.split(".fasta")[0] + ".normalized.fasta"
    print(file_name, outfile_name)
    with open(file_name, 'rt') as fasta_file, open(outfile_name, 'wt') as outfile:
        pending = ""
        for line in fasta_file:
            if line.startswith(">"):
                # Flush what is left of the previous record before its successor starts.
                if pending:
                    outfile.write(pending + "\n")
                    pending = ""
                outfile.write(line)
                continue
            pending += line.rstrip()
            while len(pending) >= line_width:
                outfile.write(pending[:line_width] + "\n")
                pending = pending[line_width:]
        if pending:
            outfile.write(pending + "\n")
    return f"successfully generated normalize fasta file {outfile_name}."

def Repeat_Positions(repeat_masker_output, range_between_TSDs):
    """Measure the LINE/L1 content of an insertion from RepeatMasker output.

    Lines whose eleventh whitespace-separated field is ``LINE/L1`` are
    compared with the insertion interval using their sixth and seventh fields
    as start and end.  Hits overlapping the interval are clipped to it.

    Args:
        repeat_masker_output: Path to the RepeatMasker output table.
        range_between_TSDs: Two-item sequence; its second item is used as the
            insertion start and its first item as the insertion end.

    Returns:
        ``(L1_content, transduction, orientation)``.  With exactly one L1 hit,
        ``L1_content`` is the clipped hit length, ``transduction`` is the
        interval between the hit and the insertion boundary (after the hit for
        a forward hit, before it otherwise) and ``orientation`` is
        ``"Forward"`` or ``"Reverse"``.  With no hit or several hits, the first
        two items are explanatory strings and ``orientation`` is ``None``.
    """
    insertion = (range_between_TSDs[1], range_between_TSDs[0])
    ins_start = int(insertion[0])
    ins_end = int(insertion[1])
    initial_range = ins_end - ins_start
    L1s = []
    print(f"INSERTION IS {insertion}")
    with open(repeat_masker_output, 'rt') as file:
        for line in file:
            if line.startswith("   SW") or line.startswith("score") or line == "\n":
                continue
            fields = line.split()
            if len(fields) < 11 or fields[10] != "LINE/L1":
                continue
            print(fields)
            rm_start = int(fields[5])
            rm_end = int(fields[6])
            # Skip hits lying entirely outside the insertion.
            if rm_end < ins_start or rm_start > ins_end:
                continue
            # A "C" in the ninth field marks a hit on the complementary strand.
            orientation = "Reverse" if fields[8] == "C" else "Forward"
            # Clip the hit to the insertion; this also covers hits that touch
            # a boundary exactly.
            L1s.append([max(rm_start, ins_start), min(rm_end, ins_end), orientation])

            print(initial_range)
            print(L1s)

    # Determine the L1 content between the TSDs.
    if len(L1s) > 1:
        L1_content = "N/A Multiple L1s detected"
        transduction = "N/A Multiple L1s detected"
        orientation = None
    elif not L1s:
        L1_content = "No L1 detected"
        transduction = "No L1 detected"
        orientation = None
    else:
        # Use the recorded hit itself rather than loop variables, which later
        # lines of the file would have overwritten.
        l1_start, l1_end, orientation = L1s[0]
        L1_content = (l1_end - l1_start) + 1
        print(L1_content)

        # Putative 3' transduction detection.
        if orientation == "Forward":
            transduction = [l1_end, insertion[1]]
        else:
            transduction = [insertion[0], l1_start]
    return L1_content, transduction, orientation
    
    
def Repeat_Masker_and_Generating_MiroPeats_Images(file_name, destination, locus, sam, ref, contig_file_number, start_of_samref, ref_ins_start, ref_ins_end, positions_in_generated_contig):
    """Generate annotated miropeats images and a liftOver BED file for one contig.

    The function scans ``sam`` for the outermost positions covered by its mapped
    alignments, pads that span by 10 kb on each side, extracts it from ``ref``
    with ``samtools faidx`` and then runs miropeats, RepeatMasker,
    ``annotate-miropeats-2seqs.py``, ``ps2pdf`` and ``liftOver`` from inside the
    ``<file_name>_Miro`` directory (created when missing).  The caller's working
    directory is restored on exit, including when an exception is raised.

    Args:
        file_name: Prefix shared by the run's files; also names the ``_Miro``
            working directory.
        destination: Unused; kept so existing callers keep working.
        locus: Value handed to the annotation script as ``--siteID``.
        sam: Path to the SAM file whose alignments define the reference span.
        ref: Reference fasta handed to ``samtools faidx``.
        contig_file_number: Identifier embedded in the per-contig file names.
        start_of_samref: Offset added to every SAM position (presumably the
            start of the reference segment the SAM was aligned against --
            confirm against the caller).
        ref_ins_start: Insertion start, in the same coordinates as the padded
            span.
        ref_ins_end: Insertion end, in the same coordinates as the padded span.
        positions_in_generated_contig: Two-element sequence with the insertion
            start and end in the contig; passed as ``--bottomBreak``.

    Returns:
        True once every external command has been run.  Exit statuses of the
        external commands are not checked.

    Raises:
        ValueError: If ``sam`` contains no mapped alignment record.
    """
    # To start, identify the region of the reference genome that the contigs
    # align to.  The SAM file is read (and closed) before any directory change
    # so a relative ``sam`` path keeps resolving against the caller's directory.
    start = 1000000000
    end = 0
    start_chr = None
    with open(f"{sam}", 'rt') as samfile:
        for line in samfile:
            if line.startswith("@") or not line.strip():
                continue
            samline = line.split()
            # Unmapped records carry RNAME "*" and POS 0; letting them through
            # would drag ``start`` down to the offset itself and could leave
            # "*" as the chromosome name.
            if samline[2] == "*":
                continue
            start_chr = samline[2]
            start_pos = int(samline[3])
            length_of_alignment = length_of_alignment_from_CIGAR_string(samline[5])
            start = min(start, start_pos + start_of_samref)
            end = max(end, start_pos + length_of_alignment + start_of_samref)

    if start_chr is None:
        raise ValueError(f"No mapped alignment records found in {sam}; cannot determine the reference region.")

    # The last mapped record decides the chromosome name; anything after a ":"
    # in the reference name is dropped.
    chrom = start_chr.split(":")[0]
    # Keep the padded start at 1 or above.
    if int(start)-10000 < 1:
        start = 10001
    region_start = int(start) - 10000
    region_end = int(end) + 10000
    coordinates = "%s:%s-%s" % (chrom, region_start, region_end)

    return_dir = os.getcwd()
    miro_dir = f"{file_name}_Miro"
    if not os.path.isdir(miro_dir):
        print("Path does not exist")
        os.mkdir(miro_dir)
    os.chdir(miro_dir)
    try:
        print(f"The coordinates used by samtools faidx to generate miropeats images are {coordinates}.")

        reference_fasta = f"Reference_Fasta_{contig_file_number}.fa"
        contig_fasta = f"../{file_name}_Racon_Corrected_Contig{contig_file_number}.normalized.fasta"

        #Create reference fasta with samtools faidx
        print("Checking if reference fasta exists and creating it if it does not")
        if os.path.exists(reference_fasta):
            print(f"Reference File Reference_Fasta_{contig_file_number}.fa already exists")
        else:
            command = f'samtools faidx {ref} {coordinates} > {reference_fasta}'
            print(f"Creating reference file:\n{command}")
            subprocess.run(command, shell=True)

        command = f'miropeats -s 200 -onlyinter -seq {reference_fasta} {contig_fasta} | tee > 200.mrout'
        print("miropeats commands to generat unannotated miropeats images")
        print(command)
        subprocess.run(command, shell=True)
        print('ps2pdf threshold200.ps')
        subprocess.run('ps2pdf threshold200.ps', shell=True)

        #Use Repeat Masker to generate output for the two regions
        print('RepeatMasker commands to assess the repeat content of the reference sequence')
        if os.path.exists(f"{reference_fasta}.out"):
            print(f"RepeatMasker output for file Reference_Fasta_{contig_file_number}.fa.out already exists.")
        else:
            command = f'RepeatMasker -species dog {reference_fasta}'
            print(f"Creating RepeatMasker file for Reference_Fasta_{contig_file_number}.fa")
            subprocess.run(command, shell=True)

        #determine the position of the insertion in the reference sequence
        topBreak_start = ref_ins_start - region_start
        topBreak_end = ref_ins_end - region_start
        print(f'start in reference of insertion is {ref_ins_start}, end is {ref_ins_end}.')
        print(f"The start of the breakpoint in the reference is {topBreak_start}.\nThe end of the breakpoint in the reference is {topBreak_end}")

        #Use the annotate miropeats script to create the annotated ps file.  Requires KiddLabCustom/2.0
        # NOTE(review): the "." after the second --bottomBreak value is kept
        # exactly as it was; it looks like a typo -- confirm how the annotation
        # script parses this argument before removing it.
        command = (
            f'annotate-miropeats-2seqs.py --miroin threshold200.ps --siteID {locus} '
            f'--topRM {reference_fasta}.out --bottomRM {contig_fasta}.out --topName {chrom} '
            f'--topBreak {topBreak_start},{topBreak_end} --bottomBreak {positions_in_generated_contig[0]},{positions_in_generated_contig[1]}. '
        )
        print("annotated miropeats commands:")
        print(command)
        subprocess.run(command, shell=True)
        subprocess.run('ps2pdf threshold200.ps', shell=True)
        print('ps2pdf threshold200.ps')
        command = f'ps2pdf -dEPSCrop threshold200.ps.annotated.ps ../Annotated_Image_{chrom}_{ref_ins_start}_{ref_ins_end}.pdf'
        print(command)
        subprocess.run(command, shell=True)

        #liftover
        lift_bed = "../%s_Contig_%s_%s_%s_%s.bed" % (file_name, contig_file_number, chrom, region_start, region_end)
        canfam_bed_line = "%s    %s    %s" % (chrom, region_start, region_end)
        with open(lift_bed, 'wt') as bed:
            bed.write(canfam_bed_line)
        command = f'liftOver {lift_bed} /home/jmkidd/links/kidd-lab/genomes/lift-over/canFam3.1ToZoey2.3.chain.gz ../{file_name}_lifted{contig_file_number}.bed ../{file_name}_unlifted{contig_file_number}.bed'
        subprocess.run(command, shell=True)
    finally:
        # Always hand the caller back its original working directory.
        os.chdir(return_dir)
    return True
if __name__ == "__main__":
    # Script entry point, part 1: parse options, extract reads for the locus,
    # build the local reference used for filtering, and attempt a Canu assembly.
    USAGE = """ USAGE:  python Automate_Region_Analysis_With_Python.py --directory <str destination directory for output> --locus <str locus of interest chr:start-end> \
--bam <bam file> --reference <reference fasta file> --output <str name of output directory>"""

    parser = OptionParser(USAGE)
    parser.add_option("--directory",dest='working_dir',help='prefix of working directory')
    parser.add_option("--locus", dest='locus_str',help='genomic locus which will undergo analysis in form str chr:start-end')
    parser.add_option("--bam", dest='bam_file', help='Bamfile used for read extraction and alignment purposes')
    parser.add_option("--reference",dest='ref_file',help='Reference fasta file used for minimap2 alignment step')
    parser.add_option("--L1_sequence", dest="L1_seq", help="path to L1 sequence. Default is L1_Cf")
    parser.add_option("--output", dest = "output_directory", help = "name of directory where the output will be placed")
    (options, args)=parser.parse_args()

    # Directory holding this script; the default L1 consensus is looked up there.
    CLAINE_dir = os.path.dirname(sys.argv[0])

    if options.working_dir is None:
        parser.error('No working directory set')
    if options.locus_str is None:
        parser.error('No locus set')
    if options.bam_file is None:
        parser.error("No Bamfile provided")
    if options.ref_file is None:
        parser.error("No Reference fasta provided")
    # The output directory is concatenated into a path below, so a missing
    # value is reported here instead of surfacing as a TypeError later.
    if options.output_directory is None:
        parser.error("No output directory provided")
    if options.L1_seq is None:
        L1_seq = "%s/L1_Cf-FINAL.fa" % (CLAINE_dir)
    else:
        L1_seq = options.L1_seq

    print(f'All args accounted for.\n...................\nBeginning analysis of region {options.locus_str}\n')
    file_name_prefix, genome_length, one_based_region = create_output_file_name_and_canu_genome_length(options.locus_str)
    print(f"The one based representation of the region is {one_based_region}.")
    # Record the loaded environment modules in the log.
    command = "module list"
    process = subprocess.Popen([command], stdout=subprocess.PIPE, shell=True)
    stdout = process.communicate()
    print(f"The modules loaded during this run are as follows {stdout}")

    print('Testing if Output Directory Already Exists')

    #determine which chromosome the region is from
    ref_chr = options.locus_str.split(":")[0]
    # Strip a leading "chr" when present; names without the prefix are used as
    # they are instead of raising an IndexError.
    if ref_chr.startswith("chr"):
        ref_chr = ref_chr[len("chr"):]
    chr_directories = [str(i) for i in range(0, 39)]
    chr_directories.append("X")
    print(chr_directories)
    if ref_chr in chr_directories:
        locus_chr = f"chr{ref_chr}"
    else:
        locus_chr = "other"
    outdir = options.working_dir + "/" + options.output_directory
    os.chdir(outdir)

    #create tracker for start of run
    start_command = f"touch {file_name_prefix}_start.txt"
    subprocess.run(start_command, shell=True)

    #begin read extraction protocol
    fastq_out= file_name_prefix +".fq.gz"
    print("Creating fastq_file with name %s.\n" % (fastq_out))

    extract_for_assem.extract_long_reads(options.bam_file, one_based_region, fastq_out,debug=True)

    #Filtering Steps
    #Generate files required for analysis
    print("Using Minimap2 to filter regions based on read alignments to local reference sequence and LINE-1 content")
    extended_locus = extend_region_range(one_based_region, 10000)
    command = "samtools faidx %s %s > %s_Reference.fasta" % (options.ref_file, extended_locus, file_name_prefix)
    print(f"The samtools faidx command used to extract sequence from the reference is:\n{command}")
    subprocess.run(command, shell=True)
    # Append the L1 sequence so reads can align to either the locus or LINE-1.
    command = "cat %s >> %s_Reference.fasta" % (L1_seq, file_name_prefix)
    print(f"The combined fasta file containing the faidx reference sequence and LINE-1 sequence is:\n{command}")
    subprocess.run(command, shell=True)

    #align the reads to the combined fasta file
    command = f"minimap2 -ax map-pb {file_name_prefix}_Reference.fasta {file_name_prefix}.fq.gz > {file_name_prefix}_Local_mapped.sam"
    print(f"The minimap2 command which aligns the reads to the combined fasta local genome is: \n{command}")
    subprocess.run(command, shell=True)

    #Analyze if threshold is met
    samfile = f"{file_name_prefix}_Local_mapped.sam"

    #Begin Canu Analysis
    Canu_Detection = True
    #want to add a catch which will handle low read coverage.  Could do as a binary or int for minInput or stopOnLow
    print(f"Beginning Canu assembly of reads found in {fastq_out}.")
    print("Predicted Canu genome length is %s bp." % (genome_length))
    canu_out = file_name_prefix + "_canu"
    genome_size_command = "genomeSize=" + str(genome_length)
    command = "canu -useGrid=false -maxThreads=3 -corThreads=1 minInputCoverage=5 stopOnLowCoverage=5 -d %s -p %s %s -pacbio %s" % (canu_out, file_name_prefix, genome_size_command, fastq_out)
    print(f"Canu command is: {command}")
    subprocess.run(command, shell=True)
    print("\n")

    #Check for Canu output
    # A missing or empty contig file only clears Canu_Detection; the run is not
    # stopped here despite the wording of the first message.
    canu_contigs = f"{canu_out}/{file_name_prefix}.contigs.fasta"
    if not os.path.exists(canu_contigs):
        print(f"Canu did not generate {canu_out}/{file_name_prefix}.contigs.fasta\nTerminating run.")
        Canu_Detection = False
    elif os.stat(canu_contigs).st_size == 0:
        print(f"{canu_out}/{file_name_prefix}.contigs.fasta empty")
        Canu_Detection = False
    def process_local_alignment_output(out, file_prefix, path_contig_file, locus_str, ref):
        """Align assembled contigs to a 100 kb-padded reference window and list candidates.

        Args:
            out: Output directory handed on to ``Determine_Contig_Orientation``.
            file_prefix: Prefix for every file this step writes or reads.
            path_contig_file: Fasta file holding the assembled contigs.
            locus_str: Region string (``chr:start-end``) to pad by 100 kb per side.
            ref: Reference fasta the padded window is extracted from.

        Returns:
            A 5-tuple ``(Detection, count_contigs, potential_insertions,
            reference_start_pos, chromosome)``.  ``Detection`` is False when no
            contig is left after dropping those without putative insertions.
        """
        #Using Minimap2 to align the contig back to reference genome of interest.
        print("Using Minimap2 to align the Canu contig(s) back to a segment of the reference genome")

        #Create 100kb 5'/3' window for minimap alignment
        extended_locus_100kb, reference_start_pos, chromosome = extend_region_range(locus_str, 100000, True)
        print(f"The extended locus that will be extracted with samtools faidx is {extended_locus_100kb}.\nThe start position of this extraction is at position {reference_start_pos} (1 based?).")

        extended_fasta = f"{file_prefix}_Extended_Faidx_100kb.fasta"
        alignment_sam = f"{file_prefix}_Minimap_Output_100kb_extension.sam"

        #use samtools faidx to create the region
        command = f"samtools faidx {ref} {extended_locus_100kb} > {extended_fasta}"
        print(f"The samtools faidx command extracting the portion of the reference genome to be used is:\n{command}")
        subprocess.run(command, shell = True)

        #use minimap2 to perform alignment
        command = f"/home/jmkidd/links/kidd-lab/progs/minimap2-2.20/minimap2-2.20_x64-linux/minimap2 -ax map-pb {extended_fasta} {path_contig_file} > {alignment_sam}"
        print(f"The minimap2 command which aligns the contigs to the faidx extracted reference is:\n{command}\n")
        subprocess.run(command, shell = True)

        #need to get contig names from the assembler's fasta file
        # A contig name is the first whitespace-delimited token of the header
        # line, without the leading ">".
        contig_names = []
        with open(path_contig_file, 'rt') as contig_file:
            for line in contig_file:
                if line.startswith(">"):
                    contig_names.append(line.split()[0][1:])
        print("The contigs detected as output from canu are as named as follows:")
        print(contig_names)
        print("\n")

        print("Beginning putative insertion detection process")
        potential_insertions = extracting_putative_insertions(alignment_sam, contig_names)
        print(potential_insertions)

        #Assess if contigs are in the proper orientation, and if not reverse complement the sequences
        print("Determining the likely orientation of each contig and creating one file for each")
        count_contigs = Determine_Contig_Orientation(alignment_sam, path_contig_file, out, file_prefix)
        print("\n")

        #remove contigs which do not have regions to check for insertions
        print("Removing any contig which does not have any putative insertions.")
        print(f"The original list of contigs is {count_contigs}.")
        # An entry of length 1 is treated as "nothing to analyse": later code
        # reads candidate boundaries from index 1 onward.
        contigs_to_remove = [
            key for key in count_contigs
            if len(potential_insertions[count_contigs[key]]) == 1
        ]
        for key in contigs_to_remove:
            count_contigs.pop(key)
        print(f"The list of contigs which have at least one insertion to be analyzed via age align is {count_contigs}.")

        if len(count_contigs) == 0:
            print("No Insertions/Contigs to analyze.  Terminating Program")
            Detection = False
        else:
            Detection = True
        return Detection, count_contigs, potential_insertions, reference_start_pos, chromosome
    # Script entry point, part 2: analyse the Canu contigs and, when Canu gave
    # nothing usable, fall back to a wtdbg2 assembly of the same reads.
    path_to_contig_file = f"{canu_out}/{file_name_prefix}.contigs.fasta"
    if Canu_Detection == True:
        Canu_Detection, count_of_contigs, potential_insertions, reference_start_pos, chromosome = process_local_alignment_output(outdir, file_name_prefix, path_to_contig_file, one_based_region, options.ref_file)
    # Deliberately a second "if" rather than "else": the call above may itself
    # have cleared Canu_Detection.
    if Canu_Detection == False:
        print(f"Beginning Wtdbg2 assembly of reads found in {fastq_out}.")
        print("Predicted Wtdbg2 genome length is %s bp." % (genome_length))
        # The captured stdout is not used afterwards; the call is kept as it was.
        command = "module list"
        process = subprocess.Popen([command], stdout=subprocess.PIPE, shell=True)
        stdout = process.communicate()

        wtdbg2_contigs = f"{file_name_prefix}.ctg.fa"
        wtdbg2_command = f"exec wtdbg2 -x rs -g {genome_length} -t 3 -i {file_name_prefix}.fq.gz -fo {file_name_prefix}"
        wtpoa_command = f"wtpoa-cns -t 3 -i {file_name_prefix}.ctg.lay.gz -fo {file_name_prefix}.ctg.fa"
        # Log the commands that are actually run (previously the stale
        # "module list" string was printed for both).
        print(f"wtdbg2 command is: {wtdbg2_command}")
        print(f"wtpoa-cns_command is: {wtpoa_command}")
        subprocess.run(wtdbg2_command, shell=True)
        subprocess.run(wtpoa_command, shell=True)

        #second pass
        if not os.path.exists(wtdbg2_contigs):
            print("No contig file found, re-running wtdbg2 and wtpoa!")
            subprocess.run(wtdbg2_command, shell=True)
            subprocess.run(wtpoa_command, shell=True)
        elif os.stat(wtdbg2_contigs).st_size == 0:
            print("Contig file empty, re-running wtdbg2 and wtpoa!")
            subprocess.run(wtdbg2_command, shell=True)
            subprocess.run(wtpoa_command, shell=True)
        print("\n")

        # Give up when neither pass produced contigs.  The marker file uses
        # file_name_prefix; the name "file_prefix" does not exist at this scope.
        if not os.path.exists(wtdbg2_contigs):
            print(f"After second pass, the {file_name_prefix}.ctg.fa was still not generated. Terminating")
            subprocess.run(f'echo "No contigs found in either wtdbg2 or canu." > {file_name_prefix}_Finished.txt', shell = True)
            sys.exit()
        elif os.stat(wtdbg2_contigs).st_size == 0:
            print("Wtdbg2 contig file empty after second pass. Terminating")
            subprocess.run(f'echo "No contigs found in either wtdbg2 or canu." > {file_name_prefix}_Finished.txt', shell = True)
            sys.exit()
        path_to_contig_file = wtdbg2_contigs
        Canu_Detection, count_of_contigs, potential_insertions, reference_start_pos, chromosome = process_local_alignment_output(outdir, file_name_prefix, path_to_contig_file, one_based_region, options.ref_file)
    #Insertion detection
    # Script entry point, part 3: polish every retained contig with racon, run
    # age_align on each putative insertion, and record the ones that pass.
    insertions = []
    list_of_information = []

    for contig in count_of_contigs:
        print(f"Beginning final contig analysis with {contig}.")
        oriented_contig = f"{file_name_prefix}_proper_contig_orientation{contig}.fasta"
        read_alignment = f"{file_name_prefix}_contig_alignment{contig}.sam"
        polished_contig = f"{file_name_prefix}_Racon_Corrected_Contig{contig}.fasta"
        normalized_contig = f"{file_name_prefix}_Racon_Corrected_Contig{contig}.normalized.fasta"

        #Use minimap2 to create an alignment of the reads back to the contig for racon
        command = f"minimap2 -ax map-pb {oriented_contig} {fastq_out} > {read_alignment}"
        print(f"The command which aligns the reads back to the contig for contig polishing is:\n{command}")
        subprocess.run(command, shell=True)

        #use Racon to polish the contig
        command = f"racon --no-trimming {fastq_out} {read_alignment} {oriented_contig} > {polished_contig}"
        print(f"Racon command to generate racon polished contig: {command}")
        subprocess.run(command, shell=True)
        print(f"Racon has finished running on contig {contig}")

        #normalize the fasta format contig.
        Normalize_Fasta_File(polished_contig)
        print("Fasta file normalized")

        #Detect if insertions are present, and where they're located.
        insertion_info = []
        # Index 0 of each entry is skipped; candidate boundaries start at index 1.
        for boundary in potential_insertions[count_of_contigs[contig]][1:]:
            print(f"The insertion that is being analyzed is {boundary}.")
            if (boundary[3] - boundary[2]) < -100:
                # The minimum is negative to factor in that alignment softwares may struggle with target site duplications which could lead to
                # a boundary[3] value which is smaller than boundary[2]
                print(f"boundary {boundary} has been skipped because {boundary[3]} - {boundary[2]} is less than negative 100")
                continue
            if (boundary[1] - boundary[0]) < -100:
                print(f"boundary {boundary} has been skipped because {boundary[1]} - {boundary[0]} is less than negative 100")
                continue
            age_boundary_buffers = [10000,1000]
            for buffer in age_boundary_buffers:
                # Clamp the window start so that "start - buffer" never drops below 1.
                contig_start = buffer + 1 if boundary[0] - buffer < 1 else boundary[0]
                ref_start = buffer + 1 if boundary[2] - buffer < 1 else boundary[2]
                contig_coors = f"{contig_start - buffer}-{boundary[1] + buffer}"
                ref_coors = f"{ref_start - buffer}-{boundary[3] + buffer}"
                # Report the buffer in use for this pass, not the whole list.
                print(f"The age boundaries for the {buffer} buffer {boundary} putative insertion are as follows.\nThe contig coordinates used are {contig_coors}."
                      f"\nThe reference coordinates used are {ref_coors}.")

                age_output = f"{file_name_prefix}_age_output_{contig_coors}.txt"
                command = f"age_align -coor1={contig_coors} -coor2={ref_coors} {normalized_contig} {file_name_prefix}_Extended_Faidx_100kb.fasta > {age_output}"
                print(f"The age command used is {command}.")
                subprocess.run(command, shell=True)

                seq_breakpoints, insertion_sight, boundaries = insertion_location(age_output)
                # An empty string in the first slot marks a rejected candidate.
                if seq_breakpoints == "":
                    print(f'Insertion {boundaries} failed to meet criteria')
                else:
                    insertion_info.append([seq_breakpoints,insertion_sight,boundaries,buffer])

        #once all insertions have been evaluated.  Determine if any insertions met the criteria.
        #If so, run repeat Masker to detect if the insertion contains LINE-1 sequence
        if not insertion_info:
            continue

        print("RepeatMasker command to allow for identification of LINE-1 content of the insertion:")
        command = f'RepeatMasker -species dog {normalized_contig}'
        print(command)
        subprocess.run(command, shell=True)

        #determine where in the RepeatMasker Track LINE-1 elements are
        print("Filtered RepeatMasker file to only contain repeat information related to LINE-1 elements")

        #Determine if insertion contains presence of LINE-1 elements
        print('Beginning advanced LINE1 insertion analysis with age_align')
        for candidate in insertion_info:
            print(candidate)
            between_TSDs = (int(candidate[0][1][1])-int(candidate[2][0])+1, int(candidate[0][1][0]))
            LINE1_content, transduction_boundaries, L1_orientation = Repeat_Positions(f'{normalized_contig}.out', between_TSDs)
            print(f"The length of the LINE-1 insertion is {LINE1_content}")
            print(f"The putative transduction, if any is within the boundaries {transduction_boundaries}")

            print(f"The insertion is located at extracted faidx extracted reference position {candidate[1]}.")
            # Shift positions within the extracted window by the window's start.
            ins_start = int(candidate[1][0]) + int(reference_start_pos)
            ins_end = int(candidate[1][1]) + int(reference_start_pos) -1
            positions_in_contig = candidate[0][1]
            length_in_contig = candidate[0][0]

            #Create New Line that contains the information
            information = f"{chromosome}\t{ins_start}\t{ins_end}\t{options.locus_str}\t{count_of_contigs[contig]}\t{int(positions_in_contig[0])}\t{int(positions_in_contig[1])}\t{length_in_contig}\t{int(candidate[2][0])}\n"
            print("The detected insertion is as follows:")
            print(information)

            # Two hits are duplicates when chromosome, start, end and locus all match.
            hit_key = information.split("\t")[0:4]
            duplicate = False
            for detected_insertion in list_of_information:
                if hit_key == detected_insertion.split("\t")[0:4]:
                    print("duplicate detected. Only one copy of the insertion will be presented in output")
                    duplicate = True
            if not duplicate:
                list_of_information.append(information)
                Repeat_Masker_and_Generating_MiroPeats_Images(file_name_prefix, outdir, options.locus_str, f"{file_name_prefix}_Minimap_Output_100kb_extension.sam", options.ref_file, contig, int(reference_start_pos), ins_start, ins_end, positions_in_contig)
                print("\n")
    # Script entry point, part 4: write the hit table, extract the inserted
    # sequences from the polished contigs, and remove temporary output.
    print(list_of_information)

    #this file will only be created if the script finishes.
    with open(f"{file_name_prefix}_CLAINE_hits.txt", 'wt') as outfile:
        # Every entry already ends with a newline.
        outfile.writelines(list_of_information)
        print("all insertions written to outfile")
    subprocess.run(f'echo "Script Ran To Completion" > {file_name_prefix}_Finished.txt', shell = True)

    #create fasta file containing all insertions.
    for item in os.listdir():
        # Only the polished contigs, not their ".normalized.fasta" companions.
        if not (item.startswith(f"{file_name_prefix}_Racon") and item.endswith("fasta") and not item.endswith("normalized.fasta")):
            continue
        print(item)
        with open(f"{item}", 'rt') as racon_file:
            header_line = racon_file.readline()
        header_fields = header_line.split()
        # An empty or header-less file has no sequence name to match; skip it
        # instead of failing with an IndexError.
        if not header_fields:
            print(f"{item} has no fasta header, skipping sequence extraction")
            continue
        assembled_name = header_fields[0][1:]

        for info in list_of_information:
            info = info.split("\t")
            # Column 4 is the contig name, columns 5 and 6 the insertion start
            # and end within that contig.
            if info[4] == assembled_name:
                print("Found a match")
                command = f"samtools faidx {item}"
                print(command)
                subprocess.run(command, shell = True)
                command = f"samtools faidx {item} {assembled_name}:{info[5]}-{info[6]} >> {file_name_prefix}_Extracted_Sequence.fasta"
                print(command)
                subprocess.run(command, shell = True)

    #cleanup
    subprocess.run(f'rm -rf {file_name_prefix}_Miro', shell = True)
    subprocess.run('rm -rf *RMoutput', shell = True)
