#Pull full transcript sequence (not spliced sequence from SQANTI)
#will need to read in full bed file with TSSs, classification file and weird isoforms (to remove these)
#to run script: python3 Pull.unspliced.nucleotide.sequences.py <full bed file> <isoforms to remove because of issues in bed file> <ensembl fasta file> <class file>

import sys

#read bed file with all transcript start and stop positions
#returns dictionary with key == isoform and value == [isoform, chr_num, start_pos, end_pos, strand]
def read_bed():
    """Parse the BED file named by ``sys.argv[1]`` into a per-isoform lookup.

    Each data line is split on whitespace.  Column 1 is taken as the
    chromosome, columns 2 and 3 as the start/end positions, column 4 as the
    isoform name and column 6 as the strand.  Positions are kept as the
    strings read from the file (callers convert them to int).

    Blank lines and ``#`` / ``track`` / ``browser`` header lines are skipped
    instead of crashing the parse.  If an isoform name occurs more than once
    the last record wins.

    Returns:
        dict mapping isoform -> [isoform, chr_num, start_pos, end_pos, strand]

    Raises:
        IndexError: if a data line has fewer than six columns.
    """
    bed_file = sys.argv[1]
    bed_dict = {}
    with open(bed_file, 'r') as bed_info:
        for line in bed_info:
            fields = line.split()
            # An empty split means a blank line; the other cases are BED
            # header/comment lines that carry no interval record.
            if not fields:
                continue
            if fields[0].startswith("#") or fields[0] in ("track", "browser"):
                continue
            chr_num = fields[0]
            start_pos = fields[1]
            end_pos = fields[2]
            isoform = fields[3]
            strand = fields[5]
            bed_dict[isoform] = [isoform, chr_num, start_pos, end_pos, strand]
    return bed_dict

#weird isoforms to be handled manually
#returns list of isoforms to remove from other files
def pull_weird_isoforms():
    """Read the list of isoforms to exclude from the file in ``sys.argv[2]``.

    The file is expected to hold one isoform name per line.  Surrounding
    whitespace is stripped so the names compare equal to the
    whitespace-split names produced by ``read_bed``, and blank lines are
    ignored rather than yielding empty-string entries.

    Returns:
        list of isoform names, in file order.
    """
    isoforms_file = sys.argv[2]
    with open(isoforms_file, 'r') as isoforms:
        stripped_lines = (line.strip() for line in isoforms)
        return [isoform for isoform in stripped_lines if isoform]

def read_ensembl_fasta():
    """Load the FASTA file named by ``sys.argv[3]`` into memory.

    The record id is the first space-delimited token of a ``>`` header line
    with the ``>`` and newline removed.  All sequence lines that follow a
    header are concatenated; records sharing an id are concatenated in file
    order.  A header with no following lines produces no entry.

    Blank lines before the first header are ignored.

    Returns:
        dict mapping record id -> list of single-character strings (one
        element per sequence character).

    Raises:
        ValueError: if sequence data appears before any ``>`` header.
    """
    fasta_file = sys.argv[3]
    chunks_by_id = {}
    current_id = None
    with open(fasta_file, 'r') as fasta:
        for line in fasta:
            if line.startswith(">"):
                first_token = line.split(" ")[0]
                current_id = first_token.strip(" ").strip(">").strip("\n")
                continue
            chunk = line.strip("\n")
            if current_id is None:
                # No header seen yet: tolerate leading blank lines, but
                # refuse real sequence data that cannot be attributed.
                if chunk.strip():
                    raise ValueError(
                        "sequence data found before the first '>' header in "
                        + fasta_file
                    )
                continue
            chunks_by_id.setdefault(current_id, []).append(chunk)
    # NOTE(review): the list-of-characters shape is kept so existing callers
    # see the same return type; a plain str would slice identically and use
    # far less memory for chromosome-sized records -- consider switching.
    return {
        seq_id: list("".join(chunks))
        for seq_id, chunks in chunks_by_id.items()
    }

#read classification file
#returns dictionary with key == isoform and value == [chr_num, strand]
#will need to change column numbers depending on the file
def read_class():
    """Collect chromosome and strand per isoform from the classification file.

    Reads the tab-separated file named by ``sys.argv[4]``.  Only lines that
    start with "PB" are used; all other lines are ignored.  The first
    column is the isoform name, the third column the chromosome and the
    fourth column the strand.  Fields are stored exactly as split, without
    any further stripping.

    Returns:
        dict mapping isoform -> [chr_num, strand]; a repeated isoform keeps
        the values from its last line.

    Raises:
        IndexError: if a "PB" line has fewer than four tab-separated columns.
    """
    isoform_info = {}
    with open(sys.argv[4], 'r') as handle:
        for record in handle:
            if not record.startswith("PB"):
                continue
            columns = record.split("\t")
            isoform_info[columns[0]] = [columns[2], columns[3]]
    return isoform_info

def filtered_bed():
    """Return the BED lookup with the excluded isoforms removed.

    Loads the exclusion list via ``pull_weird_isoforms`` and the BED records
    via ``read_bed``, then drops every listed isoform that is present.
    Listed isoforms that are absent from the BED data are ignored.

    Returns:
        dict in the same shape as returned by ``read_bed``.
    """
    excluded = pull_weird_isoforms()
    remaining = read_bed()
    for name in excluded:
        # pop with a default is a no-op when the isoform is not in the dict
        remaining.pop(name, None)
    return remaining


#pull sequences for each isoforms
#returns dictionary with key == isoform and value == [strand, [sequence]]
def pull_seq():
    """Slice each classified isoform's start..end span out of its chromosome.

    Only isoforms present in both the classification data and the filtered
    BED data are processed.  The chromosome name and strand come from the
    classification entry; the start and end come from the BED entry and are
    used directly as Python slice bounds (start inclusive, end exclusive).

    Returns:
        dict mapping isoform -> [strand, sequence], where sequence is the
        slice of the chromosome record returned by ``read_ensembl_fasta``.

    Raises:
        ValueError: if a BED start or end is not an integer string.
        KeyError: if the classification chromosome is not in the FASTA data.
    """
    bed_by_isoform = filtered_bed()
    class_by_isoform = read_class()
    chromosomes = read_ensembl_fasta()
    extracted = {}
    for name, class_entry in class_by_isoform.items():
        if name not in bed_by_isoform:
            continue
        bed_entry = bed_by_isoform[name]
        chrom, strand = class_entry[0], class_entry[1]
        begin, stop = int(bed_entry[2]), int(bed_entry[3])
        extracted[name] = [strand, chromosomes[chrom][begin:stop]]
    return extracted

#converting sequences based on strand:
def convert_seqs(isoform="PB.1833.1"):
    """Print the complement and then the reverse complement of one isoform.

    The sequence is taken from ``pull_seq()``.  Every character is mapped
    through a complement table covering the IUPAC nucleotide codes in both
    upper and lower case; characters outside the table are passed through
    unchanged.

    Args:
        isoform: name of the isoform to convert.  Defaults to "PB.1833.1",
            the isoform this function was previously hard-wired to.

    Raises:
        KeyError: if ``isoform`` is not among the sequences from ``pull_seq``.
    """
    # NOTE(review): the strand stored next to the sequence is not consulted;
    # presumably only '-' strand isoforms should be reverse-complemented --
    # confirm the intended behaviour before using this for all isoforms.
    upper_codes = "ACGTRYSWKMBDHVN"
    upper_partners = "TGCAYRSWMKVHDBN"
    complement = dict(zip(upper_codes + upper_codes.lower(),
                          upper_partners + upper_partners.lower()))
    all_seqs = pull_seq()
    single_seq = all_seqs[isoform][1]
    # Look up each character independently so an unmapped character can
    # never inherit the complement of the previous one.
    new_seq = [complement.get(nt, nt) for nt in single_seq]
    print(new_seq)
    new_seq.reverse()
    print(new_seq)



if __name__ == "__main__":
    # The reader functions index sys.argv[1] through sys.argv[4]; fail early
    # with a usage message instead of an IndexError deep inside a reader.
    if len(sys.argv) < 5:
        sys.exit(
            "usage: python3 Pull.unspliced.nucleotide.sequences.py "
            "<full bed file> <isoforms to remove> <ensembl fasta file> "
            "<class file>"
        )
    convert_seqs()
