#script to pull specific region of sequence from fasta file
#mainly to check another script
#reads in the full fasta file, splits it by chromosome, then takes the region to pull from the command-line arguments (chr number, strand, start position, end position)
#to run script: PullSeq_Check.py <chromosome fasta file> <specify chr num> <specify strand> <specify start pos> <specify end pos>

import sys

def read_ensembl_fasta(fasta_file=None):
    """Read a FASTA file into a dict mapping record ID to its full sequence.

    The record ID is the first whitespace-delimited token after ">" on a
    header line (e.g. ">1 dna:chromosome ..." gives "1"). Sequence lines that
    follow a header are concatenated in file order; records that share an ID
    are concatenated together as well. Blank lines are ignored, and a header
    that is not followed by any sequence line produces no entry.

    Args:
        fasta_file: Path to the FASTA file. When omitted, sys.argv[1] is used
            so the script's command-line usage is unchanged.

    Returns:
        dict mapping record ID (str) to the record's sequence as a single str,
        so that slices of it print as plain sequence text.

    Raises:
        ValueError: if sequence data appears before the first header line.
    """
    if fasta_file is None:
        fasta_file = sys.argv[1]

    chunks_by_id = {}
    current_id = None
    with open(fasta_file, 'r') as fasta:
        for line in fasta:
            if line.startswith(">"):
                # split() with no argument also handles tab-delimited
                # descriptions and a space between ">" and the ID.
                fields = line[1:].split()
                current_id = fields[0] if fields else ""
                continue

            # Strip surrounding whitespace so stray spaces cannot end up
            # inside the sequence and shift coordinates.
            seq_line = line.strip()
            if not seq_line:
                continue
            if current_id is None:
                raise ValueError(
                    "sequence data found before the first '>' header in "
                    + str(fasta_file)
                )
            chunks_by_id.setdefault(current_id, []).append(seq_line)

    # Join once per record: builds a real string rather than a list of
    # single characters.
    return {seq_id: "".join(chunks) for seq_id, chunks in chunks_by_id.items()}

#pull specific sequence
def pull_seq():
    """Print one region of one chromosome from the FASTA file.

    Reads its inputs from the command line: sys.argv[2] is the chromosome ID,
    sys.argv[3] the strand ("+" or "-"), sys.argv[4] the start position and
    sys.argv[5] the end position. The FASTA path (sys.argv[1]) is consumed by
    read_ensembl_fasta().

    The positions are used directly as Python slice bounds (0-based, end
    exclusive). For "+" the slice is [start:end]; for "-" it is [end:start],
    so the "-" case only yields sequence when start is greater than end.
    NOTE(review): the "-" strand output is not reverse-complemented; confirm
    this matches what the script being checked produces.

    Raises:
        ValueError: if the strand is neither "+" nor "-", or a position is not
            an integer.
        KeyError: if the chromosome ID is not present in the FASTA file.
    """
    # Parse and validate the cheap arguments before loading the whole file.
    chr_num = sys.argv[2]
    strand = sys.argv[3]
    start_pos = int(sys.argv[4])
    end_pos = int(sys.argv[5])
    if strand not in ("+", "-"):
        # Previously an unknown strand printed nothing and exited cleanly.
        raise ValueError("strand must be '+' or '-', got %r" % (strand,))

    chromosome_seqs = read_ensembl_fasta()
    if chr_num not in chromosome_seqs:
        raise KeyError(
            "chromosome %r not found in fasta file; available: %s"
            % (chr_num, ", ".join(sorted(chromosome_seqs)))
        )
    single_chr = chromosome_seqs[chr_num]

    if strand == "+":
        region = single_chr[start_pos:end_pos]
    else:
        region = single_chr[end_pos:start_pos]
    # join() prints plain sequence text whether the chromosome is held as a
    # str or as a list of single characters.
    print("".join(region))


# Only run when executed as a script, so importing this module does not
# read sys.argv or open any file.
if __name__ == "__main__":
    pull_seq()
