#!/usr/bin/python

# import argparse # For the fancy options
from Bio import SeqIO # to manipulate fasta sequences/files
from Bio.SeqRecord import SeqRecord # to manipulate fasta sequences/files
import sys # To exit the script 
import os # For the input name
import numpy 
# import matplotlib.pyplot as plt


# example (arguments: contigs FASTA, annotation table, sample, name, minimum length):
# python extract_seq.py assembly_sample.fasta annotation.txt EBCXXX wtf 500
# ------------------------------------------------------
# Inputs (positional command-line arguments):
#   argv[1]  FASTA file with the contigs
#   argv[2]  tab-separated annotation table, one record per line
# The FASTA handle is left open here because it is consumed later by SeqIO.parse.
fasta_contigs=open(sys.argv[1],"r")

# The annotation table is read completely into memory, so its handle can be
# closed as soon as the lines have been collected.
with open(sys.argv[2],"r") as table_annotation:
	table_annotation_line=table_annotation.readlines()

## table_annotation_included=open(sys.argv[3],"r")
## table_annotation_included_list=table_annotation_included.readlines()

# table_know_coor=open(sys.argv[3],"r")
# table_know_coor_list=table_know_coor.readlines()

# Remaining positional arguments:
#   sample  (argv[3]) and name (argv[4]) are only used to build the output file name
#   min_len (argv[5]) is kept as a string: it is concatenated into the output
#           file name and converted with int() for the length filter
sample, name, min_len = sys.argv[3], sys.argv[4], sys.argv[5]

## table_summary = open(sample+'_'+name+'_minLen'+min_len+'_table.txt', 'wb')


## annotation_included = []
## for record in table_annotation_included_list:
	## annotation_included.append(record.split("\n")[0])
## #print(annotation_included)

# Load every contig into memory as {FASTA record id: sequence}.
# If the same id occurs more than once, the last record wins.
try:
	sequences_dic = {
		seq_record.id: seq_record.seq
		for seq_record in SeqIO.parse(fasta_contigs, "fasta")
	}
finally:
	# The FASTA handle is not needed once all records have been read
	fasta_contigs.close()


# Extract every annotated region longer than min_len from its contig and put
# all extracted sequences in the same orientation.
# Columns read from each tab-separated annotation line (0-based index):
#   0 = contig id, 1 = start, 2 = end, 3 = id of the output record,
#   4 = ins_dir, 12 = strand_dir
length_threshold = int(min_len) # converted once instead of once per line
final_seqs = []
for annotation_line in table_annotation_line:
	# Drop the line terminator: otherwise the last column would read e.g. "+\n"
	# and never compare equal to "+" / "-"
	annotation_line = annotation_line.rstrip("\r\n")
	if not annotation_line.strip():
		# Blank lines (e.g. at the end of the table) carry no record
		continue
	fields = annotation_line.split("\t")
	seq_chr = fields[0]
	# NOTE(review): presumably the table start is 1-based inclusive, hence the -1
	# to get a Python slice start -- confirm against the annotation format
	seq_start = int(fields[1])-1
	seq_end = int(fields[2])
	ID_seq = fields[3]
	ins_dir = fields[4]
	strand_dir = fields[12]
	if seq_end-seq_start <= length_threshold:
		# Only regions strictly longer than min_len are kept
		continue
	final_seq = sequences_dic[seq_chr][seq_start:seq_end]
	# Reverse-complement only when the two direction columns disagree
	if (strand_dir, ins_dir) in (("+", "-"), ("-", "+")):
		final_seq = final_seq.reverse_complement()
	# Empty name and description keep the FASTA header down to the bare id
	final_seqs.append(SeqRecord(final_seq, id=ID_seq, name='', description=''))


# Write all kept sequences, in FASTA format, to
# <sample>_<name>_minLen<min_len>_allSeqSamedir.fasta
output_fasta = sample + '_' + name + '_minLen' + min_len + '_allSeqSamedir.fasta'
SeqIO.write(final_seqs, output_fasta, "fasta")

## table_summary.close()

