#!/usr/bin/env python3
# Custom Python script that writes the genomic positions of genome-specific SVs/SNPs to a BED file and extracts the corresponding sequences into a FASTA file.

# Import modules
import pandas as pd
from itertools import chain
from pyfasta import Fasta

# Load the list of novel SNP/SV IDs (first column, no header row) that was
# manually saved from check_subset_detail().
v2_novel_snps = pd.read_csv("./v2.novel.snps.txt", header=None)[0].tolist()

# NOTE(review): `sv2_data` is not defined in the part of the script shown here;
# presumably it is a DataFrame with 'ID' and 'query_coordinates' columns that
# was created earlier in the session -- confirm before running standalone.
#
# Build the ID -> coordinate lookup once instead of re-scanning the whole table
# for every ID. drop_duplicates() keeps the first row per ID, which matches the
# previous "first matching row wins" behaviour.
coords_by_id = (
    sv2_data.drop_duplicates(subset='ID')
    .set_index('ID')['query_coordinates']
    .to_dict()
)

# Write a BED file with the novel SV/SNP positions (for visual inspection in IGV)
with open('./v2.novel.snps.bed', 'w') as f:
    for each_snps in v2_novel_snps:
        if each_snps not in coords_by_id:
            # Fail with the offending ID instead of an anonymous IndexError.
            raise KeyError(
                "ID {!r} from v2.novel.snps.txt was not found in sv2_data['ID']".format(each_snps)
            )

        # The coordinate string is parsed as "<chrom>:<start>-<end>:<strand>".
        coord = coords_by_id[each_snps].split(':')
        chrom = coord[0]
        start, end = coord[1].split('-')
        strand = coord[2]

        # BED intervals must satisfy start <= end; the orientation is still
        # carried by the strand column, so reversed coordinates are reordered.
        if int(start) > int(end):
            start, end = end, start

        f.write('\t'.join([chrom, start, end, each_snps, '.', strand]) + '\n')

# Open the assemblies with pyfasta so that sequences can be sliced by record name.
cb4856_v2_scaffold = Fasta("cb4856.qv2.bac.pilon2.scaffold.fasta")
# NOTE(review): this second handle is not used in the code shown here.
cb4856_v2_fasta = Fasta("cb4856.qv2.bac.pilon2.scaffold.gap.mt.fasta")

# Read back the BED file written earlier in this script (tab-separated, no header).
v2_novel_snps_bed = pd.read_csv("v2.novel.snps.bed", sep='\t', header=None)

# Slice every BED interval out of the scaffold assembly and emit it as a FASTA record.
with open("cb4856_contig_scaffold_novel_sv_v2.fasta", 'w') as fasta_out:
    bed_columns = zip(
        v2_novel_snps_bed[0],
        v2_novel_snps_bed[1],
        v2_novel_snps_bed[2],
    )

    for raw_name, pos_a, pos_b in bed_columns:
        # Only the part of the name before the first '|' is used as the lookup key.
        contig = raw_name.split('|')[0]

        # Order the two coordinates so the slice always runs from low to high.
        start, end = (pos_b, pos_a) if pos_a > pos_b else (pos_a, pos_b)

        seq = cb4856_v2_scaffold[contig][start:end]

        # NOTE(review): the header is the contig name immediately followed by
        # the start coordinate, with no separator between the two.
        fasta_out.write(">" + contig + str(start) + '\n')
        fasta_out.write(seq + '\n')