#!/usr/bin/env python3
# a custom python script for reformatting the Assemblytics result

# Import modules
from pyfasta import Fasta
from Bio.Seq import Seq

# Input: the Assemblytics result (bed) that is going to be reformatted
bed_path = "2.0.bed"

# Output: the reformatted variant table
vcf_path = "2.0.my.vcf"

# Assemblies the sequences are taken from: reference first, then query
ref_fasta_path, query_fasta_path = (
    "Caenorhabditis_elegans.WBcel235.dna.toplevel.fa",
    "cb4856.qv2.bac.pilon2.scaffold.fasta",
)

# Open both fasta files so records can be looked up by name and sliced
ref_fasta, query_fasta = Fasta(ref_fasta_path), Fasta(query_fasta_path)

# Convert bed to vcf
#
# Every data row of the bed file becomes one tab-separated output line with
# eight columns: CHROM, POS, ID, REF, ALT, QUAL, FILTER, INFO.
# REF and ALT are the sequences sliced out of the reference and query fasta
# files; a zero-length region is written as '.'.
# QUAL and INFO are always '.', FILTER is always "PASS".

# ref_fasta is looked up with the full fasta header string, so the short
# chromosome names used in the bed file are expanded to the WBcel235 headers.
# Names that are not listed here are used unchanged.
ref_fasta_headers = {
    'I': 'I dna:chromosome chromosome:WBcel235:I:1:15072434:1 REF',
    'II': 'II dna:chromosome chromosome:WBcel235:II:1:15279421:1 REF',
    'III': 'III dna:chromosome chromosome:WBcel235:III:1:13783801:1 REF',
    'IV': 'IV dna:chromosome chromosome:WBcel235:IV:1:17493829:1 REF',
    'V': 'V dna:chromosome chromosome:WBcel235:V:1:20924180:1 REF',
    'X': 'X dna:chromosome chromosome:WBcel235:X:1:17718942:1 REF',
    'MtDNA': 'MtDNA dna:chromosome chromosome:WBcel235:MtDNA:1:13794:1 REF',
}


def _region_seq(fasta, name, start, stop, strand):
    """Return the sequence of fasta[name][start:stop], oriented by strand.

    start and stop are the coordinate strings read from the bed file; they are
    converted to int once and used directly as Python slice bounds.
    A zero-length region (start == stop) yields the placeholder '.'.
    When strand is '-', the sliced sequence is reverse-complemented.
    Raises ValueError if a coordinate is not an integer; an unknown record
    name raises whatever the fasta lookup raises.
    """
    start, stop = int(start), int(stop)
    # Look the record up first so an unknown name still fails loudly, even
    # when the region turns out to be empty.
    record = fasta[name]
    if start == stop:
        # The placeholder is not sequence data, so it is never complemented.
        return '.'
    seq = record[start:stop]
    if strand == '-':
        seq = str(Seq(seq).reverse_complement())
    return seq


# Read_bed
with open(bed_path, 'r') as bed_lines, open(vcf_path, 'w') as vcf:
    for each_line in bed_lines:
        fields = each_line.split()
        # Skip blank lines and '#'-prefixed header/comment lines, so the
        # header no longer has to be removed by hand before running.
        if not fields or fields[0].startswith('#'):
            continue

        # Exactly 11 columns are expected; the unpacking raises ValueError on
        # any other count. Underscore-prefixed fields are not used below.
        (chrom, ref_start, ref_stop, ID, _size, strand, _type,
         _ref_gap_size, _query_gap_size, query_coordinates, _method) = fields

        # query_coordinates looks like "<contig>:<start>-<stop>:<strand>".
        # Split from the right so a ':' inside the contig name is kept.
        query_tig, query_region, query_strand = query_coordinates.rsplit(':', 2)
        query_region_forward, query_region_reverse = query_region.split('-')

        # Naive way to replace name
        chrom = ref_fasta_headers.get(chrom, chrom)
        # Chrom_num for vcf format: first word of the (expanded) name
        chrom_num = chrom.split()[0]

        ref_region_seq = _region_seq(ref_fasta, chrom, ref_start, ref_stop, strand)
        query_region_seq = _region_seq(
            query_fasta, query_tig, query_region_forward, query_region_reverse, query_strand
        )

        # NOTE(review): POS is the bed ref_start written unchanged, and the
        # same value is used above as a 0-based slice start. VCF POS is
        # 1-based - confirm whether downstream tools expect ref_start + 1.
        vcf.write('\t'.join([chrom_num, ref_start, ID, ref_region_seq, query_region_seq, '.', "PASS", '.']) + "\n")
    
