# %%
import pandas as pd
import pybedtools
import re
import os

# Reference genome FASTA that the flanking sequences are extracted from.
# expanduser() is used instead of os.getenv('HOME') + "...": getenv returns
# None when $HOME is unset, which made the concatenation raise TypeError.
astCal_fasta = os.path.expanduser(
    "~/code/malawi_transposon/storage/cloud/genome/astCal1.2_ensembl/astCal_v1.2.fa"
)

# Number of reference bases taken on each side of a bubble.
flank_len = 100

# Complement of every IUPAC nucleotide code, in upper and lower case, so that
# soft-masked (lowercase) and ambiguous (e.g. N) bases are handled too.
_IUPAC_CODES = "ACGTRYSWKMBDHVN"
_IUPAC_COMPLEMENTS = "TGCAYRSWMKVHDBN"
_COMPLEMENT = dict(zip(
    _IUPAC_CODES + _IUPAC_CODES.lower(),
    _IUPAC_COMPLEMENTS + _IUPAC_COMPLEMENTS.lower(),
))


# define function to get the reverse complement of a sequence
def reverse_complement(dna_sequence):
    """Return the reverse complement of ``dna_sequence``.

    Args:
        dna_sequence: Nucleotide string. IUPAC codes (A, C, G, T, R, Y, S, W,
            K, M, B, D, H, V, N) are accepted in either case; the case of
            each base is preserved in the output.

    Returns:
        The complemented sequence, read in the opposite direction. An empty
        input gives an empty string.

    Raises:
        KeyError: If the sequence contains a character that is not an IUPAC
            nucleotide code.
    """
    return ''.join(_COMPLEMENT[base] for base in reversed(dna_sequence))

# %%
# read files
# bubbles of interest (the 'coord' and 'bubble_id' columns are used below)
df_bubble = pd.read_csv("bubbles_to_validate.csv")

# alleles of every bubble, indexed by the "bubble_id" column
df_lookup = pd.read_table("alleles_by_bubble.txt").set_index("bubble_id")

# sequences of graph segments (two columns, no header row in the file)
# dtype=str + keep_default_na=False keep every field as literal text; with the
# pandas defaults a segment whose sequence is e.g. "NA" would be parsed as a
# missing value (float NaN) and break the string concatenation further down
df_segments = pd.read_table(
    "segments.txt",
    names=['segment', 'sequence'],
    dtype=str,
    keep_default_na=False,
)

# extract segment number (kept as a string) and set it as the index
# raw string: '\d' in a normal string literal is an invalid escape sequence
df_segments['segment'] = [re.match(r's(\d+)', x).group(1) for x in df_segments['segment']]
df_segments = df_segments.set_index('segment')

# %%
def _fetch_reference_sequence(chrom, start, end):
    """Return the sequence of ``chrom`` between ``start`` and ``end`` from ``astCal_fasta``.

    Args:
        chrom: Sequence name as it appears in the reference FASTA.
        start: Interval start; negative values are clamped to 0 because a
            negative start is not a valid BED coordinate (this happens when a
            bubble lies closer than the flank length to the sequence start).
        end: Interval end.

    Returns:
        The extracted sequence, or an empty string when the (clamped)
        interval has zero length.
    """
    start = max(start, 0)
    if end <= start:
        return ""
    interval = pybedtools.BedTool(f"{chrom} {start} {end}", from_string=True)
    # seqfn is the path of the FASTA written by .sequence(); its first line is
    # the record header and its second line is taken as the sequence
    with open(interval.sequence(fi=astCal_fasta).seqfn) as handle:
        return handle.read().split('\n')[1]


fasta_sequences = []
fasta_names = []

# patterns are loop-invariant, so compile them once
coord_pattern = re.compile(r'(?P<chr>\w+):(?P<start>\d+)-(?P<end>\d+)')
segment_pattern = re.compile(r'(\d+)([+-])')

for bubble_id, bubble_coord_raw in zip(df_bubble['bubble_id'], df_bubble['coord']):
    # get bubble coordinates
    bubble_coord = coord_pattern.match(bubble_coord_raw).groupdict()
    bubble_start = int(bubble_coord['start'])
    bubble_end = int(bubble_coord['end'])

    # get flank sequences on both sides of the bubble
    flank_start = _fetch_reference_sequence(bubble_coord['chr'], bubble_start - flank_len, bubble_start)
    flank_end = _fetch_reference_sequence(bubble_coord['chr'], bubble_end, bubble_end + flank_len)

    # get bubble sequences, one per distinct allele
    # NOTE(review): df_lookup is indexed by its "bubble_id" column but is
    # queried with the coordinate string here, not with bubble_id - confirm
    # that the index values of alleles_by_bubble.txt really are coordinates.
    unique_alleles = list(df_lookup.loc[bubble_coord_raw].unique())
    bubble_sequences = []
    for allele in unique_alleles:
        # a '*' allele contributes no bases between the two flanks
        if allele == '*':
            bubble_sequences.append("")
            continue

        # an allele is a comma-separated walk of oriented segments, e.g. "12+,13-"
        oriented_sequences = []
        for segment in allele.split(','):
            segment_id, segment_direction = segment_pattern.match(segment).groups()
            segment_sequence = df_segments.loc[segment_id, 'sequence']
            # '-' means the segment is traversed on the opposite strand
            if segment_direction == '-':
                segment_sequence = reverse_complement(segment_sequence)
            oriented_sequences.append(segment_sequence)
        bubble_sequences.append(''.join(oriented_sequences))

    # combine with flank (extend in place instead of rebuilding the list each time)
    fasta_sequences.extend(''.join([flank_start, x, flank_end]) for x in bubble_sequences)

    # create header names
    fasta_names.extend(f">{bubble_id}|{bubble_coord_raw}|{x}" for x in unique_alleles)

# %%
# write into FASTA file
output_path = "./local/paths_to_validate.fa"
# open() does not create missing directories, so make sure ./local exists
os.makedirs(os.path.dirname(output_path), exist_ok=True)
with open(output_path, 'w') as o:
    # fasta_names and fasta_sequences are filled in lockstep, one entry per allele
    for name, sequence in zip(fasta_names, fasta_sequences):
        o.write(name + '\n')
        o.write(sequence + '\n')

# %%
