## NOTE: this script is obsolete -- prefer `bedtools flank` (or `bedtools slop`, which extends intervals in place as the code below does).

# %%
from pathlib import Path
# Repository root: two directory levels above this script's resolved location.
_SCRIPT_PATH = Path(__file__).resolve()
REPO_DIR = _SCRIPT_PATH.parent.parent
# Directory holding the input tables and the generated BED files.
SAMPLE_DIR = REPO_DIR / "prototype/"

import pandas as pd
# Names assigned positionally to the 16 whitespace-separated fields of the
# repeat annotation table read below.
col_names = ['SW_score', 'perc_div', 'perc_del', 'perc_ins',
             'chr', 'chr_start', 'chr_end', 'chr_left', 'complement',
             'repeat', 'repeat_family', 'repeat_begin', 'repeat_end', 'repeat_left',
             'id', 'landscape']
# Load the astcal table.
# - sep=r'\s+' splits on runs of whitespace; it replaces the deprecated
#   `delim_whitespace=True` keyword and parses identically.
# - skiprows=3 drops the first three lines of the file (presumably the
#   table header -- verify against the input).
# - index_col=False stops pandas from using the first column as the index.
df = pd.read_table(
    SAMPLE_DIR.joinpath('greg_repeatmodeller/astcal_repeatmodeller.out.gz'),
    names=col_names, sep=r'\s+',
    skiprows=3, index_col=False,
)

# %%
import linecache
ASSEMBLY_REPORT = REPO_DIR.joinpath('genome/astCal1.2/assembly_report.txt')

# Line 29 of the report is used as the column header: drop its first two
# characters (presumably a leading "# " comment marker -- verify), strip the
# newline and split the rest on tabs.
raw_header = linecache.getline(str(ASSEMBLY_REPORT), 29)
headerline = raw_header[2:].strip('\n').split('\t')
print(headerline)

# Read everything after the first 29 lines (header line included in the skip)
# and label the columns with the names parsed above.
mapping = pd.read_table(
    ASSEMBLY_REPORT, skiprows=29, header=None
).set_axis(headerline, axis=1)
mapping

# %%
# Build a lookup from the report's 'GenBank-Accn' column to its 'RefSeq-Accn'
# column and use it to rename the sequences in the repeat table.
# Names missing from the lookup become NaN after `.map`.
refseq_by_genbank = mapping.set_index('GenBank-Accn')['RefSeq-Accn']
mapping_dict = refseq_by_genbank.to_dict()

# Orientation symbols: '+' is kept, 'C' (complement) becomes '-'.
strand_symbols = {'+': '+', 'C': '-'}
df['complement'] = df['complement'].map(strand_symbols)
df['chr'] = df['chr'].map(mapping_dict)

# %%
# create bed file of all transposons
bed_cols = ['chr', 'chr_start', 'chr_end', 'repeat_family', 'perc_div', 'complement']

# BED intervals are 0-based with an exclusive end, whereas RepeatMasker-style
# .out tables report 1-based inclusive positions. Only the start needs to be
# shifted by one; the end value is numerically the same in both conventions.
# A copy is modified so that `df` keeps the original coordinates for the
# cells that follow.
transposons_bed = df[bed_cols].copy()
transposons_bed['chr_start'] = transposons_bed['chr_start'] - 1

# Tab-separated, no header and no index column.
transposons_bed.to_csv(
    SAMPLE_DIR.joinpath("astcal_transposons.bed"),
    sep='\t', index=False, header=False
)

# %%
# create BED file with extended sequences on chr 1
chr1_id = 'NC_039302.1'
chr1_size = 41162407

# Number of bases added on each side of every interval.
i = 25
df_BED = df[bed_cols].copy()
# The input start is 1-based inclusive while BED starts are 0-based, so
# subtract one before extending; otherwise the left flank would be one base
# shorter than the right flank.
df_BED['chr_start'] = df_BED['chr_start'] - 1 - i
df_BED['chr_end'] = df_BED['chr_end'] + i

# Keep only chr 1 intervals that still lie within [0, chr1_size] after the
# extension; intervals reaching past either end are dropped, not clipped.
# (`chr_start >= 0` on the 0-based start selects the same rows as the former
# `chr_start > 0` test on the 1-based start.)
df_BED.query(
    'chr == @chr1_id & chr_start >= 0 & chr_end <= @chr1_size'
    ).to_csv(
        SAMPLE_DIR.joinpath("astcal_repeatmodeller_extend.bed"),
        sep='\t', index=False, header=False
    )

# %%
# repeat for zebra
# Same table layout as the astcal input: 16 whitespace-separated fields named
# by `col_names`, with the first three lines of the file skipped.
# sep=r'\s+' replaces the deprecated `delim_whitespace=True` keyword and
# parses identically.
df = pd.read_table(
    SAMPLE_DIR.joinpath('greg_repeatmodeller/mayZeb_repeatmodeller.out.gz'),
    names=col_names, sep=r'\s+',
    skiprows=3, index_col=False,
)

chr1_id = 'NC_036780.1'
chr1_size = 38676823

bed_cols = ['chr', 'chr_start', 'chr_end', 'repeat_family', 'perc_div', 'complement']

# Number of bases added on each side of every interval.
i = 25
df_BED = df[bed_cols].copy()

# Convert the orientation symbols to BED strands ('C' -> '-'), the same
# conversion the astcal table receives earlier in this file; without it the
# strand column would contain 'C'.
df_BED['complement'] = df_BED['complement'].map({'+': '+', 'C': '-'})

# NOTE(review): unlike the astcal table, `chr` is not translated through an
# assembly report here, so the filter below only matches if the input already
# names its sequences like `chr1_id` -- confirm against the input file.

# The input start is 1-based inclusive while BED starts are 0-based, so
# subtract one before extending; otherwise the left flank would be one base
# shorter than the right flank.
df_BED['chr_start'] = df_BED['chr_start'] - 1 - i
df_BED['chr_end'] = df_BED['chr_end'] + i

# Keep only chr 1 intervals that still lie within [0, chr1_size] after the
# extension; intervals reaching past either end are dropped, not clipped.
df_BED.query(
    'chr == @chr1_id & chr_start >= 0 & chr_end <= @chr1_size'
    ).to_csv(
        SAMPLE_DIR.joinpath("mayZeb_repeatmodeller_extend.bed"),
        sep='\t', index=False, header=False
    )

