"""
Creates alias mapping files that translate between different chromosome-naming conventions.

The `species` variable selects the genome to process; the cells below set it
to handle the mayZeb and astCal genomes in turn.
"""

# %%
# obtain file directories
import pandas as pd
from pathlib import Path

# Project root is two levels above this script; the genome input and metadata
# output directories hang directly off it.
BASE_DIR = Path(__file__).resolve().parent.parent
GENOME_DIR = BASE_DIR / 'genome/'
OUTPUT_DIR = BASE_DIR / 'metadata/'

# Fail fast if the expected directory layout is missing.
assert all(path.is_dir() for path in (BASE_DIR, GENOME_DIR, OUTPUT_DIR))

# %%
def read_assembly_report(path):
    """Load the naming columns of an assembly report into a DataFrame.

    The file is parsed as tab-separated text without a header row; anything
    from a '#' character to the end of a line is treated as a comment.
    Only the fields at positions 0, 2, 4 and 6 are kept and are labelled
    `chr`, `ensembl`, `genbank` and `ncbi` respectively.
    NOTE(review): presumably these are the Sequence-Name, Assigned-Molecule,
    GenBank-Accn and RefSeq-Accn fields of an NCBI assembly report -- confirm.

    Parameters
    ----------
    path : str, path-like or file-like
        Location of the report (anything `pandas.read_table` accepts).

    Returns
    -------
    pandas.DataFrame
        One row per sequence with string-valued columns
        `chr`, `ensembl`, `genbank`, `ncbi`.
    """
    column_names = {0: 'chr', 2: 'ensembl', 4: 'genbank', 6: 'ncbi'}
    return pd.read_table(
        path,
        comment='#',
        header=None,
        usecols=list(column_names),
        names=list(column_names.values()),
        # Every kept field is a name, never a quantity. Without this, a column
        # holding only numeric names (chromosomes "1", "2", ...) is inferred
        # as integers and the `.str` operations applied later fail.
        dtype=str,
    )

def rename_chr_column(df):
    """Normalise the `chr` column of `df` in place and return `df`.

    Names containing 'chr', 'LG' or 'lg' become 'chr' followed by the first
    run of digits found in the original name. Every other row is relabelled,
    in row order, as 'scf' plus a 1-based counter zero-padded to 8 digits.
    """
    import re

    first_number = re.compile(r'\d+')
    is_chromosome = df['chr'].str.contains('chr|LG|lg')
    is_scaffold = ~is_chromosome

    # chromosomes / linkage groups -> chr<N>
    chromosome_labels = [
        'chr' + first_number.search(old_name).group()
        for old_name in df.loc[is_chromosome, 'chr']
    ]
    df.loc[is_chromosome, 'chr'] = chromosome_labels

    # everything else -> scf00000001, scf00000002, ...
    scaffold_count = int(is_scaffold.sum())
    scaffold_labels = [f'scf{number:08d}' for number in range(1, scaffold_count + 1)]
    df.loc[is_scaffold, 'chr'] = scaffold_labels

    return df

def drop_mitochondria(df):
    """Return a copy of `df` without the rows that name the mitochondrion.

    A row is dropped when any of its `chr`, `ensembl`, `genbank` or `ncbi`
    values is exactly one of 'MT', 'mt', 'mitogenome' or 'mitochondria'.
    The input frame is not modified and surviving rows keep their index
    labels.

    Parameters
    ----------
    df : pandas.DataFrame
        Table with the columns `chr`, `ensembl`, `genbank` and `ncbi`.

    Returns
    -------
    pandas.DataFrame
        An independent frame holding only the non-mitochondrial rows.
    """
    mito_names = ['MT', 'mt', 'mitogenome', 'mitochondria']
    name_columns = ['chr', 'ensembl', 'genbank', 'ncbi']

    is_mito = df[name_columns].isin(mito_names).any(axis=1)

    # Hand back an explicit copy: a bare filtered selection is flagged as a
    # slice of `df`, so later in-place edits of the result would raise
    # SettingWithCopyWarning on pandas versions without copy-on-write.
    return df.loc[~is_mito].copy()


# %%
species = 'mayZeb2.0'
REPORT_PATH = GENOME_DIR.joinpath(f'{species}/assembly_report.txt')
assert REPORT_PATH.is_file()

df = read_assembly_report(REPORT_PATH)
df = drop_mitochondria(df)
df = rename_chr_column(df)

# Rows whose `ensembl` name is the placeholder 'na' fall back to their
# `genbank` value. This must be a single .loc assignment: the chained form
# df['ensembl'][mask] = ... writes to a temporary object under pandas
# copy-on-write and would leave `df` unchanged.
ensembl_missing = df['ensembl'] == 'na'
df.loc[ensembl_missing, 'ensembl'] = df.loc[ensembl_missing, 'genbank']

# Write one headerless, tab-separated two-column alias table per naming pair.
for source, target in [('chr', 'ncbi'), ('chr', 'ensembl'), ('ncbi', 'ensembl')]:
    df[[source, target]].to_csv(
        OUTPUT_DIR.joinpath(f'alias_{species}_{source}-{target}.txt'),
        index=False, header=False, sep='\t'
    )

# %%
species = 'astCal1.2'
REPORT_PATH = GENOME_DIR.joinpath(f'{species}/assembly_report.txt')
assert REPORT_PATH.is_file()

df = read_assembly_report(REPORT_PATH)
df = drop_mitochondria(df)
df = rename_chr_column(df)

# Rows whose `ensembl` name is the placeholder 'na' fall back to their
# `ncbi` value. This must be a single .loc assignment: the chained form
# df['ensembl'][mask] = ... writes to a temporary object under pandas
# copy-on-write and would leave `df` unchanged.
ensembl_missing = df['ensembl'] == 'na'
df.loc[ensembl_missing, 'ensembl'] = df.loc[ensembl_missing, 'ncbi']

# for calliptera ensembl genome, scaffolds use the genbank accession
# (applied after the fallback above, so it takes precedence for scaffolds)
is_scaffold = df['chr'].str.startswith('scf')
df.loc[is_scaffold, 'ensembl'] = df.loc[is_scaffold, 'genbank']

# Write one headerless, tab-separated two-column alias table per naming pair.
for source, target in [
    ('chr', 'ncbi'),
    ('chr', 'ensembl'),
    ('ncbi', 'ensembl'),
    ('genbank', 'ensembl'),
]:
    df[[source, target]].to_csv(
        OUTPUT_DIR.joinpath(f'alias_{species}_{source}-{target}.txt'),
        index=False, header=False, sep='\t'
    )

