# Script to parse RepeatMasker output into a simpler format
# Usage: cat input.txt | python /path/to/script.py [options] > output.txt
#        (run `python /path/to/script.py -h` for help)

import argparse
import sys
from pathlib import Path

import numpy as np
import pandas as pd

def read_repeatmasker_file_from_stdin():
    """Read a whitespace-delimited RepeatMasker table from standard input.

    The first three lines of the stream are skipped (presumably the
    RepeatMasker header block -- confirm against real input) and every
    remaining line is parsed into the sixteen fixed columns listed in
    ``col_names``. Lines with fewer fields are padded with NA on the right.

    Returns:
        pandas.DataFrame: one row per parsed line. The columns listed in
        ``col_types`` get an explicit dtype; all others are inferred by
        pandas.
    """
    col_names = ['SW_score', 'perc_div', 'perc_del', 'perc_ins',
                 'chr', 'start', 'end', 'left', 'complement',
                 'repmask_id', 'repeat_family',
                 'repeat_start', 'repeat_end', 'repeat_left',
                 'id', 'landscape']
    # These columns are forced to text / float instead of being inferred.
    # NOTE(review): the text columns are presumably kept as str because
    # RepeatMasker writes some of them in parentheses, e.g. "(399)" -- confirm.
    col_types = {
        'chr': str, 'id': str, 'left': str,
        'repeat_start': str, 'repeat_left': str, 'repeat_end': str,
        'perc_div': float, 'perc_del': float, 'perc_ins': float,
    }
    return pd.read_table(
        sys.stdin,
        names=col_names,
        # `delim_whitespace=True` is deprecated since pandas 2.2;
        # `sep=r'\s+'` is the documented replacement with the same behaviour.
        sep=r'\s+',
        skiprows=3,
        # Never promote the first column to the index, even when the number
        # of fields varies between rows.
        index_col=False,
        dtype=col_types,
    )

def cmdline_args():
    """Build the command-line parser and parse ``sys.argv``.

    Besides the automatic ``-h/--help`` flag, the only option is
    ``--alias``, which takes a file path as a string and defaults to
    ``None``.

    Returns:
        argparse.Namespace: parsed arguments with a single ``alias``
        attribute.

    Raises:
        SystemExit: raised by argparse on ``-h/--help`` or on invalid
        arguments.
    """
    p = argparse.ArgumentParser(
        # Tool name corrected: the input comes from RepeatMasker.
        description="Formats and extracts the most useful columns from a RepeatMasker outfile.",
        usage='cat input.txt | python parse_repmask_to_simple.py [options] > output.txt',
        formatter_class=argparse.RawDescriptionHelpFormatter
    )
    p.add_argument("--alias", type=str,
                   help="alias file to convert chromosome names")
    return p.parse_args()

def get_new_df(df):
    """Return a trimmed copy of ``df`` with the strand column recoded.

    Only the eight columns listed in ``wanted`` are kept, in that order.
    In the ``complement`` column '+' stays '+', 'C' becomes '-', and any
    other value becomes NA. The input frame is not modified.
    """
    strand_symbols = {'+': '+', 'C': '-'}
    wanted = [
        'id', 'perc_div',
        'chr', 'start', 'end',
        'complement',
        'repmask_id', 'repeat_family',
    ]
    # Work on an independent copy so the caller's frame is left untouched.
    trimmed = df.loc[:, wanted].copy()
    recoded = trimmed['complement'].map(strand_symbols)
    trimmed['complement'] = recoded
    return trimmed

def main():
    """Convert a RepeatMasker table on stdin into a simplified TSV on stdout.

    Steps: parse the command line, read the table from stdin, keep the
    relevant columns, drop rows without an ID, flag IDs that occur on more
    than one row, and write the result as a tab-separated table with a
    header row.

    A '#'-prefixed line reporting the number of dropped rows is printed to
    stdout before the table.
    """
    # parse arguments (this also provides -h/--help)
    # NOTE: the parsed namespace, including --alias, is not used further
    # by this script.
    cmdline_args()

    # read repeatmasker file from stdin, then extract the relevant columns
    # and format the complement column
    df = get_new_df(read_repeatmasker_file_from_stdin())

    # drop rows where the ID column contains NA
    missing_id = df['id'].isna()
    print(f"# Dropped {missing_id.sum()} row where the ID column is NA.")
    # .copy() gives an independent frame, so adding the 'split' column below
    # does not trigger pandas' SettingWithCopyWarning.
    df = df.loc[~missing_id].copy()

    # mark entries which are split, i.e. whose ID occurs on more than one row
    df['split'] = ""
    df.loc[df['id'].duplicated(keep=False), 'split'] = 't'

    # print to stdout
    df.to_csv(sys.stdout, sep='\t', na_rep='', index=False, header=True)


# Run the pipeline only when executed as a script, so the helper functions
# can be imported without reading stdin.
if __name__ == "__main__":
    main()
