# Script to format chromosome IDs and drop mitochondrial entries from RepeatMasker output.
# Usage: python /path/to/script.py -h

import sys
import argparse
import pandas as pd
from pathlib import Path

def cmdline_args():
    """Build the command-line parser and parse the process arguments.

    Returns the namespace produced by ``parse_args()``, carrying:
      * ``alias``    -- value given to ``--alias``, or None when omitted
      * ``dropmito`` -- True when ``--drop-mito`` is present, else False
    """
    parser = argparse.ArgumentParser(
        formatter_class=argparse.RawDescriptionHelpFormatter,
        usage='cat input.txt | python modify_chr_column.py [options] > output.txt',
        description="Formats the chromosome column based on an alias file.",
    )

    # Option order is kept as-is because it determines the order in --help.
    parser.add_argument(
        "--alias",
        type=str,
        help="alias file to convert chromosome names",
    )
    parser.add_argument(
        "--drop-mito",
        dest='dropmito',
        action='store_true',
        default=False,
        help="include this option to drop mitochondria entries",
    )

    parsed = parser.parse_args()
    return parsed

def read_from_stdin():
    """Parse the tab-delimited table arriving on standard input.

    Returns a DataFrame whose header row is inferred from the input, with
    every column loaded as ``object`` dtype and no column used as the index.
    Text following a ``#`` character is treated as a comment by the parser.
    """
    reader_options = {
        'dtype': object,
        'index_col': False,
        'header': 'infer',
        'comment': '#',
    }
    table = pd.read_table(sys.stdin, **reader_options)
    return table

# parse arguments
args = cmdline_args()

# read repeatmasker file from stdin; every column is loaded as object dtype,
# so the values in 'chr' are strings
df = read_from_stdin()

# drop mitochondria entries (rows whose 'chr' value is one of the names below)
if args.dropmito:
    mito_names = ['MT', 'mt', 'mitogenome', 'mitochondria']
    df = df.loc[~df['chr'].isin(mito_names)]

# optional step to convert chromosome names into a different convention,
# using a headerless two-column alias file: column 0 = current name,
# column 1 = replacement name
if args.alias is not None:
    path_alias = Path(args.alias)
    # Validate with an explicit exception: an `assert` is stripped when Python
    # runs with -O, which would let a bad path fall through to the reader.
    if not path_alias.is_file():
        raise FileNotFoundError('alias file not found: {}'.format(path_alias))
    # Read the alias table as strings. With inferred dtypes an all-numeric
    # name column would become integers and never match the string-valued
    # 'chr' column, silently mapping every row to NaN.
    alias_table = pd.read_table(path_alias, header=None, dtype=str)
    mapping_dict = alias_table.set_index(0)[1].to_dict()
    # keep the literal 'unmatched' label as-is
    mapping_dict['unmatched'] = 'unmatched'
    # NOTE(review): names absent from the alias file map to NaN here and are
    # written as empty fields below (na_rep='') -- confirm this is intended.
    df['chr'] = df['chr'].map(mapping_dict)

# print to stdout as tab-separated text with a header row and no index column
df.to_csv(sys.stdout, sep='\t', na_rep='', index=False, header=True)

