# Script to parse minigraph variants file (for individual samples).
# Usage: python /path/to/script.py -h

import sys
import argparse
import pandas as pd
from pathlib import Path

def cmdline_args():
    """Parse the command line and return the resulting namespace.

    The parser defines no options of its own; it supplies the
    description/usage text shown by -h/--help and rejects unknown arguments.
    """
    parser_settings = {
        'description': "Format a minigraph 6-column variant file.",
        'usage': 'cat input.variants | python parse_minigraph_variants6col.py [options] > output.variants.simple',
        'formatter_class': argparse.RawDescriptionHelpFormatter,
    }
    parser = argparse.ArgumentParser(**parser_settings)
    return parser.parse_args()

def read_from_stdin():
    """Load the header-less, tab-separated variant table from standard input.

    Returns a DataFrame with the six columns
    chr, start, end, source, sink and query.
    """
    header = ['chr', 'start', 'end', 'source', 'sink', 'query']
    return pd.read_table(
        sys.stdin,
        names=header,
        index_col=False,
        # force text so chromosome names are never parsed as numbers
        dtype={'chr': str},
    )

# parse arguments (no options are defined, but this still handles -h/--help)
args = cmdline_args()

# read minigraph variant file from stdin
df = read_from_stdin()

# concatenate the source and sink to get the bubble name
df['path'] = [str(x) + str(y) for x, y in zip(df['source'], df['sink'])]

# calculate length of bubble; every row is given a constant '+' strand
df['len'] = df['end'] - df['start']
df['strand'] = '+'

# split last column into separate fields (information about the querying segments)
new_cols = ['query_path', 'query_len', 'query_strand', 'query_chr', 'query_start', 'query_end']
query_fields = df['query'].str.split(':', expand=True)

# When no row carries a full colon-separated entry (e.g. every query is '.'),
# the split yields fewer columns than new_cols and a direct assignment would
# fail. Pad with empty columns so the unmatched handling below can fill them.
# A split that is wider than expected is left alone and still fails loudly.
for missing_col in range(query_fields.shape[1], len(new_cols)):
    query_fields[missing_col] = None
df[new_cols] = query_fields

# some rows only have a single dot, meaning it doesn't match anything
unmatched = df['query_path'] == '.'
df.loc[unmatched, ['query_len', 'query_strand']] = '.'
# written as the string '0' (not the int 0) so these text columns hold a
# single type; the emitted output is identical
df.loc[unmatched, ['query_start', 'query_end']] = '0'
df.loc[unmatched, 'query_chr'] = 'unmatched'

# write to standard output (tab-separated, no header, no index)
cols_to_write = ['chr', 'start', 'end', 'path', 'len', 'strand',
    'query_chr', 'query_start', 'query_end', 'query_path', 'query_len', 'query_strand']
df[cols_to_write].to_csv(sys.stdout, sep='\t', na_rep='', index=False, header=False)
