# Script to combine a gfatools bubble BED file with per-sample minigraph
# 6-column variant BED files into summary tables.
# Usage: python /path/to/script.py -h

import sys
import argparse
import pandas as pd
from pathlib import Path

def cmdline_args():
    """Parse the command line and return the resulting namespace.

    Two positionals are defined: ``BUBBLE_FILE`` (a single path) and
    ``VARIANT_FILES`` (one or more paths, collected into a list).
    """
    parser = argparse.ArgumentParser(
        usage='python combine_variants6col_files.py bubble-variants.bed sample1.variants6col.bed sample2.variants6col.bed',
        description="Format a minigraph 6-column variant file.",
        formatter_class=argparse.RawDescriptionHelpFormatter,
    )

    # Positional arguments, registered in command-line order.
    positionals = (
        ("BUBBLE_FILE", {
            "type": str,
            "help": "base variant BED file from gfatools bubble",
        }),
        ("VARIANT_FILES", {
            "type": str,
            "nargs": "+",
            "help": "individual simplified variant 6-column BED files (multiple entries allowed)",
        }),
    )
    for dest, options in positionals:
        parser.add_argument(dest, **options)

    return parser.parse_args()

# parse arguments
args = cmdline_args()

# filename suffixes that identify the two kinds of input file
BUBBLE_SUFFIX = '-variants.bed'
SAMPLE_SUFFIX = '.variants6col.bed'

# check that the files exist and end with the correct suffix
# (explicit checks instead of `assert`: asserts are stripped under
# `python -O` and give the user no message on failure)
if not Path(args.BUBBLE_FILE).is_file():
    sys.exit(f'error: bubble file not found: {args.BUBBLE_FILE}')
if not args.BUBBLE_FILE.endswith(BUBBLE_SUFFIX):
    sys.exit(f"error: bubble file must end with '{BUBBLE_SUFFIX}': {args.BUBBLE_FILE}")

# strip the suffix from the end only; str.replace would also remove any
# earlier occurrence of the same text inside the path
OUTPUT_PREFIX = args.BUBBLE_FILE[:-len(BUBBLE_SUFFIX)]

for filename in args.VARIANT_FILES:
    if not Path(filename).is_file():
        sys.exit(f'error: variant file not found: {filename}')
    if not filename.endswith(SAMPLE_SUFFIX):
        sys.exit(f"error: variant file must end with '{SAMPLE_SUFFIX}': {filename}")

# read gfatools bubble file, keeping only important columns
col_names = ['chr', 'start', 'end', 'n_segments', 'n_paths', 'inv', 'len_min', 'len_max', 'i1', 'i2', 'i3', 'segment_list', 'seq_min', 'seq_max']
df = pd.read_table(args.BUBBLE_FILE, names=col_names, index_col=False, dtype={'chr': str}).drop(columns=['i1', 'i2', 'i3', 'seq_min', 'seq_max'])

# derive sample names: the file path as given, minus the trailing suffix
# NOTE: any directory component of the path is kept in the sample name and
# therefore appears in the output column headers
samples = [x[:-len(SAMPLE_SUFFIX)] for x in args.VARIANT_FILES]

# read individual variant files; frames are collected and concatenated once
# so the growing table is not re-copied for every sample
frames = [df]
for sample, filename in zip(samples, args.VARIANT_FILES):
    col_names = [f'{sample}_{x}' for x in ['chr', 'start', 'end', 'path', 'len', 'strand']]
    # the dtype key must be the prefixed column name; a plain 'chr' key
    # matches no column here and the chromosome column would be type-inferred
    df_sample = pd.read_table(filename, names=col_names, index_col=False, dtype={f'{sample}_chr': str})
    frames.append(df_sample)

# column-wise join: rows are matched by position (default integer index)
df = pd.concat(frames, axis=1)

# whole matrix, then per-sample path and length matrices
df.to_csv(f'{OUTPUT_PREFIX}-summary.csv', index=False, header=True)
df[[f'{x}_path' for x in samples]].to_csv(f'{OUTPUT_PREFIX}-summary_path.csv.gz', index=False, header=True)
df[[f'{x}_len' for x in samples]].to_csv(f'{OUTPUT_PREFIX}-summary_length.csv.gz', index=False, header=True)

