#!/usr/bin/env python
import pandas as pd
import argparse

parser = argparse.ArgumentParser(formatter_class=argparse.RawTextHelpFormatter,
description="""

subset_bed_file.py

Given a bed file and a file containing a list of names, this script will test
each line in the given bed file to see if the name in the 4th column (bed name 
column) is in the list of names.  If the name is present in the list, then
that bed entry will be printed to an output bed file.

""")

# argparse keeps its default option group last in this list; take it off
# temporarily so the "required arguments" group created below is listed
# ahead of it in the --help output, then put it back once all arguments
# have been declared.
optional = parser._action_groups.pop()
required = parser.add_argument_group('required arguments')

##################################################
# required args:
required.add_argument("--bed", type=str,
                    help="""required, bed file of the format:

chr1	1000	2000	ENSG00000003147.16	...
chr2	2000	12000	ENSG00000009921.3	...
...

""", dest="bed", action='store', required=True)

required.add_argument("--names", type=str,
                    help="""required, file with a list of names:

ENSG00000003147.16
ENSG00000009921.3
...

""", dest="names", action='store', required=True)

required.add_argument("--out", type=str, help="required, bed output", required=True)

##################################################
# optional args:

optional.add_argument("--negative", help="""optional, give --negative if you wish to return all records
that do NOT match --names""", action='store_true')
optional.add_argument("--order_by_index", help="""optional, give --order_by_index if you wish to write the selected
records in the order their names appear in --names instead of the
order they appear in --bed""", action='store_true')

##################################################
parser._action_groups.append(optional)


def read_names(path):
    """Return the names listed in the file at *path*, one per line.

    Surrounding whitespace is stripped from every line.  Blank lines are
    skipped so that they are never treated as an (empty) name to match.
    The order of the file is preserved; duplicates are kept.
    """
    with open(path, 'r') as f:
        stripped = (line.strip() for line in f)
        return [name for name in stripped if name]


def subset_bed(bed, names, negative=False, order_by_index=False):
    """Select the rows of *bed* whose 4th column is (or is not) in *names*.

    Parameters:
        bed: DataFrame of bed records with integer column labels, as
            produced by ``pd.read_csv(..., header=None)``.  Column 3 holds
            the record name.
        names: list of names to test against, in the order they were given.
        negative: when True, keep the rows whose name is NOT in *names*.
        order_by_index: when True, order the selected rows by the position
            of their name in *names* rather than by their position in *bed*.

    Returns:
        A DataFrame holding the selected rows.
    """
    keep = bed[3].isin(set(names))
    if negative:
        keep = ~keep
    subset = bed[keep]

    if order_by_index:
        # For a name listed more than once, the last position wins.
        rank = {name: i for i, name in enumerate(names)}
        # Rows whose name is not listed (every row when --negative is given)
        # get a rank past the end of the list, so they sort last instead of
        # raising a KeyError.
        unlisted = len(names)
        subset.index = [rank.get(name, unlisted) for name in subset[3]]
        # A stable sort keeps rows that share a rank in their bed-file order.
        subset = subset.sort_index(kind='mergesort')

    return subset


def main(argv=None):
    """Run the command-line tool.

    Parameters:
        argv: list of command-line arguments; when None, argparse reads
            them from ``sys.argv``.
    """
    args = parser.parse_args(argv)

    # read in data
    names = read_names(args.names)

    # Read every field as text and disable NA detection: names are compared
    # as strings (so an all-numeric name column still matches), and values
    # such as "0.50" or "NA" are written back exactly as they were read.
    bed = pd.read_csv(args.bed, sep='\t', header=None, dtype=str,
                      na_filter=False)

    subset = subset_bed(bed, names, negative=args.negative,
                        order_by_index=args.order_by_index)

    subset.to_csv(args.out, sep='\t', header=False, index=False)


if __name__ == "__main__":
    main()
