#!/usr/bin/env python
import pandas as pd
import numpy as np
import argparse

# Evidence codes kept when --stringent is given (same list as in the help text).
STRINGENT_EVIDENCE_CODES = ("EXP", "IDA", "IGI", "IMP", "IPI", "IC", "TAS")

# Shown verbatim by --help (RawTextHelpFormatter keeps the layout as written).
_DESCRIPTION = """

This script takes a file of this format:

ensembl_gene_id	go_id	go_linkage_type
ENSG00000104231	GO:0008270	IEA
ENSG00000104231	GO:0046872	IEA
ENSG00000104231		
ENSG00000109956	GO:0015018	IEA
ENSG00000109956	GO:0015018	ISS
...

where each gene may or may not have one or many associated GO terms,
and converts it into a file of this format:

ENSG00000000003	GO:0004871;GO:0005515;GO:0039532;GO:0043123;GO:1901223;GO:0070062
ENSG00000000005	GO:0005515
ENSG00000000419	GO:0004169;GO:0004582;GO:0005515;GO:0006506;
...

which is used by GOAtools.

"""


def build_parser():
    """Build the command-line parser for this script.

    Returns:
        argparse.ArgumentParser: parser accepting ``-i/--inf`` and
        ``-o/--outf`` (both required) and the ``--stringent`` flag.
    """
    parser = argparse.ArgumentParser(
        formatter_class=argparse.RawTextHelpFormatter,
        description=_DESCRIPTION)

    # Temporarily remove the default optional group so that the
    # 'required arguments' group is registered (and listed by --help)
    # before it; the optional group is appended back at the end.
    optional = parser._action_groups.pop()
    required = parser.add_argument_group('required arguments')

    ##################################################
    # required args:

    required.add_argument("-i", "--inf", help="required, path to input",
                          required=True)
    required.add_argument("-o", "--outf", help="required, path to output",
                          required=True)

    ##################################################
    # optional args:

    optional.add_argument(
        "--stringent",
        help="optional, if indicated, only the following evidence codes\n"
             "                      will be accepted: EXP,IDA,IGI,IMP,IPI,IC,TAS",
        action='store_true')

    ##################################################
    parser._action_groups.append(optional)
    return parser


def collapse_go_terms(annotations, stringent=False):
    """Collapse per-row GO annotations into one ';'-joined string per gene.

    Args:
        annotations: pandas DataFrame with the columns ``ensembl_gene_id``,
            ``go_id`` and ``go_linkage_type`` (one annotation per row).
        stringent: if true, keep only rows whose ``go_linkage_type`` is one
            of STRINGENT_EVIDENCE_CODES.

    Returns:
        pandas.Series indexed by ``ensembl_gene_id`` whose values are the
        gene's GO ids joined with ';'. Genes left without any GO id are
        absent from the result.
    """
    if stringent:
        accepted = annotations['go_linkage_type'].isin(STRINGENT_EVIDENCE_CODES)
        annotations = annotations[accepted]

    # Rows with a missing go_id carry no annotation and are dropped.
    has_term = annotations['go_id'].notna()
    pairs = annotations.loc[has_term, ['ensembl_gene_id', 'go_id']]

    # The same gene/GO pair can be listed once per evidence code; keep only
    # the first occurrence so that no term is repeated in the joined string.
    pairs = pairs.drop_duplicates()

    return pairs.groupby('ensembl_gene_id')['go_id'].agg(';'.join)


def main(argv=None):
    """Read the input table, collapse it and write the association file.

    Args:
        argv: list of command-line arguments; ``None`` makes argparse read
            them from ``sys.argv``.
    """
    args = build_parser().parse_args(argv)

    annotations = pd.read_csv(args.inf, sep='\t')
    collapsed = collapse_go_terms(annotations, stringent=args.stringent)

    # header=False is explicit: recent pandas versions write a header line
    # for a Series by default, which is not part of the output format.
    collapsed.to_csv(args.outf, sep='\t', index=True, header=False)


if __name__ == "__main__":
    main()