#!/usr/bin/env python
import pandas as pd
from collections import defaultdict
import argparse

# Command-line interface definition.  All four file arguments are
# mandatory; the two thresholds are optional and have defaults.
parser = argparse.ArgumentParser(
    formatter_class=argparse.RawTextHelpFormatter,
    description="""

synthesize_meme_and_tomtom_results.py synthesizes and summarizes
MEME and TOMTOM results and outputs a human-readable tab-delimited summary

""")

##################################################
# required args:

parser.add_argument("-m", "--meme_results",
                    help="""required, file paths to meme results '-text' output,
for more information see: http://web.mit.edu/meme_v4.9.0/doc/meme-format.html
""", required=True)
parser.add_argument("-t", "--tomtom_results",
                    help="""required, file paths to tomtom results '-text' output,
for more information see: http://web.mit.edu/meme_v4.9.0/doc/tomtom.html
""", required=True)

# These two were documented as required but not enforced, so leaving them
# out only failed later (or silently produced no output file).  Let
# argparse reject the command line up front instead.
parser.add_argument("--motif_db",
                    help="required, file path to motif database in MEME format",
                    required=True)
parser.add_argument("--out",
                    help="required, file path to output",
                    required=True)

##################################################
# optional args:

# NOTE: the flag names say "min" for backward compatibility, but both
# values act as upper cutoffs: results strictly below them are retained.
parser.add_argument("--min_meme_evalue", type=float,
                    help="optional, MEME e-value cutoff: motifs with an e-value below this "
                         "are retained, else discarded (default: %(default)s)",
                    default=0.01)
parser.add_argument("--min_tomtom_qvalue", type=float,
                    help="optional, TOMTOM q-value cutoff: matches with a q-value below this "
                         "are retained, else discarded (default: %(default)s)",
                    default=0.1)

##################################################

args = parser.parse_args()

# Read the motif database and build a dictionary linking motif ID to
# name, e.g. 'MA0004.1' -> 'Arnt'.
# Only lines whose first token is 'MOTIF' are motif headers, and the
# name after the ID is optional: headers without a name are left out of
# the dictionary instead of raising IndexError.
ID_to_name = {}
with open(args.motif_db, 'r') as f:
    for line in f:
        fields = line.split()
        if len(fields) >= 3 and fields[0] == 'MOTIF':
            ID_to_name[fields[1]] = fields[2]

# Parse the MEME '-text' report into a dictionary
#   {motif number: {'consensus', 'width', 'occurrences',
#                   'percentage_of_sequences_w_motif', 'e_value'}}
# keeping only motifs whose e-value is below args.min_meme_evalue.
meme_motifs = defaultdict(dict)
num_sequences = None  # set by the 'maxsites=' line
e_value = None        # set by each motif header line
with open(args.meme_results, 'r') as f:
    for line in f:
        # MEME results are meant to be human-readable
        # and thus are tricky to parse
        if 'maxsites=' in line:
            # the 5th whitespace-separated token is the maxsites value,
            # used below as the number of sequences
            num_sequences = int(line.split()[4])
        if 'E-value' in line and 'MOTIF' in line:
            # motif header line: once the '=' signs are dropped it must
            # split into exactly 11 tokens
            (_, meme_motif_num, _, _, width, _, occurrences,
             _, _, _, e_value) = line.replace('=', '').split()
            e_value = float(e_value)
        if 'Multilevel' in line:
            if e_value is None:
                raise ValueError(
                    "malformed MEME results: 'Multilevel' consensus line "
                    "found before any motif header line")
            _, consensus = line.split()
            # only retain significant results
            if e_value < args.min_meme_evalue:
                if num_sequences is None:
                    raise ValueError(
                        "malformed MEME results: no 'maxsites=' line found "
                        "before the first motif")
                meme_motifs[meme_motif_num].update({
                    'consensus': consensus,
                    'width': int(width),
                    'occurrences': int(occurrences),
                    'percentage_of_sequences_w_motif': 100*int(occurrences)/float(num_sequences),
                    'e_value': e_value,
                })

# Load the tab-delimited TOMTOM results table.
tomtom = pd.read_csv(args.tomtom_results, sep='\t')
# Label each hit with the TF name when the motif database provides one;
# otherwise keep the raw target ID.
tomtom['TF_match'] = [ID_to_name.get(target_id, target_id)
                      for target_id in tomtom['Target ID']]
# Keep only the hits whose q-value is below the cutoff.
is_significant = tomtom['q-value'] < args.min_tomtom_qvalue
tomtom = tomtom[is_significant]
# One MEME motif may have several significant TOMTOM hits: collapse them
# into a single comma-separated string per query ID.
matches_by_query = tomtom.groupby('#Query ID')['TF_match']
tomtom_TF_match_dict = matches_by_query.apply(','.join).to_dict()

# Annotate every retained MEME motif with its TOMTOM matches.  The motif
# number is converted to int for the lookup; motifs with no entry are
# labelled 'Unknown_motif'.
for motif_id in sorted(meme_motifs):
    record = meme_motifs[motif_id]
    record['TF_match'] = tomtom_TF_match_dict.get(int(motif_id), 'Unknown_motif')

# Turn the dict into a dataframe with one row per motif and write it out
# as a tab-delimited file (the motif-number index is not written).
# The output columns are selected with reindex *before* sorting so that
# they exist even when no motif passed the e-value filter: an empty
# result then produces a header-only file instead of a KeyError on
# 'e_value'.
output_columns = ['TF_match', 'consensus', 'e_value', 'occurrences',
                  'percentage_of_sequences_w_motif', 'width']
meme_motifs = pd.DataFrame(meme_motifs).T.reindex(columns=output_columns)
# sort by significance (smallest e-value first)
meme_motifs = meme_motifs.sort_values('e_value')
meme_motifs.to_csv(args.out, sep='\t', index=False)