#!/usr/bin/env python
from sys import argv
import numpy as np
from collections import defaultdict

# Command line: <fimo_in> <out>
#   fimo_in  tab-separated hits table with one header line and nine columns
#   out      path of the per-sequence best-hit table to write
if len(argv) < 3:
    raise SystemExit("usage: %s <fimo_in> <out>" % argv[0])
fimo_in = argv[1]
out = argv[2]

# Example arguments:
# fimo_in = "JunB_de_novo_motif.in.JunB.union.selected_reps.fimo.txt"
# out = "JunB_de_novo_motif.in.JunB.union.selected_reps.fimo.best.txt"

# Map each sequence name to all of its hits, stored as
# (p-value as float, strand, start, stop, matched sequence) tuples.
fimo_d = defaultdict(list)
with open(fimo_in, "r") as inf:
    # The first line is the column header; the default keeps an empty file
    # from raising StopIteration.
    header = next(inf, "")
    for line in inf:
        # Remove only the line terminator: a bare strip() would also eat
        # trailing tabs, dropping empty final columns and breaking the
        # nine-way unpack below.
        line = line.rstrip("\r\n")
        # Skip blank lines and '#' comment lines (which some tools append
        # after the table) instead of failing on them.
        if not line.strip() or line.startswith("#"):
            continue
        motif, seq_name, start, stop, strand, score, pval, qval, matched_seq = line.split("\t")
        fimo_d[seq_name].append((float(pval), strand, start, stop, matched_seq))

def sort_nicely(l):
    """
    Sort the given list in the way that humans expect.

    Runs of ASCII digits inside each string are compared as integers and the
    text between them is compared as plain strings, so "chr2" sorts before
    "chr10".

    Parameters
    ----------
    l : iterable of str
        Strings to sort; the input itself is not modified.

    Returns
    -------
    list of str
        A new list holding the same strings in natural order.
    """
    import re

    def alphanum_key(key):
        # Splitting on a capturing group alternates text (even positions)
        # with digit runs (odd positions). Choosing by position rather than
        # str.isdigit() keeps characters such as superscripts or non-ASCII
        # digits as text: int() cannot parse some of them, and converting
        # the others would place an int where other keys hold a str.
        tokens = re.split('([0-9]+)', key)
        return [int(tok) if pos % 2 else tok for pos, tok in enumerate(tokens)]

    return sorted(l, key=alphanum_key)

# Write one tab-separated line per sequence, in natural name order:
# name, lowest p-value, start, stop, strand, matched sequence.
with open(out, "w") as outf:
    for seq_name in sort_nicely(fimo_d.keys()):
        hits = fimo_d[seq_name]
        pvals = np.array([hit[0] for hit in hits])

        best_pval = pvals.min()
        # Positions (within this sequence's hits) sharing the lowest p-value.
        tied = np.where(pvals == best_pval)[0]
        # in case of tie, choose at random
        pick = np.random.choice(np.arange(len(tied)), 1)[0]
        _, best_strand, best_start, best_stop, best_matched_seq = hits[tied[pick]]

        record = (seq_name, best_pval, best_start, best_stop, best_strand, best_matched_seq)
        outf.write("%s\t%s\t%s\t%s\t%s\t%s\n" % record)