#!/usr/bin/env python
import component_skeleton.main
from motevo_stuff import run_motevo
from sequence_logo import generate_sequence_logo
import os
import numpy as np

def informCont(wmLine):
    """
    Return the information content (in bits) of one weight-matrix column line.

    wmLine is a whitespace-separated string: the first field is the index of
    the WM column and the next four fields are the counts of that column.
    Any further fields are ignored.  A small pseudo-count is added to every
    count, the counts are normalised to frequencies, and the result is
    2.0 minus the base-2 Shannon entropy of those frequencies.

    A line with fewer than four count fields is treated as carrying no
    information and yields 0.0.  A count field that cannot be parsed as a
    number raises ValueError.
    """
    pseudo_count = 0.001  ## maybe we need to change/adjust this value
    fields = wmLine.split()[1:5] # the first element is the index of the WM column, so we skip it
    if len(fields) < 4:
        # Slicing never raises, so a short or empty line has to be detected
        # explicitly; otherwise it would score as a highly informative column.
        return .0
    # A real list (not a lazy map object) because the values are iterated twice.
    wmValues = [float(field) for field in fields]
    totalInformation = sum(wmValues) + 4*pseudo_count
    return 2.0 - sum([(-(i+pseudo_count)/totalInformation) * np.log2((i+pseudo_count)/totalInformation) for i in wmValues])


def trimWM(lines, cutoff):
    """
    Trim low-information columns from both ends of a weight matrix.

    lines is the WM as a list of text lines: the first three lines are kept
    as a header, the last line is kept as a footer, and everything in between
    is one line per WM column.  Leading and trailing column lines whose
    information content (see informCont) is below cutoff are dropped; columns
    in the interior are never removed.  The surviving columns are renumbered
    from 01 and the whole matrix is returned as a single string.
    """
    preamble, columns, closing = lines[:3], lines[3:-1], lines[-1]
    n_columns = len(columns)

    # Scan inwards from the left until a column reaches the cutoff.
    first = 0
    while first < n_columns and informCont(columns[first]) < cutoff:
        first += 1

    # Scan inwards from the right, independently of the left scan.
    last = n_columns
    while last > 0 and informCont(columns[last - 1]) < cutoff:
        last -= 1

    def renumber(position, column):
        # Replace the old column index with a fresh, zero-padded 1-based one.
        label = str(position + 1).zfill(2)
        return '\t'.join([label] + column.split()[1:]) + '\n'

    kept = [renumber(position, column)
            for position, column in enumerate(columns[first:last])]
    return ''.join(preamble + kept + [closing])


def execute(cf):
    """
    This component receives a weight matrix and a set of (non-aligned)
    sequences and, by running MotEvo in WMREF mode, refines the input motif
    so that it explains the sequence data better.

    The refined matrix is expected in the file 'wms.updated' inside the
    directory of the "refWM" output after run_motevo returns.  It is trimmed
    with trimWM using the "information_cutoff" parameter and written to the
    "refWM" output, and a sequence logo for it is written to the "Logo"
    output.  The intermediate files are removed afterwards.

    Returns 0.  Raises OSError if 'wms.updated' cannot be moved into place.
    """
    inputSequences = cf.get_input("InputSequences")
    wmFile = cf.get_input("WM")
    output_file = cf.get_output("refWM")
    output_logo = cf.get_output("Logo")
    genome = cf.get_parameter("genome", "string")
    cutoff = cf.get_parameter("information_cutoff", "float")
    motevo_path = cf.get_parameter("motevo_path", "string")
    output_dir = os.path.dirname(output_file)

    tmpwm = os.path.join(output_dir, 'tmpwm')

    (siteFilename, priorFilename, paramFilename) = run_motevo(wmFile, inputSequences, output_dir, genome, motevo_path)

    # Source and destination live in the same directory, so a plain rename is
    # enough; unlike a shell 'mv' string it is safe for paths containing
    # spaces or shell metacharacters, and it fails loudly if the file is missing.
    os.rename(os.path.join(output_dir, 'wms.updated'), tmpwm)

    with open(tmpwm) as f:
        WM = trimWM(f.readlines(), cutoff)

    with open(output_file, 'w') as o:
        # trimWM returns the whole matrix as one string.
        o.write(WM)

    generate_sequence_logo(output_file, output_logo)

    # Best-effort cleanup of the intermediate files: a file that is already
    # gone must not stop the remaining ones from being removed, nor fail the
    # component after its outputs have been written.
    for leftover in (siteFilename, priorFilename, paramFilename, tmpwm):
        try:
            os.remove(leftover)
        except OSError:
            pass
    return 0

# Entry point: hand the execute callable to the component_skeleton driver.
# There is no `if __name__ == "__main__"` guard, so this call runs whenever
# the module is loaded, including on import.
# NOTE(review): presumably the driver builds the `cf` object and calls
# execute(cf) -- confirm against component_skeleton.main.
component_skeleton.main.main(execute)
