#!/usr/bin/env python
import pandas as pd
import numpy as np
from sklearn import cross_validation
from sklearn.linear_model import ElasticNet
from sklearn.cross_validation import KFold
from sklearn.metrics import r2_score
from sklearn.preprocessing import scale
from scipy.stats import sem
from sys import argv

# Positional command-line arguments (argv[0] is the script name):
#   df             - path of the tab-separated predictor table
#   y              - path of the tab-separated response table
#   out_coef       - path the fitted coefficients are written to
#   out_fit_params - path the selected penalty settings and R2 are written to
# `df` and `y` hold file paths here; they are rebound to the loaded data
# further down in the script.
df, y, out_coef, out_fit_params = argv[1], argv[2], argv[3], argv[4]

# df = "motif.enh_and_dhss.binary_0.0001.txt"
# y = "basal_p300.enh_and_dhss.standardized.txt"
# out_coef = "predict_basal_p300.enh_and_dhss.coefs.binary_0.0001.txt"
# out_fit_params = "predict_basal_p300.enh_and_dhss.fit_params.binary_0.0001.txt"

# df = "motif.enh_and_dhss.binary_0.00001.txt"
# y = "basal_p300.enh_and_dhss.standardized.txt"
# out_coef = "predict_basal_p300.enh_and_dhss.coefs.binary_0.00001.txt"
# out_fit_params = "predict_basal_p300.enh_and_dhss.fit_params.binary_0.00001.txt"

# Load the predictor table and the response table. Both are tab-separated
# and carry their row labels in the first column.
df = pd.read_csv(df, sep="\t", index_col=0)
y = pd.read_csv(y, sep="\t", index_col=0)

# Keep only the row labels present in BOTH tables, in sorted order, so that
# row i of X and row i of y describe the same observation. A union would
# introduce labels missing from one table, which turn into all-NaN rows (or a
# KeyError on newer pandas) and make the model fit fail.
idx = sorted(set(df.index) & set(y.index))
if not idx:
    raise ValueError(
        "the predictor and response tables share no row labels")

# .loc is the label-based indexer; the older .ix has been removed from pandas.
df = df.loc[idx]
y = np.array(y.loc[idx])

# Standardize the response column-wise, and hand the predictors to
# scikit-learn as a plain array (column names stay available in df.columns).
y = scale(y)
X = np.array(df)

#####
# run model
#####

# 5-fold cross-validation over all rows. Shuffling uses a fixed seed so the
# fold assignment is reproducible from run to run.
cv = KFold(len(y), n_folds=5, shuffle=True, random_state=1234)

# Penalty grid: l1_ratio is the L1 share of the elastic-net penalty
# (1 is a pure L1 penalty), alpha the overall penalty strength, here
# 10**-6, 10**-5, ..., 10**0.
l1_ratios = [0.75, .9, .95, .99, 1]
alphas = 10**np.arange(-6,1).astype('float')

# Cross-validate every (l1_ratio, alpha) combination. score_dict maps each
# combination to the array of per-fold R2 values.
score_dict = {}
for l1_ratio in l1_ratios:
    for alpha in alphas:
        regr = ElasticNet(l1_ratio=l1_ratio, alpha=alpha, random_state=1234)
        score = cross_validation.cross_val_score(regr, X, y, cv=cv, scoring="r2")
        # Single-argument parenthesised print: same output on Python 2,
        # and valid syntax on Python 3 as well.
        print("\ttesting l1_ratio = %s, alpha = %s, R2 = %s" % (l1_ratio, alpha, score.mean()))
        score_dict[(l1_ratio, alpha)] = score

# Snapshot the (params, per-fold scores) pairs once, so the three arrays
# below are guaranteed to share one ordering. .items() works on both
# Python 2 and Python 3, unlike .iteritems().
param_scores = list(score_dict.items())

# One row per model: its (l1_ratio, alpha) pair, the mean R2 across the
# folds, and the standard error of that mean.
ordered_params = np.array([params for params, _ in param_scores])
mean_r2 = np.array([np.mean(scores) for _, scores in param_scores])
sem_r2 = np.array([sem(scores) for _, scores in param_scores])

# One-standard-error threshold: the best mean R2 minus the standard error
# of that best model.
best_idx = mean_r2.argmax()
max_r2_1se = mean_r2[best_idx] - sem_r2[best_idx]

# Candidate models: every parameter pair whose mean R2 reaches the threshold.
opt_params = ordered_params[mean_r2 >= max_r2_1se]

# Refit each candidate (mean R2 within one standard error of the best) on
# the full data and keep the one with the fewest non-zero coefficients.
smallest_num_coefs = np.inf
for l1_ratio, alpha in opt_params:
    fit = ElasticNet(l1_ratio=l1_ratio, alpha=alpha, random_state=1234).fit(X, y)
    num_non_zero_coefs = int(np.count_nonzero(fit.coef_))
    # Single-argument parenthesised print: same output on Python 2,
    # and valid syntax on Python 3 as well.
    print("\tl1_ratio = %s, alpha = %s, num. non-zero coefs = %s" % (l1_ratio, alpha, num_non_zero_coefs))
    # "<=" (not "<"): when two candidates are equally sparse, the one
    # visited later wins.
    if num_non_zero_coefs <= smallest_num_coefs:
        smallest_num_coefs = num_non_zero_coefs
        best_fit = fit
        best_l1_ratio = l1_ratio
        best_alpha = alpha

# Refit on the full data with the selected penalty settings and score the
# model on that same data (an in-sample R2, not a cross-validated one).
fit = ElasticNet(random_state=1234, alpha=best_alpha, l1_ratio=best_l1_ratio)
fit.fit(X, y)
y_predict = fit.predict(X)
r2 = r2_score(y, y_predict)

# One coefficient per predictor column, labelled with the column name.
coef_df = pd.DataFrame({'coef': fit.coef_}, index=df.columns)
coef_df.to_csv(out_coef, sep="\t", index=True)

# Two-line summary: a header row, then the chosen settings and the R2.
with open(out_fit_params, "w") as f:
    f.writelines([
        "best_l1_ratio\tbest_alpha\tr2\n",
        "%s\t%s\t%s\n"%(best_l1_ratio,best_alpha,r2),
    ])