#!/usr/bin/env python
import pandas as pd
import numpy as np
from sklearn.linear_model import SGDClassifier
from sklearn import cross_validation
from sklearn.metrics import log_loss
from sys import argv
from scipy.stats import sem

# Command-line arguments (all positional):
#   1. predictor_df - path to the tab-separated predictor table
#   2. test         - path to the file listing test-set observation IDs
#   3. bkg          - path to the file listing background-set observation IDs
#   4. coef_out     - path the selected model's coefficients are written to
#   5. loss_out     - path the best mean CV log-loss and its SE are written to
# Fail early with a usage message rather than a bare IndexError when an
# argument is missing; extra trailing arguments are ignored, as before.
if len(argv) < 6:
    raise SystemExit('usage: %s PREDICTORS TEST_IDS BKG_IDS COEF_OUT LOSS_OUT'
                     % argv[0])
predictor_df, test, bkg, coef_out, loss_out = argv[1:6]

# read in predictor dataframe with format:
# \tPRED1\tPRED2\tPRED3
# obs1\tval_00\tval_01\tval_02
# obs2\tval_10\tval_11\tval_12
# obs3\tval_20\tval_21\tval_22
# ...
# (the first column becomes the row index, i.e. the observation IDs)
predictor_df = pd.read_csv(predictor_df, sep='\t', index_col=0)
# import test set with format:
# obs689
# obs710
# ...
# (only the first tab-separated column is used)
test = list(pd.read_csv(test, header=None, sep='\t')[0])
# import bkg. set with format:
# obs0
# obs3
# ...
bkg = list(pd.read_csv(bkg, header=None, sep='\t')[0])

# subset predictors to only contain test and bkg sets
wanted = set(test) | set(bkg)
# An ID that is absent from the predictor table cannot be modelled. Report it
# here, by name, instead of letting the lookup produce all-NaN rows (older
# pandas) or an opaque lookup error (newer pandas).
missing = wanted.difference(predictor_df.index)
if missing:
    raise KeyError('%d observation(s) from the test/bkg sets are absent from '
                   'the predictor table, e.g. %s'
                   % (len(missing), list(missing)[:5]))
# .loc is the label-based replacement for the deprecated .ix indexer
predictor_df = predictor_df.loc[list(wanted)]

# set up our predictor and response for classification/regression
# X: the predictor table, one row per observation
# y: 1 for observations listed in the test set, 0 otherwise (an observation
#    listed in both sets is therefore labelled 1)
X = predictor_df
# build the lookup set once instead of once per row
test_ids = set(test)
y = np.array([1 if obs in test_ids else 0 for obs in predictor_df.index])

# Elastic-net penalty grid. These values should be sufficient, but check that
# performance begins at a plateau and then plummets across the grid.
#   l1_ratios: elastic-net mixing values to try
#   alphas:    regularisation strengths 1e-6, 1e-5, ..., 1e0
l1_ratios = [0.5, 0.75, 0.9, 0.95, 0.99, 1]
alphas = np.power(10, np.arange(-6, 1).astype(float))

# (l1_ratio, alpha) -> array of per-fold CV scores, filled in by the grid search
score_dict = dict()

# from the sklearn documentation:
# "Some classification problems can exhibit a large
# imbalance in the distribution of the target classes:
# for instance there could be several times more negative
# samples than positive samples. In such cases it is
# recommended to use stratified sampling as implemented
# in StratifiedKFold and StratifiedShuffleSplit to ensure
# that relative class frequencies is approximately preserved
# in each train and validation fold."
# Hence a 5-fold stratified split on the response is used.
skf = cross_validation.StratifiedKFold(y, n_folds=5)

# Score every (l1_ratio, alpha) combination with cross-validation, visiting
# the combinations in the same order as a nested loop over l1_ratios (outer)
# and alphas (inner).
param_grid = [(ratio, strength) for ratio in l1_ratios for strength in alphas]
for l1_ratio, alpha in param_grid:
    # elastic-net penalised logistic regression fitted by SGD
    # NOTE(review): no iteration count is passed here, whereas the final refit
    # later in this script uses n_iter=100 -- confirm this is intended.
    clf = SGDClassifier(loss='log', penalty='elasticnet',
                        alpha=alpha, l1_ratio=l1_ratio)
    # run the model across the folds defined by skf and keep the per-fold
    # negated log-loss values (this can sometimes be tricky)
    score_dict[(l1_ratio, alpha)] = cross_validation.cross_val_score(
        clf, X, y, scoring='neg_log_loss', cv=skf)

# Take one snapshot of the CV results so that the three arrays below are
# built from the same sequence and therefore share the same ordering.
cv_results = list(score_dict.items())
ordered_params = np.array([params for params, _ in cv_results])

# compute mean (negated) log-loss across the folds for each model
mean_neg_log_loss = np.array([np.mean(fold_scores) for _, fold_scores in cv_results])

# compute standard error of the (negated) log-loss for each model
sem_neg_log_loss = np.array([sem(fold_scores) for _, fold_scores in cv_results])

# typically, we want to minimize loss, but sklearn's unified API
# demands that all scoring functions are maximized so log-loss is negated
# and better models have a higher (closer to zero) negated log-loss

# find maximal neg. log loss minus 1-se (the SE is that of the best model)
best_idx = mean_neg_log_loss.argmax()
max_neg_log_loss_1se = mean_neg_log_loss[best_idx] - sem_neg_log_loss[best_idx]

# find the parameters for the models with neg log loss within one SE of the
# best. The comparison is inclusive so the best model itself always
# qualifies, even when its standard error is exactly zero; a strict
# comparison would leave no candidate at all in that case.
opt_params = ordered_params[mean_neg_log_loss >= max_neg_log_loss_1se]

# loop over the models with neg log loss within one SE of the best,
# refit each of them on the full data set and keep the model with the
# lowest number of non-zero coefficients (the first one wins on ties)
best_fit = None
smallest_num_coefs = np.inf
for l1_ratio, alpha in opt_params:
    fit = SGDClassifier(l1_ratio=l1_ratio, alpha=alpha, penalty='elasticnet', loss='log', n_iter=100).fit(X, y)
    coefs = fit.coef_[0]
    num_non_zero_coefs = int(np.count_nonzero(coefs))
    # progress output; the parenthesised form prints the same text under
    # Python 2 and is also valid Python 3
    print(num_non_zero_coefs)
    if num_non_zero_coefs < smallest_num_coefs:
        smallest_num_coefs = num_non_zero_coefs
        best_fit = fit

# Without this guard an empty candidate set only shows up later as a
# NameError on best_fit.
if best_fit is None:
    raise RuntimeError('no candidate model passed the one-standard-error '
                       'threshold; nothing to refit')

# Coefficient table of the selected model: one "<predictor>\t<coef>" line per
# predictor column, coefficients rounded to four decimals.
coefs = best_fit.coef_[0]
with open(coef_out, 'w') as f:
    f.writelines('%s\t%0.4f\n' % pair for pair in zip(X.columns, coefs))

# Summary of the best cross-validated model: its mean log-loss (sign flipped
# back to a positive loss) and the standard error of that mean.
with open(loss_out, 'w') as f:
    f.write('mean_CV_log_loss\tSE\n')
    best_idx = mean_neg_log_loss.argmax()
    summary_row = (-max(mean_neg_log_loss), sem_neg_log_loss[best_idx])
    f.write('%s\t%0.4f\n' % summary_row)