#!/usr/bin/env python
import pandas as pd
import numpy as np
from sklearn.linear_model import SGDClassifier
from sklearn.preprocessing import scale
from sklearn.metrics import roc_curve
from sys import argv

def read_fit_params(path):
    """Return ``(l1_ratio, alpha)`` read from a fit-parameters file.

    The file is expected to hold a header line (ignored) followed by one
    line with exactly two whitespace-separated numbers: the l1 ratio first,
    then alpha.

    Raises:
        StopIteration: if the file has fewer than two lines.
        ValueError: if the second line does not hold exactly two floats.
    """
    with open(path, "r") as handle:
        next(handle)  # skip the header line
        l1_ratio, alpha = next(handle).strip().split()
    return float(l1_ratio), float(alpha)


def align_samples(predictors, labels):
    """Restrict both tables to the samples they share, in sorted order.

    Args:
        predictors: DataFrame of features, indexed by sample label.
        labels: DataFrame of targets, indexed by sample label.

    Returns:
        ``(predictors_aligned, y)`` where ``predictors_aligned`` is the
        predictor rows for the shared labels and ``y`` is the flattened
        array of the matching label values.

    Raises:
        ValueError: if the two tables have no sample label in common.
    """
    # Intersection, not union: a label missing from either table cannot be
    # used for fitting (it would yield NaN rows or a KeyError on lookup).
    shared = sorted(set(predictors.index) & set(labels.index))
    if not shared:
        raise ValueError("predictors and labels share no sample labels")
    # `.loc` replaces the `.ix` indexer, which was removed in pandas 1.0.
    return predictors.loc[shared], np.asarray(labels.loc[shared]).flatten()


def sgd_log_loss_name(sklearn_version):
    """Return the SGDClassifier ``loss`` name for logistic regression.

    scikit-learn 1.1 renamed ``"log"`` to ``"log_loss"`` and 1.3 removed the
    old spelling, so the right name depends on the installed version.

    Args:
        sklearn_version: version string such as ``"0.24.2"`` or ``"1.3.0"``.
    """
    parts = []
    for piece in sklearn_version.split(".")[:2]:
        digits = ""
        for char in piece:
            if not char.isdigit():
                break  # tolerate suffixes such as "18rc2"
            digits += char
        parts.append(int(digits or 0))
    while len(parts) < 2:
        parts.append(0)
    return "log_loss" if tuple(parts) >= (1, 1) else "log"


def main(args):
    """Fit an elastic-net logistic SGD model and write its ROC points.

    Args:
        args: command-line vector laid out like ``sys.argv``:
            ``args[1]`` predictors table, ``args[2]`` labels table (both
            tab-separated with the sample label in the first column),
            ``args[3]`` fit-parameters file (see ``read_fit_params``),
            ``args[4]`` output path.

    Example arguments::

        predictors.all.txt
        binary_output_06h.txt
        predict_binary_output_06h.by.all.fit_params.txt
        predict_binary_output_06h.by.all.tpr_fpr.txt

    The output is a tab-separated file with columns ``fpr`` and ``tpr``.
    Note that the ROC curve is computed on the same samples the model was
    fitted on.
    """
    if len(args) < 5:
        raise SystemExit(
            "usage: <script> PREDICTORS LABELS FIT_PARAMS OUT"
        )

    # Local import: only needed to pick the version-appropriate loss name.
    import sklearn

    predictors = pd.read_csv(args[1], sep="\t", index_col=0)
    labels = pd.read_csv(args[2], sep="\t", index_col=0)
    predictors, y = align_samples(predictors, labels)

    # Standardise every predictor column to zero mean and unit variance.
    X = scale(predictors)

    best_l1_ratio, best_alpha = read_fit_params(args[3])
    model = SGDClassifier(
        l1_ratio=best_l1_ratio,
        alpha=best_alpha,
        loss=sgd_log_loss_name(sklearn.__version__),
        penalty="elasticnet",
    )
    model.fit(X, y)

    # Column 1 of predict_proba is the probability of the second class
    # listed in model.classes_.
    scores = model.predict_proba(X)[:, 1]
    fpr, tpr, _ = roc_curve(y, scores)

    roc = pd.DataFrame({"fpr": fpr, "tpr": tpr}, columns=["fpr", "tpr"])
    roc.to_csv(args[4], sep="\t", index=False)


if __name__ == "__main__":
    main(argv)