#!/usr/bin/env python
import pandas as pd
from sklearn.decomposition import PCA
from sklearn.preprocessing import scale
import numpy as np
from sys import argv

# Command-line arguments (all are file paths):
#   1. input table, tab-separated; its first column is used as the row index
#   2. PC_var_explained_out: explained-variance ratio of every component
#   3. PC_loadings_out:      component loadings, one row per input column
#   4. df_transformed_out:   PC1 score of every input row
#
# Example argument values:
#   EP300_H3K27ac_cJun.in.enhancers.log2CPM.t15.iter_short.txt
#   EP300_H3K27ac_cJun.in.enhancers.log2CPM.t15.iter_short.PCA_var_explained.txt
#   EP300_H3K27ac_cJun.in.enhancers.log2CPM.t15.iter_short.PC_loadings.txt
#   EP300_H3K27ac_cJun.in.enhancers.log2CPM.t15.iter_short.PC1_transformed.txt
# or
#   EP300_H3K27ac_cJun.initial_log2CPM.in.enhancers.iter_short.txt
#   EP300_H3K27ac_cJun.initial_log2CPM.in.enhancers.iter_short.PCA.var_explained.txt
#   EP300_H3K27ac_cJun.initial_log2CPM.in.enhancers.iter_short.PCA.loadings.txt
if len(argv) < 5:
    # Fail with a readable usage line instead of a bare IndexError.
    raise SystemExit(
        "usage: {} <input.tsv> <var_explained_out> <loadings_out> "
        "<PC1_transformed_out>".format(argv[0])
    )

# Extra trailing arguments are ignored, as they were before.
input_path, PC_var_explained_out, PC_loadings_out, df_transformed_out = argv[1:5]

df = pd.read_csv(input_path, sep="\t", index_col=0)

# Standardise every column (zero mean, unit variance) before the PCA.
X = scale(np.asarray(df))

# PCA cannot extract more components than min(n_rows, n_columns); asking for
# X.shape[1] would raise a ValueError on tables with fewer rows than columns.
pca = PCA(n_components=min(X.shape))
# fit_transform fits the model and projects the data in a single pass,
# so no separate fit() call is needed.
X_transformed = pca.fit_transform(X)

# Components are labelled 1..k in both output tables.
pc_labels = range(1, 1 + pca.n_components_)

# Rows: input columns; columns: principal components.
PC_loadings_df = pd.DataFrame(pca.components_.T, index=df.columns, columns=pc_labels)

# Single row holding the fraction of variance explained by each component.
PC_var_explained_df = pd.DataFrame(
    [pca.explained_variance_ratio_],
    index=["variance explained"],
    columns=pc_labels,
)

# Score of every input row on the first component, keyed by the input index.
# Built as its own frame so the input table is not modified.
PC1_df = pd.DataFrame({"PC1": X_transformed[:, 0]}, index=df.index)

PC_loadings_df.to_csv(PC_loadings_out, sep="\t")
PC_var_explained_df.to_csv(PC_var_explained_out, sep="\t")
PC1_df.to_csv(df_transformed_out, sep="\t")