#!/usr/bin/env python
import numpy as np
import pandas as pd
from GGR import utils
from sys import argv

# Usage:
#   script.py TF_binding factors timepoints TF_binding_corr
#
# Example argument values:
#   TF_binding      = "TF_binding.in.distal_non_p300_DHSs.mean.txt"
#   factors         = "GR,EP300,JunB,FOSL2,HES2,BCL3,cJun,CEBPB,CTCF"
#   timepoints      = "t00,t05,t1,t2,t3,t4,t5,t6,t7,t8,t10,t12"
#   TF_binding_corr = "TF_binding.in.distal_non_p300_DHSs.mean.corr.txt"


def rowwise_pearson(x, y):
    """Return the Pearson correlation between matching rows of *x* and *y*.

    Parameters
    ----------
    x, y : 2-D float arrays of identical shape (rows, timepoints).

    Returns
    -------
    1-D array with one correlation per row.  A row in which either input
    has zero variance (e.g. an all-zero vector) yields ``nan``, matching
    what ``np.corrcoef`` produces for that row.
    """
    x_centered = x - x.mean(axis=1, keepdims=True)
    y_centered = y - y.mean(axis=1, keepdims=True)
    covariance = (x_centered * y_centered).sum(axis=1)
    x_norm = np.sqrt((x_centered ** 2).sum(axis=1))
    y_norm = np.sqrt((y_centered ** 2).sum(axis=1))
    # 0/0 for constant rows is expected and handled downstream via nanmean,
    # so silence the floating-point warnings instead of spamming stderr.
    with np.errstate(divide="ignore", invalid="ignore"):
        corr = covariance / x_norm / y_norm
    # Guard against rounding pushing |r| marginally above 1 (np.corrcoef
    # clips the same way); nan values pass through clip unchanged.
    return np.clip(corr, -1.0, 1.0)


def mean_pairwise_corr(binding, factors, timepoints):
    """Average, over all rows, the per-row correlation of every factor pair.

    Parameters
    ----------
    binding : pandas.DataFrame holding one column per "<factor>.<timepoint>"
        combination.
    factors : list of factor names.
    timepoints : list of timepoint labels, in the order they are correlated.

    Returns
    -------
    pandas.DataFrame, symmetric, indexed and labelled by *factors*, with 1 on
    the diagonal and the nan-ignoring mean row-wise correlation elsewhere.

    Raises
    ------
    KeyError if a required "<factor>.<timepoint>" column is missing.
    """
    # Extract each factor's (rows x timepoints) matrix once, rather than
    # re-slicing the DataFrame for every pair.
    profiles = [
        np.asarray(
            binding[["%s.%s" % (factor, timepoint) for timepoint in timepoints]],
            dtype=float,
        )
        for factor in factors
    ]

    n_factors = len(factors)
    # A factor is perfectly correlated with itself, hence the unit diagonal.
    mean_corr = np.eye(n_factors)
    for i in range(n_factors):
        for j in range(i):
            # Progress output; formatted so it prints the same text on
            # Python 2 and Python 3.
            print("%s %s" % (factors[i], factors[j]))
            corrs = rowwise_pearson(profiles[i], profiles[j])
            # ignore np.nans which result from correlating zero vectors
            mean_corr[i, j] = mean_corr[j, i] = np.nanmean(corrs)

    return pd.DataFrame(mean_corr, columns=factors, index=factors)


def main(args):
    """Command-line entry point.

    *args* is an ``argv``-style list:
    ``[prog, TF_binding, factors, timepoints, TF_binding_corr]`` where
    ``factors`` and ``timepoints`` are comma-separated strings, ``TF_binding``
    is a tab-separated input table whose first column is the row index, and
    ``TF_binding_corr`` is the path of the tab-separated output matrix.
    """
    if len(args) < 5:
        raise SystemExit(
            "usage: %s TF_binding factors timepoints TF_binding_corr"
            % (args[0] if args else "script")
        )

    binding_path = args[1]
    factors = args[2].split(",")
    timepoints = args[3].split(",")
    output_path = args[4]

    binding = pd.read_csv(binding_path, index_col=0, sep="\t")
    mean_corr = mean_pairwise_corr(binding, factors, timepoints)
    mean_corr.to_csv(output_path, sep="\t", header=True, index=True)


if __name__ == "__main__":
    main(argv)
  