"""
This script calculates TF accessible motif site enrichment in TAD boundaries.
usage: python tf_enrichment.py 
input: see motifdir and filedir variables below
output: see files in output/ directory 
"""
import pandas as pd
import sys
import numpy as np
from scipy.stats import hypergeom
import sys

# HiC resolution used for analysis (not referenced by the code below)
binsize = 10000
# directory containing TADs
filedir = '../tgif_output/'

chros = ['chr' + str(i) for i in range(1, 23)]
timepoints = "day0,day2,day5,day7,day15,day80".split(",")
T = len(timepoints)


def tally_bins(boundaries, snp, num_timepoints=T):
    """Count bins for one chromosome.

    Parameters
    ----------
    boundaries : 2-D array
        One row per bin; column ``t`` is read as the significance flag for
        timepoint ``t``.  NOTE(review): assumes the first ``num_timepoints``
        columns are the timepoints in the order of ``timepoints`` and hold
        0/1 values -- confirm against the boundary file layout.
    snp : 1-D boolean array
        True where the bin contains at least one SNP; must have one entry
        per row of ``boundaries``.
    num_timepoints : int
        Number of leading columns of ``boundaries`` to process.

    Returns
    -------
    (successes, samples, expected, n)
        ``successes[t]``: bins that are both flagged at timepoint ``t`` and
        contain a SNP; ``samples[t]``: sum of the flag column ``t``;
        ``expected``: number of SNP-containing bins; ``n``: number of bins.
    """
    successes = np.zeros(num_timepoints)
    samples = np.zeros(num_timepoints)
    for t in range(num_timepoints):
        sig = boundaries[:, t]
        successes[t] = int(np.sum(np.logical_and(sig, snp)))
        samples[t] = int(np.sum(sig))
    expected = int(np.sum(snp))
    n = boundaries.shape[0]
    return successes, samples, expected, n


def enrichment_stats(successes, samples, expected, n):
    """Return (fold enrichment, enrichment p-value, depletion p-value).

    The population is ``n`` bins of which ``expected`` contain a SNP;
    ``samples`` bins are drawn (significant boundaries) and ``successes`` of
    them contain a SNP.

    * fold enrichment = (successes / samples) / (expected / n); it is
      nan or inf when a denominator is zero rather than raising.
    * enrichment p-value = P(X >= successes).  ``hypergeom.sf(k)`` is
      P(X > k), so the observed count is included by evaluating at k - 1.
    * depletion p-value = P(X <= successes).
    """
    # Use numpy scalars so a zero denominator yields nan/inf instead of
    # raising ZeroDivisionError; the warning is silenced deliberately.
    with np.errstate(divide='ignore', invalid='ignore'):
        b = np.float64(successes) / np.float64(samples)  # numerator in fold enrichment
        e = np.float64(expected) / np.float64(n)  # denominator in fold enrichment
        fold = b / e
    enrichment_pval = hypergeom.sf(successes - 1, n, expected, samples)
    depletion_pval = hypergeom.cdf(successes, n, expected, samples)
    return float(fold), float(enrichment_pval), float(depletion_pval)


def main(argv=None):
    """Accumulate counts over all chromosomes and write the enrichment table.

    ``argv`` defaults to ``sys.argv[1:]``; its first element is the
    ``parent_term`` used to locate the SNP-count files and to name the
    output file ``enrichment/<parent_term>_specific.txt``.
    """
    args = sys.argv[1:] if argv is None else argv
    if not args:
        sys.exit('usage: python {} <parent_term>'.format(sys.argv[0]))
    parent_term = args[0]

    successes = np.zeros(T)
    samples = np.zeros(T)
    expected = 0
    n = 0
    for chro in chros:
        snp = np.loadtxt('map_snps_to_bins/{}/{}_snp_count.txt'.format(parent_term, chro))
        snp = (snp > 0)  # per-bin presence/absence of SNPs
        df = pd.read_csv(filedir + '{:}/significant_boundaries_summit_only.txt'.format(chro), sep="\t", index_col=False)
        df.fillna(0, inplace=True)
        chro_successes, chro_samples, chro_expected, chro_n = tally_bins(df.values, snp)
        successes += chro_successes
        samples += chro_samples
        expected += chro_expected
        n += chro_n

    # All inputs are read before the output is opened, so a failed run does
    # not leave a truncated result file behind.
    with open('enrichment/{}_specific.txt'.format(parent_term), 'w') as o:
        o.write('Timepoint\tN\tExpected\tSamples\tSuccesses\tFold Enrichment\tEnrichment p-value\tDepletion p-value\n')
        for t, timepoint in enumerate(timepoints):
            fold, enrichment_pval, depletion_pval = enrichment_stats(
                int(successes[t]), int(samples[t]), expected, n)
            o.write('{}\t{:d}\t{:d}\t{:d}\t{:d}\t{:.5f}\t{:e}\t{:e}\n'.format(timepoint, int(n), int(expected), int(samples[t]), int(successes[t]), fold, enrichment_pval, depletion_pval))


if __name__ == "__main__":
    main()

