"""
This script calculates TF accessible motif site enrichment in TAD boundaries.
usage: python tf_enrichment.py 
input: see motifdir and filedir variables below
output: see files in output/ directory 
"""
import pandas as pd
import sys
import numpy as np
from scipy.stats import hypergeom
import sys

# HiC resolution used for analysis (kept for reference; not referenced by the
# code below)
binsize = 10000
# directory containing TADs
filedir = '../tgif_output/'

# autosomes chr1 .. chr22
chros = ['chr' + str(i) for i in range(1, 23)]


def tally_bins(sig, snp, label=''):
    """Count the bins of one chromosome for the hypergeometric test.

    sig   -- 1-D boolean array-like, True where a bin is a significant boundary
    snp   -- 1-D boolean array-like, True where a bin has at least one SNP
    label -- optional name (e.g. the chromosome) used in the error message

    Returns a tuple of ints (successes, samples, expected, n):
      successes -- bins that are both a boundary and SNP-containing
      samples   -- boundary bins
      expected  -- SNP-containing bins
      n         -- all bins

    Raises ValueError when the two masks do not have the same shape, because
    an element-wise comparison of misaligned bins would be meaningless (and a
    length-1 mask would otherwise broadcast silently).
    """
    sig = np.atleast_1d(np.asarray(sig, dtype=bool))
    snp = np.atleast_1d(np.asarray(snp, dtype=bool))
    if sig.shape != snp.shape:
        raise ValueError(
            'bin count mismatch{}: boundaries {} vs SNP counts {}'.format(
                ' for ' + label if label else '', sig.shape, snp.shape))
    return (int(np.sum(sig & snp)), int(np.sum(sig)),
            int(np.sum(snp)), int(sig.shape[0]))


def enrichment_stats(successes, samples, expected, n):
    """Return (fold_enrichment, enrichment_pval, depletion_pval).

    The population is n bins of which `expected` contain a SNP; `samples`
    boundary bins are drawn and `successes` of them contain a SNP.

    fold_enrichment is (successes/samples) / (expected/n); it is NaN when any
    of the denominators is zero instead of raising ZeroDivisionError.
    enrichment_pval is P(X >= successes) and depletion_pval is
    P(X <= successes) under the hypergeometric null.
    """
    if samples > 0 and expected > 0 and n > 0:
        observed_rate = 1.0 * successes / samples    # numerator in fold enrichment
        background_rate = 1.0 * expected / n         # denominator in fold enrichment
        fold = observed_rate / background_rate
    else:
        fold = float('nan')
    depletion_pval = float(hypergeom.cdf(successes, n, expected, samples))
    # sf(k) is P(X > k); subtract 1 so the observed count itself is included
    # in the upper tail.
    enrichment_pval = float(hypergeom.sf(successes - 1, n, expected, samples))
    return fold, enrichment_pval, depletion_pval


def main(argv=None):
    """Pool counts over all chromosomes and write enrichment/<parent_term>.txt.

    argv -- argument list (defaults to sys.argv); argv[1] is the parent term
            that selects the map_snps_to_bins/<parent_term>/ input directory.
    """
    argv = sys.argv if argv is None else argv
    if len(argv) < 2:
        sys.exit('usage: python {} <parent_term>'.format(
            argv[0] if argv else 'tf_enrichment.py'))
    parent_term = argv[1]

    successes = 0
    samples = 0
    expected = 0
    n = 0
    for chro in chros:
        sig = pd.read_csv(
            '{}{}/significant_boundaries_summit_only.txt'.format(filedir, chro),
            sep="\t", index_col=False)
        # a bin counts as a boundary if any column is positive (NaN -> 0)
        sig = np.sum(sig.fillna(0).values, axis=1) > 0

        snp = np.loadtxt(
            'map_snps_to_bins/{}/{}_snp_count.txt'.format(parent_term, chro)) > 0

        chro_successes, chro_samples, chro_expected, chro_n = tally_bins(
            sig, snp, label=chro)
        successes += chro_successes
        samples += chro_samples
        expected += chro_expected
        n += chro_n

    fold, enrichment_pval, depletion_pval = enrichment_stats(
        successes, samples, expected, n)

    # Open the output only after every input was read successfully, so a
    # failed run does not leave a header-only result file behind.
    with open('enrichment/{}.txt'.format(parent_term), 'w') as o:
        o.write('N\tExpected\tSamples\tSuccesses\tFold Enrichment\tEnrichment p-value\tDepletion p-value\n')
        o.write('{:d}\t{:d}\t{:d}\t{:d}\t{:.5f}\t{:e}\t{:e}\n'.format(
            n, expected, samples, successes, fold, enrichment_pval, depletion_pval))


if __name__ == '__main__':
    main()

