import pandas as pd
import numpy as np
from scipy.stats import hypergeom

# Column names of the per-timepoint indicator columns in the boundaries table;
# the same labels are used as the CTCF peak file stems.
days = ['day' + str(i) for i in [0, 2, 5, 7, 15, 80]]
chroms = ['chr' + str(i) for i in range(1, 23)]


def enrichment_stats(has_peak, is_boundary):
    """Compute overlap counts, fold enrichment and a hypergeometric p-value.

    Parameters
    ----------
    has_peak : array-like of bool
        One flag per row: True where the row has a peak. Must not contain
        NaN (NaN would be coerced to True).
    is_boundary : array-like of bool or 0/1 int
        One flag per row: truthy where the row is selected for this
        timepoint. Must have the same length as ``has_peak``.

    Returns
    -------
    tuple
        ``(n, expected, samples, successes, fold_enrichment, pval)`` where
        ``n`` is the number of rows, ``expected`` the number of rows with a
        peak, ``samples`` the number of selected rows, ``successes`` the
        number of rows that are both, ``fold_enrichment`` is
        ``(successes / samples) / (expected / n)`` (NaN when undefined) and
        ``pval`` is P(X >= successes) under the hypergeometric null.
    """
    peak_mask = np.asarray(has_peak, dtype=bool)
    boundary_mask = np.asarray(is_boundary, dtype=bool)

    n = int(peak_mask.size)
    expected = int(peak_mask.sum())
    samples = int(boundary_mask.sum())
    successes = int((peak_mask & boundary_mask).sum())

    # The ratio is undefined when nothing was sampled or no row has a peak;
    # report NaN explicitly rather than relying on a divide-by-zero warning.
    if samples and expected:
        fold_enrichment = (successes / samples) / (expected / n)
    else:
        fold_enrichment = float('nan')

    # sf(k) is P(X > k); an enrichment p-value needs P(X >= successes),
    # hence the "- 1".
    pval = float(hypergeom.sf(successes - 1, n, expected, samples))
    return n, expected, samples, successes, fold_enrichment, pval


def main():
    """Write per-timepoint CTCF enrichment statistics to ctcf_enrichment.txt.

    Reads ``boundaries_per_timepoint.txt`` (tab-separated, with ``chro``,
    ``start`` and one column per entry of ``days``) and, for each day, the
    first two columns of ``../ctcf_peaks/<day>.bed``; joins them on
    ``(chro, start)`` and writes one result row per day.
    """
    bed_and_bound = pd.read_csv('boundaries_per_timepoint.txt', sep="\t", index_col=False)
    # Collapse the per-day values to 0/1 indicators.
    bed_and_bound[days] = (bed_and_bound[days] > 0).astype(int)
    print(bed_and_bound.tail(n=10))

    # The context manager guarantees the output is flushed and closed even if
    # one of the per-day reads or merges raises.
    with open('ctcf_enrichment.txt', 'w') as out:
        out.write('Day\tMethod\tn\texpected\tsamples\tsuccesses\tfold enrichment\tp-value\n')

        for day in days:
            ctcf = pd.read_csv(
                '../ctcf_peaks/{:}.bed'.format(day),
                sep="\t",
                index_col=False,
                header=None,
                names=['chro', 'start'],
                usecols=[0, 1],
            )
            # Several peaks sharing a (chro, start) key would otherwise
            # duplicate rows in the left merge and inflate every count.
            ctcf = ctcf.drop_duplicates()
            ctcf['hasPeak'] = True

            df = pd.merge(bed_and_bound, ctcf, on=['chro', 'start'], how='left')
            # Rows without a matching peak get NaN from the left merge.
            has_peak = df['hasPeak'].notna()

            n, expected, samples, successes, fold_enrichment, pval = enrichment_stats(
                has_peak, df[day]
            )
            out.write('{:}\tTADcompare\t{:d}\t{:d}\t{:d}\t{:d}\t{:.3f}\t{:.3e}\n'.format(
                day, n, expected, samples, successes, fold_enrichment, pval))


if __name__ == "__main__":
    main()
