#!/usr/bin/env python
from sys import argv
from collections import Counter
import pandas as pd

from collections import defaultdict

# Command line:
#   argv[1]  comma-separated paths; each file lists the site names (one per
#            line) that overlap one factor
#   argv[2]  comma-separated factor names, in the same order as argv[1]
#   argv[3]  path to a file listing every site name (one per line)
#   argv[4]  dataset label (read for CLI compatibility; not used below)
#   argv[5]  output path for the "positive" table (tab-separated)
#   argv[6]  output path for the "negative" table (tab-separated)
#
# Example argument values:
#   lists = enhancers.with_GR_binding.matched_by_accessibility.names.txt,
#           enhancers.with_JunB_binding.matched_by_accessibility.names.txt,
#           enhancers.with_cJun_binding.matched_by_accessibility.names.txt,
#           enhancers.with_BCL3_binding.matched_by_accessibility.names.txt,
#           enhancers.with_FOSL2_binding.matched_by_accessibility.names.txt,
#           enhancers.with_CEBPB_binding.matched_by_accessibility.names.txt,
#           enhancers.with_HES2_binding.matched_by_accessibility.names.txt,
#           enhancers.with_CTCF_binding.matched_by_accessibility.names.txt
#   names = GR,JunB,cJun,BCL3,FOSL2,CEBPB,HES2,CTCF
#   all_sites = enhancers.matched_by_accessibility.names.txt
#   name = enhancers


def read_site_names(path):
    """Return the whitespace-stripped lines of *path*, in file order."""
    with open(path, "r") as handle:
        return [line.strip() for line in handle]


def count_overlap_signatures(all_sites, names, sites):
    """Count sites per combination of overlapping factors.

    all_sites -- iterable of site names to classify
    names     -- factor names, parallel to *sites*
    sites     -- one set of site names per factor

    Returns a Counter keyed by the sorted tuple of factor names whose set
    contains the site; the empty tuple collects sites found in no set.
    """
    return Counter(
        tuple(sorted(label for label, members in zip(names, sites)
                     if site in members))
        for site in all_sites
    )


def tally_by_overlap_count(names_overlapping, names):
    """Split combination counts by combination size and by factor.

    Returns ``(present, absent)``. Both map a combination size ``k`` to a
    ``defaultdict(int)``: ``present[k][name]`` is the number of sites whose
    combination has ``k`` factors and includes *name*; ``absent[k][name]``
    is the number whose combination has ``k`` factors and lacks *name*.
    """
    present = {}
    absent = {}
    # .items() (not .iteritems()) so this runs on both Python 2 and 3.
    for combo, n_sites in names_overlapping.items():
        size = len(combo)
        with_factor = present.setdefault(size, defaultdict(int))
        without_factor = absent.setdefault(size, defaultdict(int))
        for label in names:
            if label in combo:
                with_factor[label] += n_sites
            else:
                without_factor[label] += n_sites
    return present, absent


def build_fraction_table(counts_by_size, names, index):
    """Build a row-normalised DataFrame (rows: sizes, columns: factors).

    Sizes or factors missing from *counts_by_size* count as 0, so gaps in
    the observed sizes no longer raise KeyError. Each row is divided by its
    own sum; a row whose sum is 0 therefore comes out as NaN.
    """
    table = pd.DataFrame(index=index)
    for label in names:
        table[label] = [counts_by_size.get(size, {}).get(label, 0)
                        for size in index]
    return table.div(table.sum(axis=1), axis=0)


def main(args):
    """Run the analysis for an argv-style argument list and write both tables."""
    if len(args) < 7:
        raise SystemExit(
            "usage: %s LISTS NAMES ALL_SITES NAME OUT_POSITIVE OUT_NEGATIVE"
            % (args[0] if args else "script")
        )

    lists = args[1].split(",")
    names = args[2].split(",")
    all_sites_path = args[3]
    # args[4] (dataset label) is part of the CLI but is not used.
    out_positive = args[5]
    out_negative = args[6]

    # zip() would silently drop the unmatched tail, so fail loudly instead.
    if len(lists) != len(names):
        raise SystemExit(
            "got %d list files but %d names" % (len(lists), len(names))
        )

    sites = [set(read_site_names(path)) for path in lists]
    all_sites = read_site_names(all_sites_path)

    names_overlapping = count_overlap_signatures(all_sites, names, sites)
    num_by_names, num_NOT_by_names = tally_by_overlap_count(
        names_overlapping, names)

    if num_by_names:
        index = list(range(min(num_by_names), max(num_by_names) + 1))
    else:
        # No sites at all: emit header-only tables rather than crash in min().
        index = []

    df_num_by_names = build_fraction_table(num_by_names, names, index)
    df_num_NOT_by_names = build_fraction_table(num_NOT_by_names, names, index)

    df_num_by_names.to_csv(out_positive, sep="\t", index=True)
    df_num_NOT_by_names.to_csv(out_negative, sep="\t", index=True)


if __name__ == "__main__":
    main(argv)