#!/usr/bin/env python


import os


import sys


from Bio import SeqIO
from collections import defaultdict, Counter
from tqdm import tqdm_notebook as tqdm

# Root directory that holds one sub-directory per method; taken from the
# first command-line argument.
if len(sys.argv) < 2:
    sys.exit('usage: {} <directory>'.format(sys.argv[0]))
directory = sys.argv[1]

# Sorted names of the per-method sub-directories.  Hidden entries are skipped,
# and so are plain files: every name kept here is later passed to
# os.listdir(), which raises on anything that is not a directory.
methods_dir = sorted(
    method for method in os.listdir(directory)
    if not method.startswith('.')
    and os.path.isdir(os.path.join(directory, method))
)







# seqs[method][(record id, sequence string)] -> name of the group file the
# record was read from.
seqs = defaultdict(dict)
# groups[method][group file name] -> number of records parsed from that file.
groups = defaultdict(dict)

# Minimum number of records a group file must contain to be loaded.
CUT_OFF = 1

# Only files whose names end with one of these suffixes are parsed as FASTA.
GROUP_SUFFIXES = ('.fa', '.fasta', '.out')

for method in methods_dir:
    method_path = os.path.join(directory, method)
    # Sorted so that, when the same (id, sequence) pair occurs in several
    # group files of one method, the file recorded in `seqs` does not depend
    # on the arbitrary order in which os.listdir() returns entries.
    group_fns = sorted(
        fn for fn in os.listdir(method_path) if fn.endswith(GROUP_SUFFIXES)
    )
    for group_fn in group_fns:
        group_seqs = list(SeqIO.parse(os.path.join(method_path, group_fn), 'fasta'))
        if len(group_seqs) < CUT_OFF:
            continue
        groups[method][group_fn] = len(group_seqs)
        for s in group_seqs:
            # A later file overwrites an earlier one for an identical key.
            seqs[method][(s.id, str(s.seq))] = group_fn




from itertools import combinations
import csv
import sys


# Compare every unordered pair of methods and write one report per pair to
# '<method1>vs<method2>.tsv' in the current working directory.
for method1, method2 in combinations(methods_dir, 2):
    # For each (method1 group, method2 group) pair, count how many
    # (id, sequence) records are present in both groups.
    shared_keys = seqs[method1].keys() & seqs[method2].keys()
    overlap = Counter((seqs[method1][k], seqs[method2][k]) for k in shared_keys)

    # Groups that do not appear in any overlapping pair, i.e. share no record
    # with any group of the other method.
    method1_missing = set(groups[method1]) - {g1 for g1, _ in overlap}
    method2_missing = set(groups[method2]) - {g2 for _, g2 in overlap}

    # newline='' is what the csv module asks for on files handed to
    # csv.writer; without it the writer's '\r\n' row terminator is translated
    # a second time on platforms that use '\r\n' line endings.
    with open('{}vs{}.tsv'.format(method1, method2), 'wt', newline='') as fp:
        # Summary lines, prefixed with '#'.
        fp.write('# {} vs {}\n'.format(method1, method2))
        fp.write('# {} has {} overlapping groups\n'.format(method1, len(groups[method1]) - len(method1_missing)))
        fp.write('# {} has {} overlapping groups\n'.format(method2, len(groups[method2]) - len(method2_missing)))
        fp.write('# {} has {} non-overlapping groups\n'.format(method1, len(method1_missing)))
        fp.write('# {} has {} non-overlapping groups\n'.format(method2, len(method2_missing)))
        fp.write('# Overlapping groups:\n')

        writer = csv.writer(fp, delimiter='\t')
        # Header
        writer.writerow(['method1_group', 'method2_group', 'overlap_count', 'method1_missing', 'method2_missing'])
        # One row per overlapping pair: the shared record count, then the
        # number of records of each group that are not part of that overlap.
        # Rows are sorted by group names so the output is reproducible; the
        # Counter's own order follows set iteration and varies between runs.
        writer.writerows(
            [g1, g2, count, groups[method1][g1] - count, groups[method2][g2] - count]
            for (g1, g2), count in sorted(overlap.items())
        )




