"""
Snakemake implementation of some of the GREAT enrichment tests for cis-regulatory regions, applied to the NMRDMR data.

Implementation is not optimized, nor tested on other datasets.
"""

import pandas as pd
from pretty_html_table import build_table


## Parse config args
# Every option is read from the Snakemake `config` dict; invalid values raise a
# ValueError naming the offending key, so a bad command line fails with a clear message.

SP = config.get("sp", "Fdam")
if SP not in ("Fdam", "Hgla", "Cpor"):
    raise ValueError(f"Invalid config value sp={SP!r}: expected 'Fdam', 'Hgla' or 'Cpor'.")

TISS = config.get("t", "Heart") # Heart or Liver
REG = config.get("r", "Enhancers") # Enhancers or Promoters
BR = config.get("b", "ancestral_MR_branch") # ancestral_MR_branch, Hgla_branch or Fdam_branch
SENS = config.get("s", "UP")

MAPPABLE = config.get("m", 'y')
if MAPPABLE not in ("y", "n"):
    raise ValueError(f"Invalid config value m={MAPPABLE!r}: expected 'y' or 'n'.")

# Any non-empty value of `reduce` switches the redundancy filter on; RED then
# becomes the suffix appended to the default output folder name.
RED = config.get("reduce", "")
if RED != "":
    if MAPPABLE != 'y':
        raise ValueError("Config option 'reduce' requires m='y'.")
    RED = '_remove_redundant'

OUT = config.get("outfolder", f"GREAT_GO_RESULTS_{SP}{RED}")

DOMAIN = config.get("go_domain", 'BP').upper()
if DOMAIN not in ("BP", "MF", "CC"):
    raise ValueError(f"Invalid config value go_domain={DOMAIN!r}: expected 'BP', 'MF' or 'CC'.")


# When `rand` is given, results go to a folder suffixed with `_rand_<int>` and the
# foreground is replaced by a random draw from the background (see the rules below).
RAND = config.get('rand', None)
rand_nb = ''
if RAND is not None:
    rand_nb = '_rand_'+str(int(RAND))

## Hard-coded path to input and outputs, given config args

# Folder receiving the results of this run (suffixed when randomization is on).
_RESULTS_DIR = f'{OUT}{rand_nb}'

if MAPPABLE == "y":
    # Mappable elements: foreground is one up/down set for a branch,
    # background is the set of orthologous elements.
    BACKGROUND = f"../../dataset_s2/orthologous_elements/{TISS}/{SP}/{SP}_{REG.lower()}_orthologs.bed"
    FOREGROUND = f"../../dataset_s3/up_down_elements/{BR}/{SP}/{REG}_{TISS}_{SENS}.bed"

    _RUN_TAG = f'{REG}_{TISS}_{BR}_{SENS}'
    GENETABLE = f'{_RESULTS_DIR}/all_mappable_for_{_RUN_TAG}_genetable.tsv'

else:
    # Non-mappable elements: both sets come from the de novo repeats folder.
    FOREGROUND = f"../../repeats_denovo/{REG}_{TISS}_{SP}_non-mappable_enr_repeats.bed"
    BACKGROUND = f"../../repeats_denovo/{REG}_{TISS}_{SP}_non-mappable.bed"

    _RUN_TAG = f'{REG}_{TISS}_{SP}_REPEATS'
    # The gene table name does not carry the species, unlike the other outputs.
    GENETABLE = f'{_RESULTS_DIR}/all_non-mappable_for_{REG}_{TISS}_REPEATS_genetable.tsv'

# Report paths share one prefix; only the suffix identifies the test.
_PREFIX = f'{_RESULTS_DIR}/{_RUN_TAG}'

OUTGENES = f'{_PREFIX}_genes_enrich.html'
OUTGO = f'{_PREFIX}_go_reg_hypergeom_vs_backgrd.html'
OUTGO2 = f'{_PREFIX}_go_genes_hypergeom_vs_genome.html'
OUTGO3 = f'{_PREFIX}_go_reg_binom_vs_genome.html'

OUTGO_VS_GENOME = f'{_PREFIX}_go_vs_genome.html'


## WORKFLOW ##

rule Target:
    """
    Default target: the gene-level report, the GO test against the background,
    and the combined 'vs genome' reports for GO and for c2 pathways.
    """
    input:
        OUTGENES,
        OUTGO,
        OUTGO_VS_GENOME,
        OUTGO_VS_GENOME.replace('_go_', '_c2_')
        

rule regulatory_domains:
    """
    Build the gene regulatory domains (GREAT default association rule) from
    the TSS bed file and the chromosome sizes of the selected species.
    """
    input:
        g = f"data/tss_longest_coding_transcript_{SP}_sorted.bed",
        c = f"data/{SP}_chrSizes.txt"
    output:
        f'{OUT}/reg_domains_{SP}.tsv'
    shell:
        'python great_reg_domains.py --output {output} --genes {input.g} --chr {input.c}'


rule propagate_go:
    """
    Walk up the GO DAG so that each gene is annotated with all ancestors of its
    GO terms, restricted to the configured GO domain.
    """
    input:
        go_genes = f"data/{SP}_GO.tsv",
        godag = "data/go-basic.obo"
    output:
        out1 = f"data/{SP}_gene_go_full.pkl",
        out2 = f"data/{SP}_go_infos.pkl",
        out3 = f"data/{SP}_gene_names.pkl"
    params:
        dom = DOMAIN
    shell:
        "python propagate_go_terms.py --go_domain {params.dom} -g {input.go_genes} "
        "-gd {input.godag} -o1 {output.out1} -o2 {output.out2} -o3 {output.out3}"


if RAND is None:

    rule pass_randomization:
        """
        No randomization requested: the foreground is copied as is.
        """
        input:
            FOREGROUND
        output:
            OUTGENES.replace("_genes_enrich.html", '_foreground_reg.bed')
        shell:
            "cp {input} {output}"

else:

    rule randomize_foreground:
        """
        Control run: replace the foreground by as many regions drawn at random from
        the background, to check that random regions do not enrich in specific functions.
        """
        input:
            f = FOREGROUND,
            b = BACKGROUND
        output:
            OUTGENES.replace("_genes_enrich.html", '_foreground_reg.bed')
        # x = number of lines of the foreground; shuf draws x lines from the background.
        shell: """
        x=$(wc -l < "{input.f}") && shuf -n $x {input.b} | bedtools sort > {output}
        """


if RED == "":
    rule foreground_regions_cp:
        """
        Dummy rule in case redundancy filter is not used: the foreground is copied as is.
        """
        input: OUTGENES.replace("_genes_enrich.html", '_foreground_reg.bed')
        output: OUTGENES.replace("_genes_enrich.html", '_foreground_reg_ok.bed')
        shell: "cp {input} {output}"

else:
    rule filter_redundant_reg:
        """
        Reduce redundancy before go tests: remove elements found differentially active in > 1 branch.

        The lines of the same element set in the other branches are collected
        in `tmp`, then every foreground line matching one of them is dropped.
        """
        input:
            target = OUTGENES.replace("_genes_enrich.html", '_foreground_reg.bed'),
            other = expand("../../dataset_s3/up_down_elements/{branches}/"+SP+"/"+REG+"_"+TISS+"_"+SENS+".bed",
                           branches=[i for i in ["Fdam_branch", "Hgla_branch", "ancestral_MR_branch"] if i!=BR])
        output:
            o = OUTGENES.replace("_genes_enrich.html", '_foreground_reg_ok.bed'),
            tmp = f'{OUT}/spec_branches_{REG}_{TISS}_{SENS}_{BR}_elements.txt'
        # grep exits with status 1 when it selects no line, i.e. when every foreground
        # element is redundant. That is a valid (empty) result, not an error, so status 1
        # is accepted; any other non-zero status (a real grep failure) still fails the rule.
        shell:
            "cat {input.other} | sort | uniq > {output.tmp} && "
            "(grep -vFwf {output.tmp} {input.target} || [ $? -eq 1 ]) > {output.o}"



rule enrichment_tests_go:
    """
    GREAT enrichment tests over GO terms, using the gene annotations written
    by propagate_go. Writes the gene-level table, the three GO test tables
    and the gene table.
    """
    input:
        d = f'{OUT}/reg_domains_{SP}.tsv',
        fground = OUTGENES.replace("_genes_enrich.html", '_foreground_reg_ok.bed'),
        bground = BACKGROUND,
        go_annot = f"data/{SP}_gene_go_full.pkl",
        go_infos = f"data/{SP}_go_infos.pkl",
        gene_names = f"data/{SP}_gene_names.pkl"
    output:
        genes = OUTGENES.replace(".html", ".tsv"),
        go1 = OUTGO.replace(".html", ".tsv"),
        go2 = OUTGO2.replace(".html", ".tsv"),
        go3 = OUTGO3.replace(".html", ".tsv"),
        genetable = GENETABLE
    params:
        # Forwarded to the script as --max_size (config key: max_size).
        size = config.get("max_size", 1000)
    shell:
        "python great_enrichment_tests.py -d {input.d} -f {input.fground} -b {input.bground} -g3 {input.gene_names}"
        " -g1 {input.go_annot} -g2 {input.go_infos} -o1 {output.go1} -o2 {output.go2} -o3 {output.go3} "
        "-o {output.genes} --max_size {params.size} -ot {output.genetable}"



rule enrichment_tests_c2pathways:
    """
    GREAT enrichment tests over c2 pathways. Only the two 'vs genome' tables
    are written; no background file is passed to the script.
    """
    input:
        d = f'{OUT}/reg_domains_{SP}.tsv',
        fground = OUTGENES.replace("_genes_enrich.html", '_foreground_reg_ok.bed'),
        go_annot = f"data/{SP}_c2_pathways.pkl",
        gene_names = f"data/{SP}_gene_names.pkl"
    output:
        res2 = OUTGO2.replace(".html", ".tsv").replace("_go_", "_c2_"),
        res3 = OUTGO3.replace(".html", ".tsv").replace("_go_", "_c2_")
    params:
        # Forwarded to the script as --max_size (config key: max_size).
        size = config.get("max_size", 1000)
    shell:
        "python great_enrichment_tests.py -d {input.d} -f {input.fground} -g3 {input.gene_names}"
        " -g1 {input.go_annot} -o2 {output.res2} -o3 {output.res3} "
        "--max_size {params.size} --fdr 0.1"


rule go_vs_genome:
    """
    Merge, per GO term, the results of the GREAT hypergeometric test over genes
    with those of the GREAT binomial test over regions.
    """
    input:
        hypergeom_genes = OUTGO2.replace(".html", ".tsv"),
        binom_reg = OUTGO3.replace(".html", ".tsv")
    output:
        out = OUTGO_VS_GENOME.replace(".html", ".tsv")
    run:
        hyper = pd.read_csv(input.hypergeom_genes, sep='\t')
        binom = pd.read_csv(input.binom_reg, sep='\t')

        # Drop the columns that would otherwise be duplicated across the two tables.
        hyper = hyper.drop(columns=['Name'])
        binom = binom.drop(columns=['Genes', 'Gene_IDS', 'Regions'])

        # Prefix the statistics with the name of the test they come from.
        hyper = hyper.rename(
            columns={'BH p-value': 'Hyper BH p-value', 'Enrichment': 'Hyper Enrichment'}
        )
        binom = binom.rename(
            columns={'BH p-value': 'Binom BH p-value', 'Enrichment': 'Binom Enrichment'}
        )

        # Inner join: only terms reported by both tests are kept.
        combined = binom.merge(hyper, on="GO", how="inner")
        combined.to_csv(output.out, sep='\t', index=False)


#TODO use wildcards here for DRY
rule c2_vs_genome:
    """
    Merge, per c2 pathway, the results of the GREAT hypergeometric test over genes
    with those of the GREAT binomial test over regions.
    """
    input:
        hypergeom_genes = OUTGO2.replace(".html", ".tsv").replace("_go_", "_c2_"),
        binom_reg = OUTGO3.replace(".html", ".tsv").replace("_go_", "_c2_")
    output:
        out = OUTGO_VS_GENOME.replace(".html", ".tsv").replace("_go_", "_c2_")
    run:
        hyper = pd.read_csv(input.hypergeom_genes, sep='\t')
        binom = pd.read_csv(input.binom_reg, sep='\t')

        # 'Name' is removed from both tables here.
        hyper = hyper.drop(columns=['Name'])
        binom = binom.drop(columns=['Genes', 'Name', 'Gene_IDS', "Regions"])

        # Prefix the statistics with their test name; the 'GO' column is relabelled 'Pathway'.
        hyper = hyper.rename(
            columns={'BH p-value': 'Hyper BH p-value', 'Enrichment': 'Hyper Enrichment', 'GO':'Pathway'}
        )
        binom = binom.rename(
            columns={'BH p-value': 'Binom BH p-value', 'Enrichment': 'Binom Enrichment', 'GO':'Pathway'}
        )

        # Inner join: only pathways reported by both tests are kept.
        combined = binom.merge(hyper, on="Pathway", how="inner")
        combined.to_csv(output.out, sep='\t', index=False)


rule format_html_tables:
    """
    Format results as html tables: any tsv of the results folder is rendered
    to an html file of the same name.
    """
    input:
        tsv = OUT+rand_nb+"/{table}.tsv"
    output:
        html = OUT+rand_nb+"/{table}.html"
    run:
        # 'Gene_IDS' is left out of the report when the table has such a column.
        table = pd.read_csv(input.tsv, sep='\t').drop(columns=['Gene_IDS'], errors='ignore')
        rendered = build_table(table, 'blue_light')
        with open(output.html, 'w') as handle:
            handle.write(rendered)
