import os
import sys
import re
import yaml
#import glob3
import numpy as np
import pandas as pd
from snakemake.utils import min_version
# Enforce a minimum Snakemake version
min_version("4.7")

onstart:
    # Banner printed once when the workflow starts.
    print("##########################################\n")
    print("# STARTING PIPELINE\n")
    # Fixed a mis-encoded character ('‰') so this line matches the 42-'#'
    # banner used everywhere else in the file.
    print("##########################################\n")
#    print ("Running ChIP-seq pre-processing workflow for the following samples:\n " + ' \n '.join(map(str, SAMPLES)))

onsuccess:
    # Closing banner, printed only when the whole pipeline finishes cleanly.
    _banner = (
        "##########################################\n",
        "# PIPELINE ENDED SUCCESSFULLY \n",
        "##########################################\n",
    )
    for _line in _banner:
        print(_line)

# Banner prepended to each rule's `message:` directive to mark the start of
# that rule's execution in the log.
ruleDisplayMessage = """\n\n####### START RULE EXECUTION ##########\n"""


"""
Since DiffBind loads all bam files in memory, these files should be moved to scratch (BAM_DIR_SCRATCH) before running this pipeline
Done with:
cp /g/furlong/project/68_F1_cisreg_ichip/data/ChIPseq/Alignments/with_indels/*  /scratch/sigalova/data/ChIPseq/Alignments/with_indels
"""

################################################################################
######################## PATHS AND DATA ########################################
################################################################################


### FOLDERS ON TIER1 ###
# Base directories are resolved from the workflow config.
PROJECT_DIR = config["global"]["projectdir"]
DATA_DIR  = PROJECT_DIR + "/" + config["project_structure"]["datadir"]
EXEC_DIR = PROJECT_DIR + "/" + config["project_structure"]["utils"]
ANALYSIS_DIR = PROJECT_DIR + "/" + config["project_structure"]["analysisdir"]
SCRATCH_DIR = config["global"]["scratchdir"]

# BAM_DIR = DATA_DIR + "/ChIPseq/Alignments/with_indels"
# BAMs are read from scratch because DiffBind loads them fully into memory
# (see the module docstring for the copy command).
BAM_DIR_SCRATCH = SCRATCH_DIR + "/ChIPseq/Alignments/with_indels"
PEAKS_DIR = DATA_DIR + "/ChIPseq/Peaks/with_indels"
# NOTE(review): unlike every other directory above, LOG_DIR is not prefixed
# with PROJECT_DIR and so resolves to a root-level "/log/..." path — confirm
# this is intentional and not a missing PROJECT_DIR prefix.
LOG_DIR = "/log/peaks_and_signal/with_indels/consensus_peaks"


### TOOLS ###
#Rscript = config["tools"]["by_path"]["Rscript"]["v4.0"]

### Data and parameters ###

# Individuals: sample IDs are read one per line; an "individual" is the first
# two underscore-separated fields of a sample ID (replicate suffix dropped).
individuals_path = config['data']['samples']['individual_ids']
# Use a context manager so the file handle is closed deterministically
# (the original left the handle open for the interpreter to collect).
with open(individuals_path, 'r') as ids_file:
    samples = ids_file.read().splitlines()
individuals = {"_".join(x.split("_")[0:2]) for x in samples}  # w/o replicates
print(individuals)

# TFs and time-points, encoded as "<antibody>/<time-point>"
conditions  = ["mef2/68", "mef2/1012", "bin/68", "bin/1012", "ctcf/68", "zld/24", "twi/24"]

# setting which input to use: 'merged_input' or 'unique_input'
input_types = ["merged_input", "unique_input"]

# IDR thresholds
IDR_thres = [0.05, 0.01]

# Min number of individuals that have a peak
num_individuals = [2, 3]


# test
# IDR_thres = 0.05
# conditions  = ["mef2/68"]
# input_types = ["merged_input"]
# num_individuals = 3

################################################################################
######################## FUNCTIONS #############################################
################################################################################

# Make the project's helper scripts importable (utils.py lives in EXEC_DIR).
sys.path.append(EXEC_DIR)
import utils as utl

# def get_input_file(tp, individual, input_type, dir = BAM_DIR):
#     if input_type == "unique_input":
#         input_path = dir + "/input." + str(tp) + "." + individual + "_1.keep.merge.rmdup.sort.bam"
#     elif input_type == "merged_input":
#         input_path = dir + "/merged_samples/input." + str(tp) + ".rmdup.sort.merged.bam"
#     else:
#         raise KeyError("Incorrect input_type: 'merged_input' or 'unique_input' accepted")
#     return input_path

################################################################################
######################## MAIN ##################################################
################################################################################

# Rules cheap enough to execute on the head node instead of the cluster.
localrules: all, get_samples_table

# Target rule: request every samples table and consensus peak set for the full
# cross-product of input types, conditions, IDR thresholds and
# minimum-individual cutoffs.
rule all:
    input:
        expand(PEAKS_DIR + "/{input_type}/Consensus_peaksets/{ab_tp}/Samples_table_{idr_thres}.csv", input_type = input_types, ab_tp = conditions, idr_thres = IDR_thres),
        expand(PEAKS_DIR + "/{input_type}/Consensus_peaksets/{ab_tp}/consensus_peaks_idr{idr_thres}_ind{num_ind}.bed", input_type = input_types, ab_tp = conditions, idr_thres = IDR_thres, num_ind = num_individuals),
        # expand(PEAKS_DIR + "/{input_type}/Consensus_peaksets/{ab_tp}/consensus_peaks_mean_sd.idr{idr_thres}.txt", input_type = input_types, ab_tp = conditions, idr_thres = IDR_thres)


# Keep the {input_type} wildcard from greedily matching other path components.
wildcard_constraints:
    input_type = "merged_input|unique_input"

########################################################
########### Construct consensus sets ###################
########################################################

# Build the per-condition DiffBind samples table, keeping only peaks that pass
# the given IDR threshold. Double-braced {{...}} placeholders stay as rule
# wildcards; expand() only fills in {individual}.
# NOTE(review): '{samples}' in the shell command interpolates the Python list
# repr of the sample IDs into the -i argument — confirm that
# construct_diffbind_samples_table.R parses that format.
rule get_samples_table:
    input:
        peaks = expand(PEAKS_DIR + "/{{input_type}}/Significant_peaks/{{ab}}/{{tp}}/{individual}_IDR_peaks_{{idr_thres}}.bed", individual = individuals),
        idr_file = ANALYSIS_DIR + "/ChIPseq/IDR/with_indels/{input_type}/{ab}/{tp}/idr_significant_peaks.csv"
    output: temp(PEAKS_DIR + "/{input_type}/Consensus_peaksets/{ab}/{tp}/Samples_table_{idr_thres}.csv")
    message: "{ruleDisplayMessage}Construct DiffBind samples table for {wildcards.ab} at {wildcards.tp}, IDR={wildcards.idr_thres}..."
    params:
        peaks_dir = PEAKS_DIR + "/{input_type}/Significant_peaks/{ab}/{tp}",
        bam_dir = BAM_DIR_SCRATCH
    threads: 1
    shell:
        """
        Rscript  {EXEC_DIR}/construct_diffbind_samples_table.R --filter_idr --idr_file {input.idr_file}  \
                 -p {params.peaks_dir} -i '{samples}' -b {params.bam_dir} \
                 -o {output} --idr_thres {wildcards.idr_thres} --ab {wildcards.ab} --tp {wildcards.tp}
        """

# Merge the per-individual peak sets listed in the samples table into one
# consensus BED, requiring a peak in at least {num_ind} individuals.
# NOTE(review): params.out_dir points under ANALYSIS_DIR while the declared
# output lives under PEAKS_DIR (DATA_DIR) — confirm this divergence is
# intentional, since Snakemake only tracks the PEAKS_DIR output.
rule get_consensus_peak_set:
    input: rules.get_samples_table.output
    output: PEAKS_DIR + "/{input_type}/Consensus_peaksets/{ab}/{tp}/consensus_peaks_idr{idr_thres}_ind{num_ind}.bed"
    message: "{ruleDisplayMessage}Get consensus peak set for {wildcards.ab}, {wildcards.input_type} at {wildcards.tp}, IDR={wildcards.idr_thres}, # individuals={wildcards.num_ind}..."
    params:
        out_dir = ANALYSIS_DIR + "/ChIPseq/Peaks/with_indels/{input_type}/Consensus_peaksets/{ab}/{tp}/",
        num_individuals = "{num_ind}"
    threads: 1
    shell:
        """
        mkdir -p {params.out_dir};
        Rscript {EXEC_DIR}/construct_consensus_peak_set.R -p {params.out_dir} -n {params.num_individuals} \
                -i {input} -o {output}
        """

# rule get_consensus_mean_and_sd:
#     input:
#         scores = rules.get_consensus_peak_set.output.table,
#         idr = ANALYSIS_DIR + "/ChIPseq/IDR/with_indels/{input_type}/{ab}/{tp}/idr_significant_peaks.csv"
#     output: PEAKS_DIR + "/{input_type}/Consensus_peaksets/{ab}/{tp}/consensus_peaks_mean_sd.idr{idr_thres}.txt",
#     message: "{ruleDisplayMessage}Calculate mean and variation of consensus peak set for {wildcards.ab}, {wildcards.input_type} at {wildcards.tp}, IDR={wildcards.idr_thres}..."
#     params:
#         plot_dir = PEAKS_DIR + "/{input_type}/Consensus_peaksets/{ab}/{tp}/"
#     threads: 1
#     # envmodules:
#     #     "R-bundle-Bioconductor-GBCS/3.10-foss-2019b-R-3.6.2"
#     shell:
#         """
#         Rscript {EXEC_DIR}/calculate_variance_of_consensus_peaks.R   \
#         -i {input.scores} -o {output} -p {params.plot_dir} \
#         --idr_file {input.idr} --idr_thres {wildcards.idr_thres}
#         """
