
import os
import sys
import re
import yaml
import pandas as pd
from snakemake.utils import min_version
import glob
# Enforce a minimum Snakemake version (4.7) up front, before any rule
# definition below is reached.
min_version("4.7")

# Banner printed by the onstart handler. The closing rule line uses the same
# 42-character string as the opening one.
onstart:
    print("##########################################\n")
    print("# STARTING PIPELINE\n")
    print("##########################################\n")

# Banner printed by the onsuccess handler, one print() call per line.
onsuccess:
    for banner_line in (
        "##########################################\n",
        "# PIPELINE ENDED SUCCESSFULLY \n",
        "##########################################\n",
    ):
        print(banner_line)

# Prefix placed at the start of every rule's `message` string.
ruleDisplayMessage = "\n\n####### START RULE EXECUTION ##########\n"

# Thread ceiling; none of the rules visible in this file reference it.
threadsMax = 16

################################################################################
######################## FUNCTIONS #############################################
################################################################################



################################################################################
######################## READ DATA #############################################
################################################################################

# Parameters identifying the consensus peak set, all read from one config
# section.
_consensus_cfg = config["pipelines"]["3_call_peaks"]["consensus_set"]
idr = _consensus_cfg["idr_thres"]
n_ind = _consensus_cfg["num_ind"]
# NOTE: `input` shadows the Python builtin. The name is kept because rules
# further down pass it to expand() as `input_type = input`.
input = _consensus_cfg["input_type"]
# Directory name encoding the three settings: <input>_idr<idr>_ind<n_ind>
consensus_type_dir = "".join([input, "_idr", str(idr), "_ind", str(n_ind)])


### FOLDERS ON TIER1 ###

# Every folder below is the project root joined with a sub-folder name taken
# from the `project_structure` config section.
PROJECT_DIR = config["global"]["projectdir"]
_project_layout = config["project_structure"]
DATA_DIR = "/".join((PROJECT_DIR, _project_layout["datadir"]))
LOG_DIR = "/".join((PROJECT_DIR, _project_layout["logdir"]))
EXEC_DIR = "/".join((PROJECT_DIR, _project_layout["utils"]))
ANALYSIS_DIR = "/".join((PROJECT_DIR, _project_layout["analysisdir"]))

### FOLDERS ON SCRATCH ###

# Only these two locations are derived from the scratch root; the rules
# visible in this file do not reference MOTIF_ANALYSIS_SCRATCH.
SCRATCH_DIR = config["global"]["scratchdir"]
MOTIF_ANALYSIS_SCRATCH = SCRATCH_DIR + "/MOTIF_ANALYSIS"

# Peak files are read from the data folder and motif results are written to
# the analysis folder, in a sub-directory named after the consensus settings.
PEAKS_DIR = DATA_DIR + "/ChIPseq/Peaks/with_indels"
MOTIF_ANALYSIS_MAIN = (
    ANALYSIS_DIR
    + "/ChIPseq/Motif_analysis/with_indels/"
    + consensus_type_dir
)

### SOFTWARE ###

# bedtools is configured by path and interpolated into shell commands as
# {BEDTOOLS}.
BEDTOOLS = config["tools"]["by_path"]["bedtools"]
# MEME Suite is installed in the conda environment wasp_env, so its tools
# (meme-chip, fimo, fasta-get-markov) are called by name in the shell
# commands and the module lookups below stay disabled.
#MEME = config["tools"]["by_module"]["meme"]
#PERL = config["tools"]["by_module"]["perl"]

### Data and parameters ###

# Individuals: the ID file is read line by line (one sample name per line).
# The first two "_"-separated fields of each name are kept, so several
# samples sharing those fields collapse into one entry.
individuals_path = config['data']['samples']['individual_ids']
# Context manager so the file handle is closed right after reading.
with open(individuals_path, 'r') as individuals_file:
    samples = individuals_file.read().splitlines()
individuals = {"_".join(x.split("_")[0:2]) for x in samples}  # w/o replicates
print(individuals)

# motifs: every collection is looked up in the same config section
_motif_dbs = config["data"]["motif_databases"]
cisbp_motifs = _motif_dbs["CISBP"]
selected_motifs = _motif_dbs["selected_motifs"]
combined_motifs = _motif_dbs["combined_motifs"]
functional_pfm_motifs = _motif_dbs["functional_pfm_motifs"]
#cisbp_info = config["data"]["motif_databases"]["CISBP_motifs_info"]
print(combined_motifs)

# TFs and time-points, one "<ab>/<tp>" entry per combination; rule `all`
# substitutes each entry for {ab_tp} in its target paths.
ab_tp_list = [
    "mef2/68",
    "mef2/1012",
    "bin/68",
    "bin/1012",
    "ctcf/68",
    "twi/24",
]
# ab_tp_list  = ["bin/68"]

# Genome: dm6 FASTA passed to `bedtools getfasta -fi` by the get_fasta* rules
genome_fasta = config["data"]["genome"]["dm6"]["fasta"]


# number of top ChIP-seq summits to take; the value is also embedded in the
# file names of the top-peak BED/FASTA outputs and the MEME-ChIP output folder
n_top = 1000

# unused - currently motifs by peaks
# number of bp to add to each side to consensus peaks (resizing to 2kB in total)
# resize_radius = 750

################################################################################
####################### Pipeline ###############################################
################################################################################

# Rules declared as local rules (Snakemake runs these on the host that
# invokes the workflow instead of submitting them as cluster jobs).
localrules: all, select_top_peaks, get_fasta_allele, meme_chip_alleles

# Target rule. Exactly one expand() is active: the FIMO scan with the
# functional motif set for every entry of ab_tp_list. The commented-out
# expand() calls are alternative targets that can be enabled instead.
rule all:
    input:
        #expand(MOTIF_ANALYSIS_MAIN + "/{ab_tp}/MemeChip/consensus_peaks.top_{n_top}/meme-chip.html", ab_tp = ab_tp_list, n_top = n_top),
        #expand(MOTIF_ANALYSIS_MAIN + "/{ab_tp}/FIMO/selected_motifs/fimo.tsv", ab_tp = ab_tp_list),
        #expand(MOTIF_ANALYSIS_MAIN + "/{ab_tp}/FIMO/cisbp_motifs/fimo.tsv", ab_tp = ab_tp_list)
        # expand(MOTIF_ANALYSIS_MAIN + "/{ab_tp}/FIMO/combined_motifs/fimo.tsv", ab_tp = ab_tp_list)
        expand(MOTIF_ANALYSIS_MAIN + "/{ab_tp}/FIMO/functional_motifs/fimo.tsv", ab_tp = ab_tp_list)
        # expand(MOTIF_ANALYSIS_MAIN + "/all_SNPs_alleles/FIMO/{allele}_{seq_len}bp/combined_motifs/fimo.tsv", allele = ["REF", "ALT"], seq_len = ["31"]),
        # expand(MOTIF_ANALYSIS_MAIN + "/all_SNPs_alleles/FIMO/{allele}_{seq_len}bp/cisbp_motifs/fimo.tsv", allele = ["REF", "ALT"], seq_len = ["31"]),
        # expand(MOTIF_ANALYSIS_MAIN + "/all_SNPs_alleles/FIMO/{allele}_{seq_len}bp/combined_motifs/fimo.tsv", allele = ["REF"], seq_len = ["301"]),
        # expand(MOTIF_ANALYSIS_MAIN + "/all_SNPs_alleles/FIMO/{allele}_{seq_len}bp/cisbp_motifs/fimo.tsv", allele = ["REF"], seq_len = ["301"]),
        # expand(MOTIF_ANALYSIS_MAIN + "/all_variants_alleles/FIMO/{allele}_radius{radius}bp/combined_motifs/fimo.tsv", allele = ["REF", "ALT"], radius = "150"),
        # expand(MOTIF_ANALYSIS_MAIN + "/all_variants_alleles/FIMO/{allele}_radius{radius}bp/combined_motifs/fimo.tsv", allele = ["REF", "ALT"], radius = "15"),
        # expand(MOTIF_ANALYSIS_MAIN + "/all_variants_alleles/FIMO/{allele}_radius{radius}bp/cisbp_motifs/fimo.tsv", allele = ["REF", "ALT"], radius = "15"),
        #expand(MOTIF_ANALYSIS_MAIN + "/{cond}/peaks_with{allele}_motif.fa", cond = ["mef2/68", "mef2/1012"], allele = ["9A", "9G"]),
        #expand(MOTIF_ANALYSIS_MAIN + "/{cond}/MemeChip/peaks_with{allele}_motif/meme-chip.html", cond = ["mef2/68", "mef2/1012"], allele = ["9A", "9G"])
        #expand(MOTIF_ANALYSIS_MAIN + "/{cond}/MemeChip/peaks_with{allele}_motif/meme-chip.html", cond = ["mef2/68"], allele = ["9G"])



########## 1. De novo motifs in consensus sets ##################################################

# Peaks are ranked by total normalized signal: the consensus BED file is
# sorted numerically, in descending order, on the key starting at column 5,
# and only the first n_top lines are kept.
rule select_top_peaks:
    input:
        expand(
            PEAKS_DIR + "/{input_type}/Consensus_peaksets/{{ab}}/{{tp}}/consensus_peaks_idr{idr_thres}_ind{num_ind}.bed",
            input_type = input,
            idr_thres = idr,
            num_ind = n_ind
        )
    output:
        MOTIF_ANALYSIS_MAIN + "/{ab}/{tp}/consensus_peaks.top_" + str(n_top) + ".bed"
    message:
        "{ruleDisplayMessage}Select top summits by score from {input} ..."
    threads: 1
    shell:
        "sort -k5 -r -n {input} | awk 'FNR <= {n_top}' > {output}"


# Extract the genomic sequence of each selected top peak with
# `bedtools getfasta`; -name labels each FASTA record from the BED name column.
rule get_fasta_top:
    input:
        bed = rules.select_top_peaks.output,
        genome = genome_fasta
    output:
        MOTIF_ANALYSIS_MAIN + "/{ab}/{tp}/consensus_peaks.top_" + str(n_top) + ".fa"
    threads: 1
    message:
        "{ruleDisplayMessage}Get fasta for {input}..."
    shell:
        "{BEDTOOLS} getfasta -name -fi {input.genome} -bed {input.bed} -fo {output}"


# De novo motif discovery: run MEME-ChIP on the top-peak sequences and compare
# the discovered motifs against the CIS-BP collection (-db).
# The motif database is passed through `params.motifs` so the shell command
# only uses names that are actually defined (cisbp_motifs is set from the
# config near the top of this file).
rule meme_chip:
    input: rules.get_fasta_top.output
    output: MOTIF_ANALYSIS_MAIN + "/{ab}/{tp}/MemeChip/consensus_peaks.top_" + str(n_top) + "/meme-chip.html"
    message: "{ruleDisplayMessage}MEME-ChIP for {input}..."
    params:
        outdir = MOTIF_ANALYSIS_MAIN + "/{ab}/{tp}/MemeChip/consensus_peaks.top_" + str(n_top),
        motifs = cisbp_motifs
    threads: 1
    shell:
        """
        meme-chip -oc {params.outdir} -db {params.motifs} {input} -order 2
        """


########## 2. Scan known motifs in consensus peaks  ##################################################

# rule resize_peaks:
#     input: PEAKS_DIR + "/{input_type}/Consensus_peaksets/{ab}/{tp}/consensus_peaks_idr{idr_thres}.bed"
#     output: temp(PEAKS_DIR + "/{input_type}/Consensus_peaksets/{ab}/{tp}/consensus_peaks_idr{idr_thres}_resized.bed")
#     message: "{ruleDisplayMessage}Resize peaks..."
#     threads: 1
#     params:
#         radius = resize_radius
#     shell: "awk '{{print $1, $2<{params.radius}?0:$2-{params.radius}, $3+{params.radius}, $4, $5, $6}}' OFS='\t' {input} > {output}"

# Extract the genomic sequence of every consensus peak (no top-N filtering).
rule get_fasta:
    input:
        expand(
            PEAKS_DIR + "/{input_type}/Consensus_peaksets/{{ab}}/{{tp}}/consensus_peaks_idr{idr_thres}_ind{num_ind}.bed",
            input_type = input,
            idr_thres = idr,
            num_ind = n_ind
        )
    output:
        MOTIF_ANALYSIS_MAIN + "/{ab}/{tp}/consensus_peaks.fa"
    threads: 1
    message:
        "{ruleDisplayMessage}Get fasta..."
    shell:
        "{BEDTOOLS} getfasta -name -fi {genome_fasta} -bed {input} -fo {output}"

# Background model for FIMO, estimated from the peak sequences themselves.
# -m 1 requests a first-order Markov model, i.e. dinucleotide frequencies
# (=> corresponds to order 2 in MEME-ChIP); order 0 would only capture
# single-nucleotide frequencies.
rule fasta_get_markov:
    input:
        rules.get_fasta.output
    output:
        MOTIF_ANALYSIS_MAIN + "/{ab}/{tp}/consensus_peaks_background.txt"
    threads: 1
    message:
        "{ruleDisplayMessage}Calculate genomic background..."
    shell: """
           fasta-get-markov {input} {output} -m 1
           """

# Scan consensus peaks with the CIS-BP motif collection using FIMO.
# Runs with p-value threshold 1e-3 (stricter alternative noted: --thresh 1e-5).
# The motif file is passed through `params.motifs`, mirroring the other FIMO
# rules, so the command refers to the defined `cisbp_motifs` variable.
rule fimo_cisbp_motifs:
    input:
        fasta = rules.get_fasta.output,
        bg_file = rules.fasta_get_markov.output
    output: MOTIF_ANALYSIS_MAIN + "/{ab}/{tp}/FIMO/cisbp_motifs/fimo.tsv",
    message: "{ruleDisplayMessage}Run FIMO on peaks for CIS-BP motifs..."
    threads: 1
    params:
        output_dir = MOTIF_ANALYSIS_MAIN + "/{ab}/{tp}/FIMO/cisbp_motifs/",
        motifs = cisbp_motifs,
        pval = 1e-3
    shell:
        """
        fimo --bgfile {input.bg_file} -oc {params.output_dir} --thresh {params.pval} {params.motifs} {input.fasta}
        """

# Scan consensus peaks with the hand-selected motif set.
# The p-value threshold is relaxed to 1e-3 (instead of the default 1e-4).
rule fimo_selected_motifs:
    input:
        fasta = rules.get_fasta.output,
        bg_file = rules.fasta_get_markov.output
    output:
        MOTIF_ANALYSIS_MAIN + "/{ab}/{tp}/FIMO/selected_motifs/fimo.tsv"
    params:
        output_dir = MOTIF_ANALYSIS_MAIN + "/{ab}/{tp}/FIMO/selected_motifs",
        pval = 1e-3
    threads: 1
    message:
        "{ruleDisplayMessage}Run FIMO on peaks for selected motifs..."
    shell:
        """
        fimo --bgfile {input.bg_file} -oc {params.output_dir} --thresh {params.pval} {selected_motifs} {input.fasta}
        """

# Scan consensus peaks with the combined motif set using FIMO.
# The declared output is fimo.tsv, the same file name every other FIMO rule
# in this workflow declares and the name the combined_motifs target in
# rule `all` asks for.
rule fimo_combined_motifs:
    input:
        fasta = rules.get_fasta.output,
        bg_file = rules.fasta_get_markov.output
    output: MOTIF_ANALYSIS_MAIN + "/{ab}/{tp}/FIMO/combined_motifs/fimo.tsv",
    message: "{ruleDisplayMessage}Run FIMO on peaks for alternative motifs..."
    threads: 1
    params:
        output_dir = MOTIF_ANALYSIS_MAIN + "/{ab}/{tp}/FIMO/combined_motifs",
        motifs = combined_motifs,
        pval = 1e-4
    shell:
        """
        fimo --bgfile {input.bg_file} -oc {params.output_dir} --thresh {params.pval} {params.motifs} {input.fasta}
        """


# Scan consensus peaks with the functional PFM motif set using FIMO
# (p-value threshold 1e-4). This is the rule behind the active target in
# rule `all`. Its log message names the functional motif set so it can be
# told apart from the combined-motifs rule.
rule fimo_functional_motifs:
    input:
        fasta = rules.get_fasta.output,
        bg_file = rules.fasta_get_markov.output
    output: MOTIF_ANALYSIS_MAIN + "/{ab}/{tp}/FIMO/functional_motifs/fimo.tsv",
    message: "{ruleDisplayMessage}Run FIMO on peaks for functional motifs..."
    threads: 1
    params:
        output_dir = MOTIF_ANALYSIS_MAIN + "/{ab}/{tp}/FIMO/functional_motifs",
        motifs = functional_pfm_motifs,
        pval = 1e-4
    shell:
        """
        fimo --bgfile {input.bg_file} -oc {params.output_dir} --thresh {params.pval} {params.motifs} {input.fasta}
        """

########## 2. De novo motif discovery for alternative alleles  ##################################################

# done for Mef2 alleles at position 9

# Extract sequences for the peaks listed in a per-allele BED file
# (peaks_with{allele}_motif.bed). No rule visible in this file produces that
# BED file, so it is presumably prepared outside this workflow.
rule get_fasta_allele:
    input:
        MOTIF_ANALYSIS_MAIN + "/{cond}/peaks_with{allele}_motif.bed"
    output:
        MOTIF_ANALYSIS_MAIN + "/{cond}/peaks_with{allele}_motif.fa"
    threads: 1
    message:
        "{ruleDisplayMessage}Get fasta..."
    shell:
        "{BEDTOOLS} getfasta -name -fi {genome_fasta} -bed {input} -fo {output}"


# MEME-ChIP on the per-allele peak sequences, compared against the CIS-BP
# collection (-db). The motif database is passed through `params.motifs` so
# the shell command only uses names that are actually defined (cisbp_motifs
# is set from the config near the top of this file).
rule meme_chip_alleles:
    input: rules.get_fasta_allele.output
    output: MOTIF_ANALYSIS_MAIN + "/{cond}/MemeChip/peaks_with{allele}_motif/meme-chip.html"
    message: "{ruleDisplayMessage}MEME-ChIP for {input}..."
    params:
        outdir = MOTIF_ANALYSIS_MAIN + "/{cond}/MemeChip/peaks_with{allele}_motif",
        motifs = cisbp_motifs
    threads: 1
    shell:
        """
        meme-chip -oc {params.outdir} -db {params.motifs} {input} -order 2
        """


########## 3. Scan motifs for 31bp sequences around REF and ALT alleles for all quantified variants  ################


# First-order Markov background (-m 1) for the fixed-length SNP sequence
# files, one model per {seq_len}/{allele} combination.
rule fasta_get_markov_2alleles_snps:
    input:
        MOTIF_ANALYSIS_MAIN + "/all_SNPs_alleles/SNPsequences_{seq_len}bp_{allele}.fa"
    output:
        MOTIF_ANALYSIS_MAIN + "/all_SNPs_alleles/SNPsequences_{seq_len}bp_{allele}_background.txt"
    threads: 1
    message:
        "{ruleDisplayMessage}Calculate genomic background for different alleles..."
    shell: """
           fasta-get-markov {input} {output} -m 1
           """

# First-order Markov background (-m 1) for the variant-centred sequence
# files, one model per {radius}/{allele} combination.
rule fasta_get_markov_2alleles:
    input:
        MOTIF_ANALYSIS_MAIN + "/all_variants_alleles/All_variants_sequences_radius{radius}bp_{allele}.fa"
    output:
        MOTIF_ANALYSIS_MAIN + "/all_variants_alleles/All_variants_sequences_radius{radius}bp_{allele}_background.txt"
    threads: 1
    message:
        "{ruleDisplayMessage}Calculate genomic background for different alleles..."
    shell: """
           fasta-get-markov {input} {output} -m 1
           """


# rule fimo_combined_motifs_2alleles_snps:
#     input:
#         fasta = MOTIF_ANALYSIS_MAIN + "/all_SNPs_alleles/SNPsequences_{seq_len}bp_{allele}.fa",
#         bg_file = rules.fasta_get_markov_2alleles.output
#     output: MOTIF_ANALYSIS_MAIN + "/all_SNPs_alleles/FIMO/{allele}_{seq_len}bp/combined_motifs/fimo.tsv",
#     message: "{ruleDisplayMessage}Run FIMO around SNPs with alternative alleles..."
#     threads: 1
#     params:
#         output_dir = MOTIF_ANALYSIS_MAIN + "/all_SNPs_alleles/FIMO/{allele}_{seq_len}bp/combined_motifs/",
#         motifs = combined_motifs,
#         pval = 1e-4
#     shell:
#         """
#         fimo --bgfile {input.bg_file} -oc {params.output_dir} --thresh {params.pval} {params.motifs} {input.fasta}
#         """

#
# rule fimo_cisbp_motifs_2alleles_snps:
#     input:
#         fasta = MOTIF_ANALYSIS_MAIN + "/all_SNPs_alleles/SNPsequences_{seq_len}bp_{allele}.fa",
#         bg_file = rules.fasta_get_markov_2alleles.output
#     output: MOTIF_ANALYSIS_MAIN + "/all_SNPs_alleles/FIMO/{allele}_{seq_len}bp/cisbp_motifs/fimo.tsv",
#     message: "{ruleDisplayMessage}Run FIMO around SNPs with alternative alleles..."
#     threads: 1
#     params:
#         output_dir = MOTIF_ANALYSIS_MAIN + "/all_SNPs_alleles/FIMO/{allele}_{seq_len}bp/cisbp_motifs/",
#         motifs = cisbp_motifs,
#         pval = 1e-4
#     shell:
#         """
#         fimo --bgfile {input.bg_file} -oc {params.output_dir} --thresh {params.pval} {params.motifs} {input.fasta}
#         """

# FIMO scan of the variant-centred sequences with the combined motif set,
# using the background model computed from the same FASTA file.
rule fimo_combined_motifs_2alleles:
    input:
        fasta = MOTIF_ANALYSIS_MAIN + "/all_variants_alleles/All_variants_sequences_radius{radius}bp_{allele}.fa",
        bg_file = rules.fasta_get_markov_2alleles.output
    output:
        MOTIF_ANALYSIS_MAIN + "/all_variants_alleles/FIMO/{allele}_radius{radius}bp/combined_motifs/fimo.tsv"
    params:
        output_dir = MOTIF_ANALYSIS_MAIN + "/all_variants_alleles/FIMO/{allele}_radius{radius}bp/combined_motifs/",
        motifs = combined_motifs,
        pval = 1e-4
    threads: 1
    message:
        "{ruleDisplayMessage}Run FIMO around SNPs with alternative alleles..."
    shell:
        """
        fimo --bgfile {input.bg_file} -oc {params.output_dir} --thresh {params.pval} {params.motifs} {input.fasta}
        """

# FIMO scan of the variant-centred sequences with the CIS-BP motif
# collection, using the background model computed from the same FASTA file.
rule fimo_cisbp_motifs_2alleles:
    input:
        fasta = MOTIF_ANALYSIS_MAIN + "/all_variants_alleles/All_variants_sequences_radius{radius}bp_{allele}.fa",
        bg_file = rules.fasta_get_markov_2alleles.output
    output:
        MOTIF_ANALYSIS_MAIN + "/all_variants_alleles/FIMO/{allele}_radius{radius}bp/cisbp_motifs/fimo.tsv"
    params:
        output_dir = MOTIF_ANALYSIS_MAIN + "/all_variants_alleles/FIMO/{allele}_radius{radius}bp/cisbp_motifs/",
        motifs = cisbp_motifs,
        pval = 1e-4
    threads: 1
    message:
        "{ruleDisplayMessage}Run FIMO around SNPs with alternative alleles..."
    shell:
        """
        fimo --bgfile {input.bg_file} -oc {params.output_dir} --thresh {params.pval} {params.motifs} {input.fasta}
        """



#
# rule parse_fimo_output:
#     input:
#         cisbp_info = {cisbp_info},
#         motifs_dir = lambda wildcards: fimoOutfile(wildcards.bgr_type),
#         utils_r = utils_r
#     output: OUTDIR + "/cisbp_motifs.{bgr_type}.dhs_resized." + file_prefix + ".motif2gene_parsed.csv"
#     message: "{ruleDisplayMessage}Parse fimo output for {wildcards.bgr_type}.."
#     threads: 1
#     script: EXECUTABLES + "/parse_scanned_cisbp_motifs_in_dhs.R"
