import os
import sys
import yaml
import pandas as pd
from snakemake.utils import min_version
# Enforce a minimum Snakemake version
min_version("4.7")

onstart:
    # Banner printed when the workflow starts, followed by one sample name per
    # line. `sample_names` is filled further down in this file, before this
    # handler is executed.
    print("##########################################\n")
    print("# STARTING PIPELINE\n")
    print("##########################################\n")
    print("Running ChIP-seq pre-processing workflow for the following samples:\n " + ' \n '.join(map(str, sample_names)))

onsuccess:
    # Closing banner, printed once the whole workflow has finished without error.
    banner_rule = "##########################################\n"
    for banner_line in (banner_rule, "# PIPELINE ENDED SUCCESSFULLY \n", banner_rule):
        print(banner_line)

# Header text intended for rule start messages; no rule in this part of the
# file references it.
ruleDisplayMessage = "\n\n" + "####### START RULE EXECUTION ##########" + "\n"

################################################################################
######################## FUNCTIONS #############################################
################################################################################

def get_fastq_read_path(samples2paths, sample_name, read):
    """Get path to fastq by sample name and read label.
       Input:
        samples2paths (dict): {sample_name: {"read1": path_to_read1, "read2": path_to_read2}}
        sample_name (str): must correspond to keys from the dictionary
        read (str): either "read1" or "read2"
       Output:
        the path stored for the requested read of the requested sample
       Raises:
        KeyError: if the sample name or the read label is not present; the
            message names the offending sample / read.
    """
    try:
        sample_reads = samples2paths[sample_name]
    except KeyError:
        raise KeyError("Unknown sample name '{}'".format(sample_name)) from None

    try:
        return sample_reads[read]
    except KeyError:
        # Say which sample was being looked up, not just the missing label.
        raise KeyError(
            "Sample '{}' has no entry for read '{}'".format(sample_name, read)
        ) from None


################################################################################
######################## READ DATA #############################################
################################################################################

### FOLDERS ON TIER1 ###
# Project root from the config, with its data and log sub-directories below it.
PROJECT_DIR = config["global"]["projectdir"]
DATA_DIR = "/".join([PROJECT_DIR, config["project_structure"]["datadir"]])
LOG_DIR = "/".join([PROJECT_DIR, config["project_structure"]["logdir"], "preprocess_fastq"])


### FOLDERS ON SCRATCH ###
# One numbered sub-directory per processing step under PREPROCESS_FASTQ.
SCRATCH_DIR = config["global"]["scratchdir"]
ADAPTER_CLIP_DIR = "/".join([SCRATCH_DIR, "PREPROCESS_FASTQ", "1.JE_CLIP"])
SKEWER_DIR = "/".join([SCRATCH_DIR, "PREPROCESS_FASTQ", "2.SKEWER"])
SEQTK_DIR = "/".join([SCRATCH_DIR, "PREPROCESS_FASTQ", "3.SEQTK_TRIMFQ"])


### SOFTWARE ###
# Executables used in the shell commands of the rules below, all read from the
# "by_path" section of the config.
JE = config["tools"]["by_path"]["je"]["v1"]
SKEWER = config["tools"]["by_path"]["skewer"]
# Only the by_path entry is used for seqtk: an earlier assignment from
# config["tools"]["by_module"] was overwritten right away and never took effect.
SEQTK = config["tools"]["by_path"]["seqtk"]


### DATA ###

# samples info: comma-separated sample sheet whose location comes from the config.
# Columns read below: Antibody, timePoint, mother, father, replicate,
# read1_path, read2_path.
samplesSummaryFile = config["data"]["samples"]["sample_table"]
samples_info = pd.read_csv(samplesSummaryFile, sep=",")

# generate names of samples in format: Antibody.timePoint.mother_father_replicate
# (rows are addressed by the labels 0..nrows-1, i.e. the default index)
nrows = samples_info.shape[0]
sample_names = []
for i in range(nrows):
    ab_tp = ".".join(map(str, samples_info.loc[i, ["Antibody", "timePoint"]].tolist()))
    genotype_rep = "_".join(map(str, samples_info.loc[i, ["mother", "father", "replicate"]].tolist()))
    sample_name = ".".join([ab_tp, genotype_rep])
    sample_names.append(sample_name)

# paths to read1 and read2
read1_paths = samples_info["read1_path"].tolist()
read2_paths = samples_info["read2_path"].tolist()

# dictionary with sample names as keys and {'read1': path, 'read2': path} as values
reads2paths = [{'read1': read1, 'read2': read2} for read1, read2 in zip(read1_paths, read2_paths)]
samples2paths = {}
for name, read_paths in zip(sample_names, reads2paths):
    # Two rows that produce the same name but point to different files would
    # otherwise shadow each other and one sample would be dropped silently.
    if samples2paths.setdefault(name, read_paths) != read_paths:
        raise ValueError(
            "Sample name '{}' occurs more than once in {} with different read paths".format(
                name, samplesSummaryFile
            )
        )

# Diagnostic output goes to stderr so that it does not end up in output that is
# read from stdout (e.g. `snakemake --dag | dot`).
print(reads2paths, file=sys.stderr)

################################################################################
######################## RULES #################################################
################################################################################

# Mark the aggregating target rule as a local rule (not submitted as a cluster job).
localrules: all

# Default target: the seqtk-trimmed FASTQ of both mates (pair1 and pair2) for
# every entry of sample_names. These paths match the output pattern of
# do_seqtk_trim.
rule all:
    input:
        expand(SEQTK_DIR + "/{sample}-trimmed-{pair}_seqtk.fastq", sample = sample_names, pair = ["pair1", "pair2"])


# Step 1: run `je clip` on the raw read pair of one sample and write the two
# clipped FASTQ files into ADAPTER_CLIP_DIR.
rule do_je_clip:
    '''
    Clip UMIs in reads with GBCS's JE tool
    '''
    input:
        # Raw FASTQ paths are looked up per sample in samples2paths (built from
        # the sample sheet), keyed by the {sample} wildcard.
        read1 = lambda wildcards: get_fastq_read_path(samples2paths, wildcards.sample, "read1"),
        read2 = lambda wildcards: get_fastq_read_path(samples2paths, wildcards.sample, "read2"),
    output:
        clip1 = ADAPTER_CLIP_DIR + "/{sample}.clip_1.fastq.gz",
        clip2 = ADAPTER_CLIP_DIR + "/{sample}.clip_2.fastq.gz"
    message: "Clipping {wildcards.sample}'s UMIs"
    # NOTE(review): {threads} is not used in the shell command below, so this
    # presumably only reserves cores for the job - confirm.
    threads: 2
    params:
        # Extra command-line options for `je clip`, taken verbatim from the config.
        je_options = config["pipelines"]["1_preprocess_fastq"]["software_params"]["je_clip"]
    log: LOG_DIR + "/1_je_clip/{sample}_je_clip.log"
    # The command passes the output directory (O=) together with the explicit
    # output file names (OF1=/OF2=); stdout and stderr both go to the log file.
    shell: """ {JE} clip \
                {params.je_options} \
                O={ADAPTER_CLIP_DIR} \
                F1={input.read1} \
                F2={input.read2} \
                OF1={output.clip1} \
                OF2={output.clip2} \
                &> {log}
           """


# Step 2: run skewer on the clipped read pair produced by do_je_clip.
rule do_skewer:
    '''
    Remove adaptor readthrough with skewer
    '''
    input:
        # Both mates come straight from the outputs of do_je_clip.
        read1 = rules.do_je_clip.output.clip1,
        read2 = rules.do_je_clip.output.clip2
    output:
        # NOTE(review): these names are not passed to the command; they rely on
        # skewer deriving "<basename>-trimmed-pair{1,2}.fastq" from -o, and
        # presumably on a compression option in skewer_options for the .gz
        # suffix - confirm against the config.
        read1 = SKEWER_DIR + "/{sample}-trimmed-pair1.fastq.gz",
        read2 = SKEWER_DIR + "/{sample}-trimmed-pair2.fastq.gz"
    message: "Removing {wildcards.sample}'s adaptor readthrough"
    params:
        # Extra skewer options taken verbatim from the config.
        skewer_options = config["pipelines"]["1_preprocess_fastq"]["software_params"]["skewer"],
        # Output prefix handed to skewer via -o.
        outfile_basename = SKEWER_DIR + "/{sample}"
    # NOTE(review): {threads} is not passed to skewer in the shell command;
    # unless skewer_options sets the thread count this only reserves cores -
    # confirm.
    threads: 4
    log: LOG_DIR + "/2_skewer_trim/{sample}_skewer.log"
    # stdout and stderr of skewer both go to the log file.
    shell: """ {SKEWER} {params.skewer_options} \
                -o {params.outfile_basename} \
                {input.read1} {input.read2} \
                &> {log}
           """

# Step 3: run `seqtk trimfq` on a single skewer output file; one job per
# {sample} and {pair} combination.
rule do_seqtk_trim:
    '''
    Clean reads with SEQTK by removing some low quality bases that might cause alignment issues.
    '''
    # Input is one of the two files written by do_skewer; {pair} is requested as
    # "pair1" or "pair2" by rule all.
    input: SKEWER_DIR + "/{sample}-trimmed-{pair}.fastq.gz"
    # The result is written as plain (uncompressed) FASTQ.
    output: SEQTK_DIR + "/{sample}-trimmed-{pair}_seqtk.fastq"
    message: "Trim low quality reads in {input} with seqtk trimfq"
    threads: 1
    log: LOG_DIR + "/3_seqtk/{sample}_seqtk_trimfq_{pair}.log"
    # The first line of the command starts with '#', so the shell treats it as a
    # comment and only the trimfq call runs. seqtk's stdout is redirected into
    # the output file and its stderr into the log.
    shell: """ #module load {SEQTK};
               {SEQTK} trimfq {input} > {output} 2> {log}
           """
