import csv
import pandas as pd

# Load the project-wide configuration (storage paths, tool binaries,
# MACS2/FRiP parameters) used throughout this workflow.
configfile: "/g/furlong/project/100_Enformer/data/ReMap_2022_Drosophila/config/config.yml"

# Paths
# Layout: source BAMs live on tier2 storage and are staged to scratch;
# all derived files (bigwigs, peaks, count tables) are written to scratch.
PROJECT_DIR = config["paths"]["project_dir"]
BAM_TIER2_DIR = config["paths"]["bam_dir_tier2"]
BAM_SCRATCH_DIR = config["paths"]["bam_dir_scratch"]
BIGWIG_FORCE_SCRATCH_DIR = config["paths"]["bigwig_force_input_subtraction_dir_scratch"]
BIGWIG_SCRATCH_DIR = config["paths"]["bigwig_dir_scratch"]
MACS2_SCRATCH_DIR = config["paths"]["macs2_dir_scratch"]
HTSEQ_SCRATCH_DIR = config["paths"]["htseq_dir_scratch"]
# Per-rule conda environment YAMLs live under the project config dir.
ENVS_DIR = PROJECT_DIR + "/config/envs"

# Variables
# Sample sheets: each annotation table contributes its first column.

def _first_column(table_path, delimiter, skip_header=False):
    # Return the first field of every row of a delimited text file.
    with open(table_path) as handle:
        if skip_header:
            next(handle)
        return [row[0] for row in csv.reader(handle, delimiter=delimiter)]

# Experiment IDs (metatable.csv is ';'-separated and has a header line).
EXPERIMENTS = _first_column(f"{PROJECT_DIR}/data/annotation/metatable.csv", ';', skip_header=True)
# ChIP sample names (tab-separated, no header skipped).
SAMPLE_NAMES = _first_column(f"{PROJECT_DIR}/data/annotation/metatable_samples.txt", '\t')
# Input/control sample names (tab-separated, no header skipped).
INPUT_NAMES = _first_column(f"{PROJECT_DIR}/data/annotation/metatable_inputs.txt", '\t')

# Dataset name -> list of member sample names (column 2 is comma-separated).
DATASET_DICT = {}
with open(f"{PROJECT_DIR}/data/annotation/ReMap2022_datasets_metatable.txt") as handle:
    for row in csv.reader(handle, delimiter='\t'):
        DATASET_DICT[row[0]] = row[1].split(",")
DATASET = list(DATASET_DICT.keys())


# Every BAM that has to be staged to scratch (samples + inputs).
ALL_BAM_NAMES = SAMPLE_NAMES + INPUT_NAMES

# Debug
#SAMPLE_NAMES=["remap2022_ENCSR033IIP.pan.adult_whole-fly:FBgn0085432_ENCFF882HVE"]

# Tools
# Tool binaries resolved from the config file.
BAMCOVERAGE = config["tools"]["bamCoverage"]
BAMCOMPARE = config["tools"]["bamCompare"]
SAMTOOLS = config["tools"]["samtools"]
BIGWIGCOMPARE = config["tools"]["bigwigCompare"]

# Targets
# Output file lists that drive rule `all`.
COPIED_BAM_OUT = [f"{BAM_SCRATCH_DIR}/{sample}.bam" for sample in ALL_BAM_NAMES]
BIGWIG_OUT = [f"{BIGWIG_SCRATCH_DIR}/{sample}.bw" for sample in SAMPLE_NAMES]
BIGWIG_MERGE_REP_OUT = [f"{BIGWIG_SCRATCH_DIR}/merge_rep/{experiment}.bw" for experiment in EXPERIMENTS]
INFO_OUT = [f"{BIGWIG_SCRATCH_DIR}/{sample}.info" for sample in SAMPLE_NAMES]
MACS2_OUT = [f"{MACS2_SCRATCH_DIR}/{sample}_macs2_peaks.narrowPeak" for sample in SAMPLE_NAMES]
HTSEQ_OUT = [f"{HTSEQ_SCRATCH_DIR}/{sample}_htseqcounts.txt" for sample in SAMPLE_NAMES]



# Top-level target: request every staged BAM, peak set, count table and
# coverage track (BIGWIG_FORCE_OUT is currently disabled, see trailing comment).
rule all:
    input:
        COPIED_BAM_OUT, MACS2_OUT, HTSEQ_OUT, BIGWIG_OUT, BIGWIG_MERGE_REP_OUT, #BIGWIG_FORCE_OUT
        


# Stage one BAM file from tier2 storage to the scratch directory.
# NOTE(review): despite the rule name, this *copies* (via cat) rather than
# moves — the tier2 original is left in place.
rule move_bam_files:
    input:
        expand("{path}/{{sample}}.bam", path=BAM_TIER2_DIR)
    output:
        expand("{path}/{{sample}}.bam", path=BAM_SCRATCH_DIR)
    shell:
        """
        cat {input} > {output}
        """


# Build an RPGC-normalized coverage bigwig for one ChIP sample, subtracting
# the matched input/control coverage when the metatable assigns input
# sample(s). Also writes a <sample>.info sidecar with the metadata used.
#
# Shell logic, in order:
#   1. Pull biotype/tissue/time-point/condition and the input-sample list
#      for this sample from the ';'-separated metatable (fields 6-9 and 12).
#   2. Detect paired-end data via `samtools view -c -f 1` (count of reads
#      carrying the PAIRED flag); single-end reads get --extendReads 0,
#      paired-end reads use deepTools' fragment-based extension.
#   3. If input samples are assigned (names containing "remap"), merge their
#      BAMs, build bigwigs for input and sample, and subtract input from
#      sample with bigwigCompare; otherwise run bamCoverage alone.
#   4. Write the .info file and remove temporaries.
#
# NOTE(review): `grep {wildcards.sample}` assumes the sample name matches
# exactly one metatable row — confirm names are unique substrings.
# NOTE(review): the conda file is spelled "conda_congif.yml" (presumably a
# typo of "config"); it must match the actual filename on disk, so renaming
# requires renaming the file too.
rule create_bigwigs:
    input:
        bw = expand("{path}/{{sample}}.bam", path=BAM_SCRATCH_DIR),
    output:
        bw = expand("{path}/{{sample}}.bw", path=BIGWIG_SCRATCH_DIR),
        info = expand("{path}/{{sample}}.info", path=BIGWIG_SCRATCH_DIR)
    params:
        metatable = expand("{path}/data/annotation/metatable.csv", path=PROJECT_DIR),
        bin_size = 10,  # bigwig resolution in bp
        normalizeUsing = "RPGC",  # 1x genome coverage normalization
        effective_genome_size = 125464728,  # deepTools' published dm6 value
        exclude_chromosome = "chrX chrM chrY"  # excluded from normalization
    conda: "conda_congif.yml"
    shell:
        """
        # Define sample specific variants
        BIOTYPE=`cat {params.metatable} | grep {wildcards.sample} | sed 's/;/\\t/g' | cut -f6`
        TISSUE=`cat {params.metatable} | grep {wildcards.sample} | sed 's/;/\\t/g' | cut -f7` 
        TIME_POINT=`cat {params.metatable} | grep {wildcards.sample} | sed 's/;/\\t/g' | cut -f8` 
        CONDITION=`cat {params.metatable} | grep {wildcards.sample} | sed 's/;/\\t/g' | cut -f9` 
        PAIRED_READS=`samtools view -c -f 1 {input.bw}`
        INPUT_SAMPLES=`cat {params.metatable} | grep {wildcards.sample} | sed 's/;/\\t/g' \
            | cut -f12 | sed 's/\\ *//g'`
        samtools index {input.bw}


        # Define input bam file. 
        if [[ $INPUT_SAMPLES =~ "remap" ]];
            then
                INPUT_SAMPLES_BAM=`echo $INPUT_SAMPLES | sed 's/,/\\n/g' | sed 's/\\r//g' \
                | awk -v dir={BAM_SCRATCH_DIR} '{{print dir "/" $1 ".bam"}}' | tr "\\n" "\\ "`
            else
                INPUT_SAMPLES_BAM=""
        fi


        # Define fragment length expansion (different for pared- and single- end)
        if [[ $PAIRED_READS == 0 ]];
            then
                EXTEND_FRAGMENT_LENGTH=0
                PAIRED_READS="FALSE"
            else
                EXTEND_FRAGMENT_LENGTH=""
                PAIRED_READS="TRUE"
        fi


        # Check if file has assigned input bam file(s)
        if [[ $INPUT_SAMPLES_BAM =~ "remap" ]];
            then
            
            # If multiple files are found, merge them in one input bam file.
            samtools merge - $INPUT_SAMPLES_BAM \
            | samtools sort -T {output.bw}.tmp.samtools - > {output.bw}.tmp.input_files.bam
            samtools index {output.bw}.tmp.input_files.bam

            # Create bigwig for input files. Do not extend fragments by default.
            bamCoverage -b {output.bw}.tmp.input_files.bam \
                -o {output.bw}.tmp.input_files.bw \
                --binSize {params.bin_size} \
                --normalizeUsing {params.normalizeUsing} \
                --effectiveGenomeSize {params.effective_genome_size} \
                --ignoreForNormalization {params.exclude_chromosome} \
                --extendReads 0

            # Create bigwig for sample file
            bamCoverage -b {input.bw} \
                -o {output.bw}.tmp.sample.bw \
                --binSize {params.bin_size} \
                --normalizeUsing {params.normalizeUsing} \
                --effectiveGenomeSize {params.effective_genome_size} \
                --ignoreForNormalization {params.exclude_chromosome} \
                --extendReads $EXTEND_FRAGMENT_LENGTH

            # Subtract input from sample coverage
            bigwigCompare -b1 {output.bw}.tmp.sample.bw \
                -b2 {output.bw}.tmp.input_files.bw -o {output.bw} \
                --binSize {params.bin_size} \
                --operation subtract

        else

            # Create bigwig for sample file (no input subtraction)
            bamCoverage -b {input.bw} \
                -o {output.bw} \
                --binSize {params.bin_size} \
                --normalizeUsing {params.normalizeUsing} \
                --effectiveGenomeSize {params.effective_genome_size} \
                --ignoreForNormalization {params.exclude_chromosome} \
                --extendReads $EXTEND_FRAGMENT_LENGTH

        fi


        # Create info file
            echo -e "Filename:\\t"{wildcards.sample} > {output.info}
            echo -e "Biotype:\\t"$BIOTYPE >> {output.info}
            echo -e "Tissue:\\t"$TISSUE >> {output.info}
            echo -e "Time_point:\\t"$TIME_POINT >> {output.info}
            echo -e "Paired_end:\\t"$PAIRED_READS >> {output.info}
            echo -e "Input_samples:\\t"$INPUT_SAMPLES >> {output.info}


        # Clean temporary files
        rm -f {output.bw}.tmp*
        """



# Merge the replicate bigwigs of one experiment into a single track under
# merge_rep/. Replicate file stems come from metatable column 11
# (comma-separated); with only one replicate the single bigwig is symlinked
# instead of averaged.
#
# NOTE(review): the rule depends on all of BIGWIG_OUT, not just this
# experiment's replicates, so one missing sample blocks every merge.
# NOTE(review): the symlink branch assumes {params.out_dir} already exists,
# and both branches assume {output} is usable after `cd` (i.e. an absolute
# path) — confirm the configured scratch paths are absolute.
# NOTE(review): conda file is spelled "conda_congif.yml" (likely typo of
# "config"); must match the actual filename on disk.
rule merge_replicates_bigwig:
    input:
        BIGWIG_OUT
    output:
        expand("{path}/merge_rep/{{experiment}}.bw", path=BIGWIG_SCRATCH_DIR)
    params:
        metatable = expand("{path}/data/annotation/metatable.csv", path=PROJECT_DIR),
        bin_size = 10,  # bigwigAverage bin size in bp
        input_dir = BIGWIG_SCRATCH_DIR,
        out_dir = expand("{path}/merge_rep", path=BIGWIG_SCRATCH_DIR)
    conda: "conda_congif.yml"
    shell:
        """
        INPUTS_BIGWIGS=`cat {params.metatable} | sed 's/;/\\t/g; s/\\t\\t/\\t.\\t/g' \
            | awk '$1 == "{wildcards.experiment}"' | cut -f11 | sed 's/,/.bw\\ /g' \
            | awk '{{print $0 ".bw"}}'`

        if [[ `echo $INPUTS_BIGWIGS | wc -w` -gt 1 ]]; then
            cd {params.input_dir}
            bigwigAverage --bigwigs $INPUTS_BIGWIGS --binSize {params.bin_size} --outFileName {output};
        else
            cd {params.out_dir}
            ln -s {params.input_dir}/$INPUTS_BIGWIGS {wildcards.experiment}.bw
        fi
        """



rule make_complete_metatable:
    input:
        metatable = expand("{path}/data/annotation/metatable.csv", path=PROJECT_DIR),
        info = INFO_OUT
    output:
        expand("{path}/data/annotation/ReMap2022_metatable.txt", path=PROJECT_DIR),
    shell:
        """
        echo -e "Dataset\\tExperiment_ID\\tTF\\tFBgn\\tBiotyope\\tTissue\\tTime_point\\tTreatment\\tPaired_end\\tInput_subtracted\\tSRR\\tInput" > {output}

        for file in {input.info}; 
            do 
            SAMPLE=`echo $file | sed 's/.*\///; s/\.info//g'`; 
            METATABLE=`cat {input.metatable} | grep $SAMPLE`;

            DATASET=`echo $METATABLE | sed 's/;/\\t/g' | cut -f1`
            EXPERIMENT_ID=`echo $METATABLE | sed 's/;/\\t/g' | cut -f2`
            TF=`echo $METATABLE | sed 's/;/\\t/g' | cut -f3`
            FBGN=`echo $METATABLE | sed 's/;/\\t/g' | cut -f4`
            BIOTYPE=`echo $METATABLE | sed 's/;/\\t/g' | cut -f6`
            TISSUE=`echo $METATABLE | sed 's/;/\\t/g' | cut -f7`
            TIME_POINT=`echo $METATABLE | sed 's/;/\\t/g' | cut -f8`
            TREATMENT=`echo $METATABLE | sed 's/;/\\t/g' | cut -f10`
            PAIRED_END=`cat {BIGWIG_SCRATCH_DIR}/$SAMPLE.info | awk 'NR==5' | cut -f2`
            INPUT_SUBTRACTED=`if cat /scratch/forneris/ReMap_2022/bigwig/$SAMPLE.info | awk 'NR==6' | cut -f2 \
                    | sed 's/\\ /,/g' | grep -c remap > 0 ; then echo TRUE ; else echo FALSE; fi`
            SRR=`echo $METATABLE | sed 's/;/\\t/g' | cut -f11`
            INPUT=`cat {BIGWIG_SCRATCH_DIR}/$SAMPLE.info | awk 'NR==6' | cut -f2 | sed 's/\\ /,/g'`

            echo -e "$DATASET\\t$EXPERIMENT_ID\\t$TF\\t$FBGN\\t$BIOTYPE\\t$TISSUE\\t$TIME_POINT\\t$TREATMENT\\t$PAIRED_END\\t$INPUT_SUBTRACTED\\t$SRR\\t$INPUT" >> {output}
        done
        """




##########################
# FRiP scores per sample #
##########################

# Call peaks with MACS2 for one sample. If the metatable (field 12) assigns
# input/control sample(s) — names containing "remap" — their staged BAMs are
# passed as the MACS2 control (-c); otherwise peaks are called without one.
# Model building is disabled (--nomodel) with a fixed extension size.
#
# NOTE(review): `grep {wildcards.sample}` assumes the sample name matches
# exactly one metatable row — confirm names are unique substrings.
rule find_peaks_MACS2:
    input:
        test = expand("{path}/{{sample}}.bam", path=BAM_SCRATCH_DIR)
    output:
        expand("{path}/{{sample}}_macs2_peaks.narrowPeak", path=MACS2_SCRATCH_DIR)
    params:
        metatable = expand("{path}/data/annotation/metatable.csv", path=PROJECT_DIR),
        genome_size = config["parameters_FRiP"]["genome_size"],
        p_value_cutoff = config["parameters_FRiP"]["p_value_cutoff"],
        input_format = config["parameters_FRiP"]["input_format"],
        extsize = config["parameters_FRiP"]["extsize"],
        out_dir = MACS2_SCRATCH_DIR
    conda: ENVS_DIR + "/macs2.yml"
    shell:
        """
        mkdir -p {params.out_dir}

        # Define input bam file. 
        INPUT_SAMPLES=`cat {params.metatable} | grep {wildcards.sample} | sed 's/;/\\t/g' \
            | cut -f12 | sed 's/\\ *//g'`


        if [[ $INPUT_SAMPLES =~ "remap" ]];
            then

                INPUT_SAMPLES_BAM=`echo $INPUT_SAMPLES | sed 's/,/\\n/g' | sed 's/\\r//g' \
                | awk -v dir={BAM_SCRATCH_DIR} '{{print dir "/" $1 ".bam"}}' | tr "\\n" "\\ "`

                macs2 callpeak -t {input.test} -c $INPUT_SAMPLES_BAM \
                -g {params.genome_size} -p {params.p_value_cutoff} \
                -f {params.input_format} --call-summits \
                --nomodel --extsize {params.extsize} \
                --outdir {params.out_dir} -n {wildcards.sample}_macs2 

            else
            
                macs2 callpeak -t {input.test} -g {params.genome_size} \
                -p {params.p_value_cutoff} -f {params.input_format} \
                --call-summits --outdir {params.out_dir} \
                --nomodel --extsize {params.extsize} \
                -n {wildcards.sample}_macs2

        fi        
        """

# Count reads overlapping the sample's MACS2 peaks with htseq-count.
# The narrowPeak file is converted on the fly (process substitution) into a
# minimal GFF: peak intervals are merged with bedtools, starts shifted to
# 1-based, negative coordinates clamped to 0, and each feature gets a
# gene_id of "<chrom>_<start>_<end>".
rule quantify_peaks_htseq_count:
    input:
        peaks = expand("{path}/{{sample}}_macs2_peaks.narrowPeak", path=MACS2_SCRATCH_DIR),
        bam = expand("{path}/{{sample}}.bam", path=BAM_SCRATCH_DIR)
    output:
        expand("{path}/{{sample}}_htseqcounts.txt", path=HTSEQ_SCRATCH_DIR)
    conda: ENVS_DIR + "/htseq_count.yml"
    shell:
        """
        htseq-count -f bam -t "ReMap_peaks" -s no \
        {input.bam} <(cat {input.peaks} | cut -f1-3 \
        | sort -k1,1 -k2,2g | bedtools merge -i - | awk '{{OFS="\\t"}} \
        {{print $1, "peaks", "ReMap_peaks", $2+1, $3, \
        ".", ".", ".", "gene_id=" $1 "_" $2 "_" $3}}' \
        | sed 's/\\t-[0-9]*/\\t0/g' | sort -k1,1 -k4,4g) > {output}
        """

rule compute_FRiP:
    input:
        HTSEQ_OUT
    output:
        expand("{path}/remap2022_FRiP_scores.tsv", path=PROJECT_DIR)
    params:
        input_dir = HTSEQ_SCRATCH_DIR
    shell:
        """
        echo -e "sample\\treads_on_peaks\\ttot_reads\\tFRiP" > {output}

        for f in `ls /{params.input_dir}/remap2022_*txt`; 
            do 
            TOT=`cat $f | grep -v "ambiguous" | grep -v "too_low_aQual" \
                | grep -v "not_aligned" | awk '{{sum += $2}} END {{print sum}}'`; 
            PEAK=`cat $f | grep -v "__" | awk '{{sum += $2}} END {{print sum}}'`; 
            awk -v f=$f -v t=$TOT -v p=$PEAK 'BEGIN {{print f "\\t" p "\\t" t "\\t" p/t}}'; 
        done \
        | sed 's/.*remap2022_//g; s/_htseqcounts.txt//' >> {output}
        """


