
# Variables
# Every condition is written "<TF>.<TP>", matching the {TF}.{TP} wildcards used
# by the rules of this workflow.
CONDITIONS = [
    "twi.24",
    "bin.68",
    "ctcf.68",
    "mef2.68",
    "bin.1012",
    "mef2.1012",
]
# Same conditions as [TF, TP] string pairs, derived so both lists stay in sync.
CONDITIONS_LIST = [condition.split(".") for condition in CONDITIONS]

# Paths
# Absolute locations of the input data sets and of the analysis output folders.
BASENJI_PREDICTION_DIR = "/g/furlong/project/103_Basenji/analysis/variants_predictions"
ALLELIC_IMBALANCE_DIR = "/g/furlong/project/68_F1_cisreg_ichip/data/for_Frosina"
CORRELATIONS_DIR = "/g/furlong/project/103_Basenji/Mattia/analysis/correlations"
TFBS_DIR = "/g/furlong/project/103_Basenji/Mattia/analysis/TFBS"
FIMO_DIR = "/g/furlong/project/68_F1_cisreg_ichip/analysis/ChIPseq/Motif_analysis/with_indels/merged_input_idr0.01_ind3"

# Targets
# One merged table per condition (output of rule merge_basenji_and_AI).
MERGE_OUT = [
    CORRELATIONS_DIR + "/Basenji_scores_and_allelic_imbalance_" + condition + ".txt"
    for condition in CONDITIONS
]
# One annotated table per condition (output of rule merge_variants_and_motifs).
TFBS_OUT = [
    CORRELATIONS_DIR + "/Basenji_scores_and_allelic_imbalance_" + condition + "_annot.txt"
    for condition in CONDITIONS
]
# Disabled alternative target list: the per-condition motif BED files, which
# rule merge_variants_and_motifs already writes alongside the annotated table.
#[TFBS_DIR + "/fimo_TFBS_" + TF + "." + TP + ".bed" for TF, TP in CONDITIONS_LIST]


# Default target rule: requests every merged table and every annotated table,
# which in turn triggers both processing rules for all conditions.
rule all:
    input:
        MERGE_OUT,
        TFBS_OUT



# Join the Basenji variant predictions of one condition with the
# allelic-imbalance (AI) table and write a single tab-separated table with a
# header line.
#
# Both inputs are keyed on a composite first column before joining:
#   Basenji CSV : <field 1>;<field 2>
#   AI table    : <field 1>:<field 2>;<field 14>
# Rows are kept only when the key is present in both files (default `join`).
rule merge_basenji_and_AI:
    input:
        # expand() is used here only to prepend the directory; the doubled
        # braces keep {condition} as a rule wildcard.
        basenji = expand("{path}/variants_all_conditions_{{condition}}.csv", path=BASENJI_PREDICTION_DIR),
        # NOTE(review): this is the all-conditions AI table and no filter on
        # the {condition} wildcard is visible below - presumably the peak IDs
        # in the key are condition-specific; confirm.
        AI = expand("{path}/quantified_variants_all_conditions.txt", path=ALLELIC_IMBALANCE_DIR)
    output:
        expand("{path}/Basenji_scores_and_allelic_imbalance_{{condition}}.txt", path=CORRELATIONS_DIR)
    # Notes on the shell block:
    # - It is a regular (non-raw) Python string: "\\t" reaches the tools as
    #   "\t", and a backslash at the end of a line joins it with the next
    #   line, so each pipeline reaches bash as one long line.
    # - Command 1 writes the header. 24 column names are echoed, spaces
    #   around the tabs are removed, and columns 14 and 24 (the second
    #   variant_ID and peak_ID) are cut, leaving 22 columns.
    # - Command 2 builds the keyed, key-sorted versions of both inputs (CSV
    #   commas -> tabs; in the AI table ", " -> ",", double quotes removed,
    #   spaces -> tabs), joins them on the key, converts join's space
    #   separators to tabs, applies the same column cut as the header and
    #   appends the rows to the output.
    shell:
        """
        echo -e "variant_peak_ID\\tvariant_ID\\tpeak_ID\\tBasenji_diff_REF_ALT" \
        "\\tBasenji_ref_position_minus1\\tBasenji_ref_position\\tBasenji_ref_position_plus1" \
        "\\tBasenji_alt_position_minus1\\tBasenji_alt_position\\tBasenji_alt_position_plus1" \
        "\\tchr\\tstart\\tend\\tvariant_ID\\tis_indel\\tREF\\tALT\\tAI\\tAI_abs" \
        "\\tpadjust\\tsignificant\\tcondition\\tcondition_label\\tpeak_ID" \
        | sed 's/\\ *\\t\\ */\\t/g' | cut -f1-13,15-23 > {output}

        join -1 1 -2 1 <(cat {input.basenji} | sed 's/,/\\t/g' | awk '{{print $1 ";" $2 "\\t" $0}}' \
        | sort -k1,1) <(cat {input.AI} | sed 's/,\\ /,/g; s/\\"//g; s/\\ /\\t/g' \
        | awk '{{print $1 ":" $2 ";" $14 "\\t" $0}}' | sort -k1,1) | sed 's/\\ /\\t/g' \
        | cut -f1-13,15-23 >> {output}
        """

# Merge variants with motifs (FIMO)
#
# For one {TF}.{TP} condition this rule
#   1. reformats the FIMO hits into a tab-separated motif table (output.TFBS),
#   2. appends three annotation columns to the merged Basenji/AI table
#      (output.AI): overlaps_motif, overlaps_peak and overlaps_TSS.
#
# overlaps_motif and overlaps_peak hold the overlap size reported by
# `bedtools intersect -wao` (0 when there is no overlap); overlaps_TSS holds
# the label "TSS" or "no_TSS".
#
# Every `join` input is sorted on the join field itself (`sort -k1,1`) and
# given without its header line: `join` requires both inputs to be ordered on
# the join field, and a whole-line sort does not guarantee that in every
# locale.
rule merge_variants_and_motifs:
    input:
        fimo = expand("{path}/{{TF}}/{{TP}}/FIMO/combined_motifs/fimo.tsv", path=FIMO_DIR),
        AI = expand("{path}/Basenji_scores_and_allelic_imbalance_{{TF}}.{{TP}}.txt", path=CORRELATIONS_DIR),
        gff = "/g/furlong/genome/D.melanogaster/Dm6/6.37/gff/dmel-all-filtered-r6.37_ProblemCaseFiltered.gff.gz"
    output:
        TFBS = expand("{path}/fimo_TFBS_{{TF}}.{{TP}}.bed", path=TFBS_DIR),
        AI = expand("{path}/Basenji_scores_and_allelic_imbalance_{{TF}}.{{TP}}_annot.txt", path=CORRELATIONS_DIR)
    params:
        # Half-width (bp) of the window, centred on the second "_"-separated
        # field of peak_ID, that is tested for overlap with a TSS.
        summit_TSS_dist = 500
    conda: "../../config/bedtools_env.yml"
    shell:
        """
        # Step 1 - motif table. Drop FIMO comment lines and the first remaining
        # line (presumably the column header), split the sequence name on ":"
        # and the first "-", rebuild interval columns, sort by position and keep
        # lines matching the TF name (case-insensitive).
        # NOTE(review): with pipefail a TF without any matching line makes grep,
        # and therefore the rule, fail - confirm this is intended.
        cat {input.fimo} | grep -v "^#" | sed 's/:/\\t/g; s/-/\\t/' | tail -n +2 \
        | awk '{{OFS="\\t"}} {{print $4, $5+$7-1, $5+$8, $2, $12, $9, $4 "_" $5 "_" $6, $13, $1}}' \
        | sort -k1,1 -k2,2g | grep -i {wildcards.TF} > {output.TFBS}

        # Step 2 - overlaps_motif. Each variant becomes a 1-bp interval and is
        # intersected with the merged motif intervals; the result is joined back
        # onto the table rows through variant_ID (column 2).
        awk 'NR == 1 {{print $0 "\\toverlaps_motif"}}' {input.AI} > {output.AI}.temp
        join -1 1 -2 1 <(cat {input.AI} | tail -n +2 | awk '{{print $2 "\\t" $0}}' | sort -k1,1) \
        <(bedtools intersect -wao -a <(cat {input.AI} | tail -n +2 \
        | sed 's/:/\\t/; s/;/\\t/' | awk '{{print $1 "\\t" $2-1 "\\t" $2 "\\t" $1 ":" $2}}' \
        | sort -k1,1 -k2,2g) -b <(cat {output.TFBS} | cut -f1-3 | bedtools merge -i -) \
        | cut -f4,8 | sort -k1,1 | uniq) | sed 's/\\ /\\t/g' | cut -f2- >> {output.AI}.temp

        # Step 3 - overlaps_peak. Same 1-bp variant intervals, intersected with
        # windows of +/-250 bp around the second "_"-separated field of peak_ID
        # (column 3); joined back through variant_ID.
        awk 'NR == 1 {{print $0 "\\toverlaps_peak"}}' {output.AI}.temp > {output.AI}.temp2
        join -1 1 -2 1 <(cat {output.AI}.temp | tail -n +2 | awk '{{print $2 "\\t" $0}}' | sort -k1,1) \
        <(bedtools intersect -wao -a <(cat {input.AI} | tail -n +2 | cut -f2 \
        | sed 's/:/\\t/g' | awk '{{print $1 "\\t" $2-1 "\\t" $2 "\\t" $1 ":" $2}}' \
        | sort -k1,1 -k2,2g | uniq) -b <(cat {input.AI} | tail -n +2 | cut -f3 \
        | sed 's/_/\\t/g' | awk '{{print $1 "\\t" $2-250 "\\t" $2+250}}' \
        | sort -k1,1 -k2,2g | uniq) | cut -f4,8 | sort -k1,1 | uniq) | sed 's/\\ /\\t/g' \
        | cut -f2- >> {output.AI}.temp2

        # Step 4 - overlaps_TSS. Windows of +/-summit_TSS_dist around the peak
        # position are intersected with 1-bp TSS intervals taken from the GFF
        # (feature start on the + strand, feature end otherwise; mRNA, miRNA,
        # ncRNA, rRNA, snRNA and snoRNA features; "chr" is prefixed to the
        # sequence name). Any overlap is labelled TSS, otherwise no_TSS; joined
        # back through peak_ID (column 3).
        awk 'NR == 1 {{print $0 "\\toverlaps_TSS"}}' {output.AI}.temp2 > {output.AI}
        join -1 1 -2 1 <(cat {output.AI}.temp2 | tail -n +2 | awk '{{print $3 "\\t" $0}}' | sort -k1,1) \
        <(cat {output.AI}.temp2 | tail -n +2 | cut -f3 | sed 's/_/\\t/g' \
        | awk '{{print $1 "\\t" $2-{params.summit_TSS_dist} "\\t" \
        $2+{params.summit_TSS_dist} "\\t" $1 "_" $2 "_" $3}}' \
        | sort -k1,1 -k2,2g | uniq | bedtools intersect -wao -a - -b <(zcat {input.gff} \
        | awk '$3 == "mRNA" || $3 == "miRNA" || $3 == "ncRNA" || $3 == "rRNA" || $3 == "snRNA" \
        || $3 == "snoRNA"' | awk '{{if ($7=="+") {{print "chr" $1 "\\t" $4-1 "\\t" $4}} \
        else {{print "chr" $1 "\\t" $5-1 "\\t" $5}}}}' | sort -k1,1 -k2,2g | uniq) \
        | awk '{{P="no_TSS"; if ($8>0) {{P="TSS"}}; print $0 "\\t" P}}' \
        | cut -f4,9 | sort -k1,1 | uniq) | sed 's/\\ /\\t/g' | cut -f2- >> {output.AI}

        # Remove the two intermediate tables.
        rm {output.AI}.temp*
        """

