# Load workflow settings.
configfile: "config.yaml"

# Number of shuffled replicates (s1 .. s<n_iter>) requested by the concatenate_* rules.
n_iter = config["n_iterations"]

# Default target: every prepared input and every final result for the astCal sample.
rule all:
    input:
        expand(
            "input/astCal-{name}",
            name=[
                "bubble.bed",
                "te.bed",
                "totalsizes.txt",
                "permitted_regions.bed",
            ],
        ),
        expand(
            "results/astCal-{name}",
            name=[
                "intersect–te_in_flex.bed",
                "intersect–te_as_whole.bed",
                "stats_flex.csv",
                "stats_genomewide.csv",
                "stats_counts_perfect.csv",
                "stats_counts_madeofTE.csv",
            ],
        )

#####################################################################################

# Convert the raw variant intervals into a 6-column, sorted BED file.
# Output columns (tab-separated):
#   1-3  first three columns of the input, copied through
#   4    name built as "<col1>:<col2>-<col3>"
#   5    interval length, computed as col3 - col2
#   6    literal "*"
# Rows whose computed length is not greater than 0 are dropped before sorting.
rule create_input_bed_bubble:
    input:
        "datadir_local/{bb}-variants_raw.bed"
    output:
        "input/{bb}-bubble.bed"
    # `{{` / `}}` are Snakemake escapes for literal awk braces; `\t` is expanded
    # by Python, so awk receives real tab characters inside its string literals.
    shell:
        "awk '{{print $1 \"\t\" $2 \"\t\" $3 \"\t\" $1 \":\" $2 \"-\" $3 \"\t\" $3-$2 \"\t*\"}}' {input} "
        "| awk '$5 > 0' "
        "| bedtools sort > {output}"

# Rearrange the TE annotation table into a 6-column, sorted BED file.
# Output columns (tab-separated), taken from the tab-delimited input:
#   1-3  input columns 3-5
#   4    name built by joining input columns 11, 8, 1 and 7 with ":"
#   5-6  input columns 2 and 6
# The first line of the pasted table is discarded (NR>1) — presumably the
# header row of the input; confirm against the .simple file format.
rule create_input_bed_transposon:
    input:
        "datadir_local/{bb}-TE.simple"
    output:
        "input/{bb}-te.bed"
    # Each <(...) is a bash process substitution feeding one column group to paste.
    shell:
        "paste -d'\t' "
        "  <(cut -f3-5 {input}) "
        "  <(paste -d: <(cut -f11 {input}) <(cut -f8 {input}) <(cut -f1 {input}) <(cut -f7 {input}) ) "
        "  <(cut -f2,6 {input}) "
        "| awk 'NR>1' | bedtools sort > {output}"

# Produce the per-sample total-sizes file from the bubble BED and the
# chromosome-size table. All logic lives in script/calculate_total_sizes.py,
# which receives the inputs under the names `flex` and `genomewide`.
# NOTE(review): the names suggest one total for the bubble ("flex") regions and
# one for the whole genome — confirm against the script.
rule calculate_total_sizes:
    input:
        flex = "input/{bb}-bubble.bed",
        genomewide = "datadir_local/{bb}.chrsizes"
    output:
        "input/{bb}-totalsizes.txt"
    script:
        "script/calculate_total_sizes.py"

# Build a BED file of merged ranges for segments that pass the coverage filter.
# Pipeline:
#   1. paste columns 2-5 of the segment info table next to the first two
#      comma-separated fields of the coverage table (6 columns in total);
#   2. keep rows with column 4 equal to 0 and column 6 above `params.coverage`;
#   3. replace column 3 by column2 + column3 (the assignment makes awk rebuild
#      the record with single spaces, hence the space-delimited cut afterwards;
#      a row whose sum is 0 would be dropped by this awk pattern);
#   4. keep columns 1,2,3,5,6 as tab-separated text and merge with bedtools.
# NOTE(review): `bedtools merge` expects position-sorted input and nothing here
# sorts it — presumably the segment tables are already ordered; confirm.
rule get_covered_segments:
    input:
        "datadir_local/{bb}-segments_info.txt",
        "datadir_local/{bb}-segments_cov.txt"
    output:
        "input/{bb}-covered_ranges.bed"
    params:
        coverage = 0.9 # threshold for segment to be covered
    # `--output-delimiter` is a GNU cut option. Use `gcut` (the Homebrew
    # coreutils name on macOS) when it exists, otherwise fall back to plain
    # `cut`, which is GNU cut on typical Linux systems.
    shell:
        "CUT=$(command -v gcut || echo cut); "
        "paste -d'\t' "
        "  <(cut -f2-5 {input[0]}) <(\"$CUT\" -d, -f1-2 --output-delimiter='\t' {input[1]}) "
        "| awk '$4==0' | awk '$6>{params.coverage}' | awk '$3=$2+$3' "
        "| \"$CUT\" -d' ' -f1,2,3,5,6 --output-delimiter='\t' "
        "| bedtools merge -i - "
        "> {output}"

# Grow every bubble interval by a fixed flank on both sides: the flanking
# intervals produced by `bedtools flank` are concatenated with the original
# bubbles, sorted, and merged into contiguous regions.
rule get_expanded_bubble_regions:
    input:
        bubble = "input/{bb}-bubble.bed",
        chrsizes = "datadir_local/{bb}.chrsizes"
    output:
        "input/{bb}-bubble_expanded.bed"
    params:
        flank = 5  # width passed to both -l and -r
    shell:
        "bedtools flank -i {input.bubble} -g {input.chrsizes} "
        "-l {params.flank} -r {params.flank} "
        "| cat - {input.bubble} "
        "| bedtools sort -i - "
        "| bedtools merge -i - "
        "> {output}"

# Permitted regions = parts of the covered ranges that overlap the expanded
# bubble regions (plain `bedtools intersect`, covered ranges as -a).
rule get_permitted_regions:
    input:
        covered = "input/{bb}-covered_ranges.bed",
        expanded = "input/{bb}-bubble_expanded.bed"
    output:
        "input/{bb}-permitted_regions.bed"
    shell:
        "bedtools intersect -a {input.covered} -b {input.expanded} > {output}"

#####################################################################################

# Report the portions of each TE annotation (-a) that overlap the observed
# bubble intervals (-b).
rule intersect_for_te_in_flex:
    input:
        te = "input/{bb}-te.bed",
        bubble = "input/{bb}-bubble.bed"
    output:
        "results/{bb}-intersect–te_in_flex.bed"
    shell:
        "bedtools intersect -a {input.te} -b {input.bubble} > {output}"

# Pair every TE annotation (-a) with the observed bubbles (-b) using -wao, so
# each output row carries both full records plus the overlap length.
# Downstream filtering is meant to use the overlap length ($13) together with
# $11 and the TE length ($3-$2).
rule intersect_for_te_as_whole:
    input:
        te = "input/{bb}-te.bed",
        bubble = "input/{bb}-bubble.bed"
    output:
        "results/{bb}-intersect–te_as_whole.bed"
    shell:
        "bedtools intersect -a {input.te} -b {input.bubble} -wao > {output}"

# Run script/calculate_te_percentage.py on the TE-in-bubble intersection, with
# the single positional parameter "flexible".
# NOTE(review): the parameter presumably selects which total size is used as
# the denominator — confirm in the script.
rule calculate_te_percent_flex:
    input:
        bed = "results/{bb}-intersect–te_in_flex.bed",
        totalsizes = "input/{bb}-totalsizes.txt"
    output:
        # Intermediate file consumed by concatenate_flex_stats; flagged as
        # temporary by the author but not wrapped in temp().
        "results/{bb}-stats_flex.csv.tmp"
    params:
        "flexible"
    script:
        "script/calculate_te_percentage.py"

# Run script/calculate_te_percentage.py on the full TE BED, with the single
# positional parameter "genomewide".
# NOTE(review): the parameter presumably selects which total size is used as
# the denominator — confirm in the script.
rule calculate_te_percent_genomewide:
    input:
        bed = "input/{bb}-te.bed",
        totalsizes = "input/{bb}-totalsizes.txt"
    output:
        "results/{bb}-stats_genomewide.csv"
    params:
        "genomewide"
    script:
        "script/calculate_te_percentage.py"

# Derive two count tables from the -wao TE/bubble intersection of the observed
# data. The work is done by script/count_intersections.py, which writes the
# outputs named `perfect` and `madeofTE`.
# NOTE(review): the exact definitions of "perfect" and "madeofTE" live in the
# script and are not visible here.
rule count_intersections:
    input:
        "results/{bb}-intersect–te_as_whole.bed"
    output:
        # Both files are intermediates consumed by the concatenate_counts_* rules.
        perfect = "results/{bb}-stats_counts_perfect.csv.tmp", # temp
        madeofTE = "results/{bb}-stats_counts_madeofTE.csv.tmp" # temp
    script:
        "script/count_intersections.py"

#####################################################################################
# SHUFFLE VERSION OF THE COMMANDS

# Create shuffled replicate number {i} of the bubble intervals.
# `-chrom` keeps each interval on its original chromosome and `-incl`
# restricts placement to the permitted regions; the result is re-sorted.
# NOTE(review): no `-seed` is passed, so replicates are not reproducible
# between runs — confirm this is intended.
rule shuffle_bubbles:
    input:
        bubble = "input/{bb}-bubble.bed",
        permitted = "input/{bb}-permitted_regions.bed",
        chrsizes = "datadir_local/{bb}.chrsizes"
    output:
        "output/{bb}/bubble/s{i}.bed"  # intended as temporary
    shell:
        "bedtools shuffle "
        "-i {input.bubble} "
        "-chrom "
        "-incl {input.permitted} "
        "-g {input.chrsizes} "
        "| bedtools sort "
        "> {output}"

# Shuffled counterpart of intersect_for_te_in_flex: overlap the TE annotations
# (-a) with shuffled bubble replicate {i} (-b).
rule intersect_for_te_in_flex_shuffled:
    input:
        te = "input/{bb}-te.bed",
        bubble = "output/{bb}/bubble/s{i}.bed"
    output:
        "output/{bb}/intersect/s{i}.bed"  # intended as temporary
    shell:
        "bedtools intersect -a {input.te} -b {input.bubble} > {output}"

# Shuffled counterpart of intersect_for_te_as_whole: -wao pairing of the TE
# annotations (-a) with shuffled bubble replicate {i} (-b).
# Downstream filtering is meant to use the overlap length ($13) together with
# $11 and the TE length ($3-$2).
rule intersect_for_te_as_whole_shuffled:
    input:
        te = "input/{bb}-te.bed",
        bubble = "output/{bb}/bubble/s{i}.bed"
    output:
        "output/{bb}/intersect_whole/s{i}.bed"  # intended as temporary
    shell:
        "bedtools intersect -a {input.te} -b {input.bubble} -wao > {output}"

# Shuffled counterpart of calculate_te_percent_flex: run
# script/calculate_te_percentage.py with the positional parameter "flexible"
# on the intersection obtained from shuffled replicate {i}.
rule calculate_te_percent_flex_shuffled:
    input:
        bed = "output/{bb}/intersect/s{i}.bed",
        totalsizes = "input/{bb}-totalsizes.txt"
    output:
        "output/{bb}/stats/s{i}-stats_flex.csv"  # intended as temporary
    params:
        "flexible"
    script:
        "script/calculate_te_percentage.py"

# Shuffled counterpart of count_intersections: run
# script/count_intersections.py on the -wao intersection of shuffled
# replicate {i}, writing the outputs named `perfect` and `madeofTE`.
# NOTE(review): the exact definitions of "perfect" and "madeofTE" live in the
# script and are not visible here.
rule count_intersections_shuffled:
    input:
        "output/{bb}/intersect_whole/s{i}.bed"
    output:
        perfect = "output/{bb}/stats/s{i}-stats_counts_perfect.csv", #temp
        madeofTE = "output/{bb}/stats/s{i}-stats_counts_madeofTE.csv" #temp
    script:
        "script/count_intersections.py"

#####################################################################################
# CONCATENATE FINAL RESULTS
# Combine the observed flex statistics with those of all n_iter shuffled
# replicates into one CSV.
# The first input is the observed table for the SAME sample as the output
# ({bb}); it was previously hard-coded to astCal, which would have mixed
# astCal's observed values into any other sample's result.
rule concatenate_flex_stats:
    input:
        "results/{bb}-stats_flex.csv.tmp",
        expand("output/{{bb}}/stats/s{i}-stats_flex.csv", i = range(1, n_iter+1))
    output:
        "results/{bb}-stats_flex.csv"
    # Emit the first line of the observed table once, then every line from all
    # inputs that does not contain 'te_class' (assumed to match only the
    # header rows — confirm against the script output).
    shell:
        "cat <(head -n1 {input[0]}) "
        "    <(cat {input} | grep -v 'te_class') > {output}"

# Combine the observed "perfect" counts with those of all n_iter shuffled
# replicates into one CSV.
# The first input is the observed table for the SAME sample as the output
# ({bb}); it was previously hard-coded to astCal, which would have mixed
# astCal's observed values into any other sample's result.
rule concatenate_counts_perfect:
    input:
        "results/{bb}-stats_counts_perfect.csv.tmp",
        expand("output/{{bb}}/stats/s{i}-stats_counts_perfect.csv", i = range(1, n_iter+1))
    output:
        "results/{bb}-stats_counts_perfect.csv"
    # Emit the first line of the observed table once, then every line from all
    # inputs that does not contain 'te_class' (assumed to match only the
    # header rows — confirm against the script output).
    shell:
        "cat <(head -n1 {input[0]}) "
        "    <(cat {input} | grep -v 'te_class') > {output}"

# Combine the observed "madeofTE" counts with those of all n_iter shuffled
# replicates into one CSV.
# The first input is the observed table for the SAME sample as the output
# ({bb}); it was previously hard-coded to astCal, which would have mixed
# astCal's observed values into any other sample's result.
rule concatenate_counts_madeofTE:
    input:
        "results/{bb}-stats_counts_madeofTE.csv.tmp",
        expand("output/{{bb}}/stats/s{i}-stats_counts_madeofTE.csv", i = range(1, n_iter+1))
    output:
        "results/{bb}-stats_counts_madeofTE.csv"
    # Emit the first line of the observed table once, then every line from all
    # inputs that does not contain 'te_class' (assumed to match only the
    # header rows — confirm against the script output).
    shell:
        "cat <(head -n1 {input[0]}) "
        "    <(cat {input} | grep -v 'te_class') > {output}"
