configfile: "config.yaml"

import pandas as pd

# Global wildcard constraints.
#   feature: lowercase letters only, so {feature} (transcripts, exons) can
#            never match the "transposon-<i>" file names handled by the
#            dedicated transposon rules.
#   i:       decimal digits only. {i} is converted with int() and used as an
#            index into config['species_ordered'], so any non-numeric value
#            is rejected at rule-matching time instead of raising inside an
#            input function.
wildcard_constraints:
    feature="[a-z]+",
    i="[0-9]+"

# Load the table of subgraphs to build: one row per subgraph, indexed by its
# 'graph' name, with the region described by the chr / start / end columns.
df_subgraphs = pd.read_csv(
    "subgraph_coordinates.csv",
    dtype = {'graph': str, 'chr': str, 'start': int, 'end': int}
).set_index("graph")

# Region string of every subgraph, formatted as "chr:start-end".
df_subgraphs['coord'] = [
    f"{chrom}:{start}-{end}"
    for chrom, start, end in zip(
        df_subgraphs['chr'], df_subgraphs['start'], df_subgraphs['end']
    )
]

# Default target: requests every final file of the workflow.
rule all:
    input:
        # genome-wide (master) annotation tracks
        expand("output/annotation/{feature}.bed", feature = ['transcripts', 'exons']),
        expand(
            "output/annotation/transposon-{i}.bed",
            i = range(len(config['species_ordered']))
        ),

        # per-subgraph graph files, 1D plot and merged Bandage annotation tables
        expand(
            "output/subgraph/{graph}/{filename}",
            graph = df_subgraphs.index,
            filename = [
                "subgraph.gfa",
                "subgraph.og",
                "subgraph-1D.png",
                "annotation.Bandage.csv",
                "annotation_TE.Bandage.csv",
            ]
        ),

        # per-subgraph feature GTFs: genes (transcripts, exons) ...
        expand(
            "output/subgraph/{graph}/{feature}.gtf",
            graph = df_subgraphs.index,
            feature = ['transcripts', 'exons']
        ),
        # ... and one transposon GTF per entry of config['species_ordered']
        expand(
            "output/subgraph/{graph}/transposon-{i}.gtf",
            graph = df_subgraphs.index,
            i = range(len(config['species_ordered']))
        )
        

######################################################################
# PREPROCESS MASTER ANNOTATION FILES FOR GENE 

# Decompress the gzipped gene annotation and drop every line starting with '#!'.
rule decompress_gtf:
    input:
        gz = "datadir/annotation_gene/annotation.gtf.gz"
    output:
        gtf = "output/annotation/annotation.gtf"
    shell:
        "gunzip -c {input.gz} "
        "| grep -v '^#!' "
        "> {output.gtf}"

# Keep the annotation lines that contain "transcript<TAB>" and reduce each one
# to the text before its first ';', with the literal 'gene_id ' and all double
# quotes removed. Consecutive duplicate lines are collapsed by uniq.
rule extract_transcripts_from_gtf:
    input:
        gtf = "output/annotation/annotation.gtf"
    output:
        gtf = "output/annotation/transcripts.gtf"
    shell:
        "cat {input.gtf} "
        "| grep 'transcript\t' "
        "| cut -f 1 -d';' "
        "| sed 's/gene_id //g' "
        "| sed 's/\"//g' "
        "| uniq "
        "> {output.gtf}"

# Reorder the trimmed transcript GTF into a 6-column BED-style table:
# GTF field 1, fields 4-5, field 9, fields 6-7; then sort with bedtools.
rule format_transcripts_to_bed:
    input:
        gtf = "output/annotation/transcripts.gtf"
    output:
        bed = "output/annotation/transcripts.bed"
    shell:
        "paste <(cut -f1 {input.gtf})      "
        "<(cut -f4-5 {input.gtf})      "
        "<(cut -f9 {input.gtf})      "
        "<(cut -f6-7 {input.gtf}) "
        "| bedtools sort > {output.bed}"

# Keep the annotation lines that contain "exon<TAB>" and reduce each one to
# the text before its first ';', with the literal 'gene_id ' and all double
# quotes removed. Consecutive duplicate lines are collapsed by uniq.
rule extract_exons_from_gtf:
    input:
        gtf = "output/annotation/annotation.gtf"
    output:
        gtf = "output/annotation/exons.gtf"
    shell:
        "cat {input.gtf} "
        "| grep 'exon\t' "
        "| cut -f 1 -d';' "
        "| sed 's/gene_id //g' "
        "| sed 's/\"//g' "
        "| uniq "
        "> {output.gtf}"

# Reorder the trimmed exon GTF into a 6-column BED-style table:
# GTF field 1, fields 4-5, field 9, fields 6-7; then sort with bedtools.
rule format_exons_to_bed:
    input:
        gtf = "output/annotation/exons.gtf"
    output:
        bed = "output/annotation/exons.bed"
    shell:
        "paste <(cut -f1 {input.gtf})      "
        "<(cut -f4-5 {input.gtf})      "
        "<(cut -f9 {input.gtf})      "
        "<(cut -f6-7 {input.gtf}) "
        "| bedtools sort > {output.bed}"

######################################################################
# PROCESS MASTER ANNOTATION FILE FOR TRANSPOSON

# Build a 6-column tab-separated table from the '.simple' annotation of the
# species at position {i} of config['species_ordered']:
#   input fields 3-5 | fields 8 and 12 joined with ':' | fields 2 and 6.
# The first line of the pasted result is dropped (awk 'NR>1').
# NOTE(review): presumably the '.simple' file is RepeatMasker-derived and its
# first line is a header - confirm against the data.
rule extract_transposons_from_RepMask_simple:
    input:
        simple = lambda wildcards: "datadir/annotation_te/path_{}.simple".format(
            config['species_ordered'][int(wildcards.i)]
        )
    output:
        bed = temp("output/annotation/transposon-{i}.bed.tmp")
    shell:
        "paste -d'\t'   "
        "<(cut -f3-5 {input.simple})   "
        "<(paste -d: <(cut -f8 {input.simple}) <(cut -f12 {input.simple}) )   "
        "<(cut -f2,6 {input.simple}) "
        "| awk 'NR>1' > {output.bed}"

# Keep only the transposon records whose 5th column is below
# config['te_filter_age'] and whose line contains neither 'Unknown' nor
# 'Simple_repeat'.
#
# All three conditions are evaluated in a single awk call. Snakemake runs
# shell commands with `set -euo pipefail`, and `grep -v` exits with status 1
# when it prints nothing, so a `... | grep -v ... | grep -v ...` pipeline
# would fail the job whenever no record survives the filter. awk exits 0 on
# empty output, which yields a valid empty BED file instead.
rule filter_transposons:
    input:
        "output/annotation/transposon-{i}.bed.tmp"
    output:
        "output/annotation/transposon-{i}.bed"
    params:
        age = config['te_filter_age']
    shell:
        "awk '$5<{params.age} && !/Unknown/ && !/Simple_repeat/' {input} > {output}"

######################################################################
# SUBGRAPH EXTRACTION 

# First extraction pass: cut the region "chr:start-end" of the requested
# subgraph (the 'coord' column of df_subgraphs) out of the full graph with
# odgi extract and write it as GFA via odgi view.
rule extract_subgraph_round1:
    input:
        og = "datadir/odgi_graphs/graph_chop100.og"
    output:
        gfa = temp("output/subgraph/{graph}/subgraph_round1.gfa")
    params:
        region = lambda wildcards: df_subgraphs.loc[wildcards.graph]['coord']
    shell:
        "odgi extract -t2 -i {input.og} -r {params.region} -d0 -o - "
        "| odgi view -i - -g > {output.gfa}"

# List the path fragments present in the round-1 GFA as comma-separated text.
# Takes field 2 (the path name) of every 'P' line, turns each ':' into ',',
# and replaces the last '-' on the line with ',' (the greedy `.*` in the sed
# expression anchors on the final dash), e.g. "name:10-20" -> "name,10,20".
#
# The command is a raw string so that the sed back-references are passed to
# the shell verbatim; in a normal string `\(` is an invalid Python escape
# sequence, which newer Python versions warn about.
rule extract_subgraph_round1_paths:
    input:
        "output/subgraph/{graph}/subgraph_round1.gfa"
    output:
        temp("output/subgraph/{graph}/subgraph_round1.paths.fragments")
    shell:
        r"grep '^P' {input} | cut -f2 | sed 's/:/,/g' | sed 's/\(.*\)-/\1,/' > {output}"

# Turn the comma-separated path fragments of the round-1 subgraph into a BED
# file using script/calculate_superpaths_by_chr.py.
# NOTE(review): judging by the rule and script names, the script presumably
# merges the fragments into one spanning interval per path/chromosome -
# confirm against the script, which is not shown here.
rule calculate_subgraph_round1_superpaths_by_chr:
    input:
        "output/subgraph/{graph}/subgraph_round1.paths.fragments"
    output:
        temp("output/subgraph/{graph}/subgraph_round1.paths.bed")
    script:
        "script/calculate_superpaths_by_chr.py"

# Second extraction pass: extract from the full graph again, this time using
# the BED intervals computed from the round-1 paths (-b) instead of a single
# region string.
rule extract_subgraph_round2:
    input:
        full_graph = "datadir/odgi_graphs/graph_chop100.og",
        regions    = "output/subgraph/{graph}/subgraph_round1.paths.bed"
    output:
        # deliberately not temp(): many files depend on it, and keeping it
        # avoids repeated regeneration
        subgraph = "output/subgraph/{graph}/subgraph.og"
    shell:
        "odgi extract -t2 "
        "-i {input.full_graph} -b {input.regions} "
        "-d0 -o {output.subgraph}"

# Export the final odgi subgraph as GFA.
rule convert_subgraph_round2_to_gfa:
    input:
        og = "output/subgraph/{graph}/subgraph.og"
    output:
        gfa = "output/subgraph/{graph}/subgraph.gfa"
    shell:
        "odgi view -i {input.og} -g "
        "> {output.gfa}"

# Render the 1D PNG of a subgraph: odgi unchop -> odgi sort -> odgi viz.
# Author's note on the sort flags: -Y seems to help with complex graphs;
# "-H: fixes things?" (untested idea, -H is not currently passed).
rule plot_linear_visualisation:
    input:
        og = "output/subgraph/{graph}/subgraph.og"
    output:
        png = "output/subgraph/{graph}/subgraph-1D.png"
    shell:
        "odgi unchop -i {input.og} -o - "
        "| odgi sort -i - -O -P -Y -o - "
        "| odgi viz -i - -o {output.png}"

######################################################################
# GENE ANNOTATION EXTRACTION 
# Select the transcripts overlapping the subgraph's region: intersect the
# master transcript BED with a one-line "chr<TAB>start<TAB>end" interval taken
# from df_subgraphs, keeping the original transcript records (-wa).
rule extract_subgraph_transcripts_to_BED:
    input:
        master = "output/annotation/transcripts.bed"
    output:
        bed = temp("output/subgraph/{graph}/transcripts.bed")
    params:
        chrom = lambda wildcards: df_subgraphs.loc[wildcards.graph]['chr'],
        start = lambda wildcards: df_subgraphs.loc[wildcards.graph]['start'],
        end   = lambda wildcards: df_subgraphs.loc[wildcards.graph]['end']
    shell:
        "bedtools intersect "
        "-a {input.master} "
        "-b <(printf '{params.chrom}\t{params.start}\t{params.end}') "
        "-wa > {output.bed}"

# Select the exons overlapping the subgraph's region: intersect the master
# exon BED with a one-line "chr<TAB>start<TAB>end" interval taken from
# df_subgraphs, keeping the original exon records (-wa).
rule extract_subgraph_exons_to_BED:
    input:
        master = "output/annotation/exons.bed"
    output:
        bed = temp("output/subgraph/{graph}/exons.bed")
    params:
        chrom = lambda wildcards: df_subgraphs.loc[wildcards.graph]['chr'],
        start = lambda wildcards: df_subgraphs.loc[wildcards.graph]['start'],
        end   = lambda wildcards: df_subgraphs.loc[wildcards.graph]['end']
    shell:
        "bedtools intersect "
        "-a {input.master} "
        "-b <(printf '{params.chrom}\t{params.start}\t{params.end}') "
        "-wa > {output.bed}"

# Convert a per-subgraph feature BED (transcripts or exons) into a
# tab-separated, GTF-like table by pasting, in this order:
#   BED col 1 | 'ensembl<TAB>gene' filler | BED cols 2-3 | '.' filler |
#   BED col 6 | '.' filler | BED col 4
# The filler columns come from script/print_text_for_n_rows.py, called with
# (line count of the BED - 1); the last line of its output is then dropped.
# NOTE(review): presumably the script repeats the given text once per
# requested row - confirm against the script, which is not shown here.
#
# The last line is removed with POSIX `sed '$d'` rather than `ghead -n -1`:
# `ghead` only exists where GNU coreutils is installed under a g-prefix
# (e.g. Homebrew on macOS), and a missing command inside <( ... ) does not
# fail the job - it silently produces empty columns.
rule convert_subgraph_feature_from_BED_to_GTF:
    input:
        "output/subgraph/{graph}/{feature}.bed"
    output:
        "output/subgraph/{graph}/{feature}.gtf"
    shell:
        "paste -d'\t' "
        "  <(cut -f1 {input}) "
        "  <(python script/print_text_for_n_rows.py 'ensembl\tgene' $(( $(wc -l < {input}) -1 )) | sed '$d' ) "
        "  <(cut -f2-3 {input}) "
        "  <(python script/print_text_for_n_rows.py '.' $(( $(wc -l < {input}) -1 )) | sed '$d') "
        "  <(cut -f6 {input}) "
        "  <(python script/print_text_for_n_rows.py '.' $(( $(wc -l < {input}) -1 )) | sed '$d') "
        "  <(cut -f4 {input}) "
        "> {output}"

# Run `odgi position -E` with the feature GTF against the full graph
# (graph_chop100.og, not the extracted subgraph) and store the raw output,
# which the colour rules downstream turn into the final Bandage CSV.
rule format_feature_from_GTF_to_BandageCSV:
    input:
        full_graph = "datadir/odgi_graphs/graph_chop100.og",
        features   = "output/subgraph/{graph}/{feature}.gtf"
    output:
        csv = temp("output/subgraph/{graph}/{feature}.Bandage.csv.tmp")
    shell:
        "odgi position "
        "-i {input.full_graph} -E {input.features} -t2 "
        "> {output.csv}"


# Replace the colour column of the transcript Bandage CSV with the single
# colour config["color_transcript"].
# Keeps comma-separated columns 1-2 of the input and pastes next to them a new
# column made of a 'COLOR' header line followed by the output of
# script/print_text_for_n_rows.py (called with the colour and the input's line
# count - 1), with the last two lines of that column dropped.
# NOTE(review): presumably the script repeats the colour once per requested
# row - confirm against the script, which is not shown here.
#
# The last two lines are removed with two POSIX `sed '$d'` passes rather than
# `ghead -n -2`: `ghead` only exists where GNU coreutils is installed under a
# g-prefix (e.g. Homebrew on macOS), and a missing command inside <( ... )
# does not fail the job - it silently produces an empty column.
rule change_color_transcript_BandageCSV:
    input:
        "output/subgraph/{graph}/transcripts.Bandage.csv.tmp"
    output:
        temp("output/subgraph/{graph}/transcripts.Bandage.csv")
    params:
        config["color_transcript"]
    shell:
        "paste -d, "
        "  <(cut -d, -f1,2 {input}) "
        "  <( cat <(printf 'COLOR\n') "
        "         <(python script/print_text_for_n_rows.py '{params}' $(( $(wc -l < {input}) -1 )) ) | sed '$d' | sed '$d' ) "
        "> {output}"

# Replace the colour column of the exon Bandage CSV with the single colour
# config["color_exon"].
# Keeps comma-separated columns 1-2 of the input and pastes next to them a new
# column made of a 'COLOR' header line followed by the output of
# script/print_text_for_n_rows.py (called with the colour and the input's line
# count - 1), with the last two lines of that column dropped.
# NOTE(review): presumably the script repeats the colour once per requested
# row - confirm against the script, which is not shown here.
#
# The last two lines are removed with two POSIX `sed '$d'` passes rather than
# `ghead -n -2`: `ghead` only exists where GNU coreutils is installed under a
# g-prefix (e.g. Homebrew on macOS), and a missing command inside <( ... )
# does not fail the job - it silently produces an empty column.
rule change_color_exon_BandageCSV:
    input:
        "output/subgraph/{graph}/exons.Bandage.csv.tmp"
    output:
        temp("output/subgraph/{graph}/exons.Bandage.csv")
    params:
        config["color_exon"]
    shell:
        "paste -d, "
        "  <(cut -d, -f1,2 {input}) "
        "  <( cat <(printf 'COLOR\n') "
        "         <(python script/print_text_for_n_rows.py '{params}' $(( $(wc -l < {input}) -1 )) ) | sed '$d' | sed '$d' ) "
        "> {output}"

######################################################################
# TRANSPOSON ANNOTATION EXTRACTION 

# Select the transposon records of annotation {i} that overlap the subgraph's
# round-1 path intervals, keeping the original transposon records (-wa).
rule extract_subgraph_transposons_to_BED:
    input:
        annotation = "output/annotation/transposon-{i}.bed",
        regions    = "output/subgraph/{graph}/subgraph_round1.paths.bed"
    output:
        bed = temp("output/subgraph/{graph}/transposon-{i}.bed")
    shell:
        "bedtools intersect "
        "-a {input.annotation} -b {input.regions} "
        "-wa > {output.bed}"

# Convert a per-subgraph transposon BED into a tab-separated, GTF-like table
# by pasting, in this order:
#   BED col 1 | 'ensembl<TAB>gene' filler | BED cols 2-3 | '.' filler |
#   BED col 6 | '.' filler | BED col 4
# The filler columns come from script/print_text_for_n_rows.py, called with
# (line count of the BED - 1); the last line of its output is then dropped.
# NOTE(review): presumably the script repeats the given text once per
# requested row - confirm against the script, which is not shown here.
#
# The last line is removed with POSIX `sed '$d'` rather than `ghead -n -1`:
# `ghead` only exists where GNU coreutils is installed under a g-prefix
# (e.g. Homebrew on macOS), and a missing command inside <( ... ) does not
# fail the job - it silently produces empty columns.
rule convert_subgraph_transposons_from_BED_to_GTF:
    input:
        "output/subgraph/{graph}/transposon-{i}.bed"
    output:
        "output/subgraph/{graph}/transposon-{i}.gtf"
    shell:
        "paste -d'\t' "
        "  <(cut -f1 {input}) "
        "  <(python script/print_text_for_n_rows.py 'ensembl\tgene' $(( $(wc -l < {input}) -1 )) | sed '$d' ) "
        "  <(cut -f2-3 {input}) "
        "  <(python script/print_text_for_n_rows.py '.' $(( $(wc -l < {input}) -1 )) | sed '$d') "
        "  <(cut -f6 {input}) "
        "  <(python script/print_text_for_n_rows.py '.' $(( $(wc -l < {input}) -1 )) | sed '$d') "
        "  <(cut -f4 {input}) "
        "> {output}"

# Run `odgi position -E` with the transposon GTF against the full graph
# (graph_chop100.og, not the extracted subgraph) and store the raw output,
# which is colour-corrected by a separate rule.
rule format_transposon_from_GTF_to_BandageCSV:
    input:
        full_graph = "datadir/odgi_graphs/graph_chop100.og",
        features   = "output/subgraph/{graph}/transposon-{i}.gtf"
    output:
        csv = temp("output/subgraph/{graph}/transposon-{i}.Bandage.csv.tmp")
    shell:
        "odgi position "
        "-i {input.full_graph} -E {input.features} -t2 "
        "> {output.csv}"

# Post-process the raw transposon Bandage CSV with
# script/fix_transposon_colors.py to produce the final per-annotation CSV.
# NOTE(review): judging by the rule and script names, the script presumably
# rewrites the colour column - confirm against the script, which is not
# shown here.
rule fix_color_in_transposon_BandageCSV:
    input:
        "output/subgraph/{graph}/transposon-{i}.Bandage.csv.tmp"
    output:
        temp("output/subgraph/{graph}/transposon-{i}.Bandage.csv")
    script:
        "script/fix_transposon_colors.py"

# Merge the transcript, exon and all transposon Bandage CSVs of a subgraph
# into one file: the first line of the transcript CSV is used as the header,
# followed by every line of every input that does not start with 'NODE'.
rule concatenate_annotations:
    input:
        transcripts = "output/subgraph/{graph}/transcripts.Bandage.csv",
        exons       = "output/subgraph/{graph}/exons.Bandage.csv",
        transposons = expand(
            "output/subgraph/{{graph}}/transposon-{i}.Bandage.csv",
            i = range(len(config['species_ordered']))
        )
    output:
        merged = "output/subgraph/{graph}/annotation.Bandage.csv"
    shell:
        "cat <(head -n1 {input.transcripts}) "
        "<(cat {input} | grep -v '^NODE') "
        "> {output.merged}"

# Merge only the transposon Bandage CSVs of a subgraph into one file: the
# first line of the first input is used as the header, followed by every line
# of every input that does not start with 'NODE'.
rule concatenate_annotations_te_only:
    input:
        transposons = expand(
            "output/subgraph/{{graph}}/transposon-{i}.Bandage.csv",
            i = range(len(config['species_ordered']))
        )
    output:
        merged = "output/subgraph/{graph}/annotation_TE.Bandage.csv"
    shell:
        "cat <(head -n1 {input[0]}) "
        "<(cat {input} | grep -v '^NODE') "
        "> {output.merged}"


