# We will be using differential expression results from Li2020.
# As those results are reported for the Neff genome, we first need to liftover
# Neff annotations to C3 to get a correspondence of reported genes into C3.

# Pull the sheet for the desired timepoint out of the Li2020 excel file.
# The workbook path and the timepoint are handed to the script as params.
rule xls_to_diff_expr:
    output:
        join(TMP, "diff_expr", "li2020_de_genes.tsv")
    conda:
        "../envs/viz.yaml"
    params:
        xls = config["rnaseq_li2020"],
        time = 8
    script:
        "../scripts/06_xls_to_diff_expr.py"

# Parse the excel file from Li et al. 2020 and
# extract the gene expression value at each time point.
# The resulting table has rows=genes and cols=time.
rule get_expr_vs_time:
    output:
        join(OUT, "diff_expr", "li2020_expr_vs_time.tsv")
    conda:
        "../envs/viz.yaml"
    params:
        xls = config["rnaseq_li2020"]
    script:
        "../scripts/06_get_expr_vs_time.py"

# Download the published Neff (v1) assembly with genomepy.
# Note: the current assembly url seems broken in genomepy;
# we will use it when fixed, for now, the genome path must be provided in config
rule download_neff:
    output: join(TMP, 'liftover', 'neff_v1.fa')
    params:
        accession = "Acastellanii.strNEFF_v1"
    conda: '../envs/genomepy.yaml'
    shell:
        """
        # genomepy stores the fasta as <genomes_dir>/<name>/<name>.fa rather
        # than at the declared output, so install into a directory next to
        # the output and move the fasta into place afterwards.
        genomes_dir="$(dirname {output})/genomepy"
        genomepy install -g "$genomes_dir" {params.accession}
        mv "$genomes_dir/{params.accession}/{params.accession}.fa" {output}
        """


# Liftover annotations from the published Neff (v1) genome
# to the current assemblies.
# The reference fasta/annotation and the per-strain target fasta are read
# from config; one liftover GFF is produced per {strain} wildcard.
rule liftover_annotations:
    output: join(TMP, 'liftoff', 'neffv1_{strain}_liftover.gff')
    params:
        neff_fa = config['neff_v1']['genome'],
        neff_annot = config['neff_v1']['annot'],
        target_fa = lambda w: config['reference'][w.strain]
    conda: '../envs/liftoff.yaml'
    threads: NCPUS
    # By default liftoff writes "intermediate_files/" and
    # "unmapped_features.txt" into the working directory. Those paths would be
    # shared by the jobs of every strain, so -dir and -u point them to
    # per-output locations instead.
    shell:
        """
        liftoff -g {params.neff_annot} \
            -p {threads} \
            -o {output} \
            -dir {output}.intermediate_files \
            -u {output}.unmapped_features.txt \
            {params.target_fa} \
            {params.neff_fa}
        """

# Intersect liftover coordinates with de novo annotations
# to get a mapping from Neff v1 to new identifiers.
# The output is a two-column table: lifted-over Neff v1 gene ID and the ID of
# an overlapping gene from the strain annotation (one row per overlap).
rule map_neff_c3_identifiers:
    input: join(TMP, 'liftoff', 'neffv1_{strain}_liftover.gff')
    output: join(TMP, 'liftoff', 'neffv1_{strain}_gene_mapping.tsv')
    params:
        c3_annot = lambda w: config['annot'][w.strain]
    conda: '../envs/hic_processing.yaml'
    shell:
        """
        # Work in a private scratch directory: fixed file names in the working
        # directory would be shared (and overwritten) by jobs of other strains.
        tmpdir=$(mktemp -d)
        trap 'rm -rf "$tmpdir"' EXIT

        # GFF files are converted to bed (using awk) and sorted
        awk -vOFS='\t' '$3== "gene" {{
            gsub("ID=gene:","",$9);gsub(";.*", "", $9);print $1,$4,$5,$9}}' \
            {input} \
          | sort -k1,1 -k2,2n > "$tmpdir/neff.bed"
        awk -vOFS='\t' '$3 == "gene" {{
            gsub("ID=","",$9);gsub(";.*", "", $9);print $1,$4,$5,$9}}' \
            {params.c3_annot} \
          | sort -k1,1 -k2,2n > "$tmpdir/strain.bed"

        # The bed files are then intersected; every overlapping pair of genes
        # is reported and lifted-over genes without any overlap are dropped.
        bedtools intersect -a "$tmpdir/neff.bed" -b "$tmpdir/strain.bed" -wao \
          | awk -vOFS='\t' '$8 != "." {{print $4,$8}}' \
          > {output}
        """

# Use the liftover to map gene identifiers from Neff v1 to new assemblies.
# Inputs are the per-strain gene mapping table and the Li2020 table of
# differentially expressed genes; the conversion itself is done by the script.
rule convert_identifiers:
    input:
        mapping = join(TMP, 'liftoff', 'neffv1_{strain}_gene_mapping.tsv'),
        de_genes = join(TMP, 'diff_expr', 'li2020_de_genes.tsv')
    output: join(OUT, 'diff_expr', 'neffv1_liftoff_{strain}_de_genes.tsv')
    conda: '../envs/viz.yaml'
    script: '../scripts/06_convert_identifiers.py'

# Retrieve the DESeq2 experiment generated by nf-core/rnaseq and
# extract differential expression statistics from it.
rule get_c3_diff_expr:
    input:
        join(
            SHARED, "rnaseq", "diff_expr", "results",
            "star_salmon", "deseq2_qc", "deseq2.dds.RData"
        )
    output:
        join(OUT, "diff_expr", "c3_diff_expr.tsv")
    conda:
        "../envs/r_env.yaml"
    shell:
        "Rscript scripts/06_get_diff_expr.R {input} {output}"