"""
Snakemake workflow module of the nmrdmr pipeline.

Step 3: first liftover pass, mapping all regulatory elements into a common coordinate system
(that of a reference species).
"""

# Configuration keys this module reads unconditionally.
REQUIRED_CONFIG = ["mapping_ref_sp", "spnames", "assemblies", "ens_release"]

# Validate with an explicit raise instead of `assert`: assertions are stripped when
# Python runs with -O, which would turn a missing key into a less helpful error below.
for i in REQUIRED_CONFIG:
    if i not in config:
        raise ValueError(
            f"Incomplete information in config file, missing {i} please check :)"
        )

# Reference species used as the common coordinate system, and its assembly name.
REF = config["mapping_ref_sp"]
REFASSMBL = config["assemblies"][REF]

# Ensembl release interpolated into the download URL. 'ens_release' is validated
# above, so the '99' fallback is only a safety net.
db = config.get('ens_release', '99')

rule download_ensembl_lastZ:
    """
    Fetch, with rsync, the Ensembl Compara lastZ pairwise alignment archive between the
    reference assembly and the assembly of species {sps}.

    The remote file name is built from the assembly names found in config["assemblies"];
    the Ensembl release comes from the module-level `db` value. The downloaded archive is
    write-protected once created.
    """
    output:
        protected("{}/lastZ/{}.v.{{sps}}.lastz_net.tar.gz".format(LIFTOVER, REFASSMBL))
    params:
        # Name of the archive on the Ensembl server: <ref assembly>.v.<species assembly>
        filename = lambda wildcards: (
            ".v.".join((REFASSMBL, config["assemblies"][wildcards.sps]))
            + ".lastz_net.tar.gz"
        )
    shell:
        "rsync -av  --info=progress2 "
        "rsync://ftp.ensembl.org/ensembl/pub/release-{db}/maf/ensembl-compara/pairwise_alignments/"
        "{params.filename} {output}"


rule extract_lastZ_maf:
    """
    Extract the .maf files (lastZ pairwise alignment) from the ensembl archive
    and concatenate them into a single file.

    The archive is unpacked into a scratch directory next to it. Any leftover scratch
    directory from a previous failed or interrupted run is removed first, so that the
    rule can be re-run without a manual clean-up. The scratch directory is deleted once
    the concatenated .maf has been written, and known syntax problems of some Ensembl
    .maf files are patched in place with sed.
    """
    input: LIFTOVER+"/lastZ/"+REFASSMBL+".v.{sps}.lastz_net.tar.gz"
    output: LIFTOVER+"/lastZ/"+REFASSMBL+".v.{sps}_lastz_net.all.maf"
    shell:
        "rm -rf {LIFTOVER}/lastZ/{REFASSMBL}.v.{wildcards.sps}.lastz_net; " #Start from a clean scratch directory (a failed earlier run may have left one behind)
        "mkdir -p {LIFTOVER}/lastZ/{REFASSMBL}.v.{wildcards.sps}.lastz_net; "
        "tar -xvf {input} -C {LIFTOVER}/lastZ/{REFASSMBL}.v.{wildcards.sps}.lastz_net "
        "--strip-components 1; "
        "gunzip {LIFTOVER}/lastZ/{REFASSMBL}.v.{wildcards.sps}.lastz_net/*.gz || true; " #Sometimes ensembl does not gzip maf --> don't throw error if unable to unzip
        "rm {LIFTOVER}/lastZ/{REFASSMBL}.v.{wildcards.sps}.lastz_net/README.maf || true; " #Sometimes ensembl ships a README.maf --> it must not be concatenated with the real maf files
        "find '{LIFTOVER}/lastZ/{REFASSMBL}.v.{wildcards.sps}.lastz_net/' -name '*.maf' -exec cat '{{}}' ';' > {output}; "
        "rm -r {LIFTOVER}/lastZ/{REFASSMBL}.v.{wildcards.sps}.lastz_net; "
        "sed -i.bak 's/^a#/#/g; s/^ score/a score/g' {output}; rm {output}.bak" #fix bad .maf syntax in some lastZ from ensembl

rule correct_minus_strands:
    """
    Make minus-strand entries of the ensembl .maf follow UCSC .maf conventions.

    For every line whose 5th field is "-", the 3rd field is replaced by
    (6th field - 3rd field - 4th field); all other lines are printed unchanged.
    """
    input:
        "{}/lastZ/{}.v.{{sps}}_lastz_net.all.maf".format(LIFTOVER, REFASSMBL)
    output:
        temp("{}/lastZ/{}.v.{{sps}}_lastz_net.all.ok.maf".format(LIFTOVER, REFASSMBL))
    shell:"""
    awk '($5 == "-") {{$3=$6-$3-$4}};1' {input} > {output}
    """

rule maf_to_psl:
    """
    Turn the strand-corrected .maf alignment into .psl format with mafToPsl.

    The two species names passed to mafToPsl are looked up in config["spnames"].
    The rule fails if the resulting .psl file is empty.
    """
    input:
        "{}/lastZ/{}.v.{{sps}}_lastz_net.all.ok.maf".format(LIFTOVER, REFASSMBL)
    output:
        temp("{}/lastZ/{}.v.{{sps}}_lastz_net.all.psl".format(LIFTOVER, REFASSMBL))
    conda:
        "envs/ucsc_maftopsl.yaml"
    params:
        # Name of the species given by the {sps} wildcard
        s = lambda wildcards: config["spnames"][wildcards.sps],
        # Name of the reference species
        ref = config["spnames"][REF]
    shell:
        "mafToPsl {params.s} {params.ref} {input} {output} && [[ -s {output} ]]"

rule psl_to_chain:
    """
    Turn the .psl alignment into .chain format with pslToChain.

    The rule fails if the resulting .chain file is empty.
    """
    input:
        "{}/lastZ/{}.v.{{sps}}_lastz_net.all.psl".format(LIFTOVER, REFASSMBL)
    output:
        "{}/lastZ/{}.v.{{sps}}_lastz_net.all.chain".format(LIFTOVER, REFASSMBL)
    conda:
        "envs/ucsc_psltochain.yaml"
    shell:
        "pslToChain {input} {output} && [[ -s {output} ]]"

rule chain_swap:
    """
    Exchange query and target species in a chain file with chainSwap.

    The input pattern is taken from CHAIN, which is defined outside this part of the
    workflow. The rule fails if the swapped chain file is empty.
    """
    input:
        CHAIN
    output:
        "{}/lastZ/{}.v.{{sps}}_lastz_net.all.ok.chain".format(LIFTOVER, REFASSMBL)
    conda:
        "envs/ucsc_chainswap.yaml"
    shell:
        "chainSwap {input} {output} && [[ -s {output} ]]"

rule to_bed4:
    """
    Produce the 4-column .bed file of regions that liftover requires.

    If at least one line of the input does not have exactly 4 tab-separated fields,
    the first three columns are kept and the line number is written as ID in the
    4th column. Otherwise the input is copied as is.
    """
    input:
        "{}/{{reg}}/{{reg}}-{{species}}-{{tissue}}.bed".format(REGMAP_DIR)
    output:
        temp("{}/{{reg}}/{{reg}}-{{species}}-{{tissue}}.bed4".format(REGMAP_DIR))
    shell:"""
        if ! awk -F '\\t' 'NF!=4{{exit 1}}' {input}; 
        then awk '{{print $1, $2, $3, NR}}' {input} > {output}; 
        else cp {input} {output}; fi
    """

rule liftover:
    """
    Convert region coordinates of species {sps} to the selected reference species
    with liftOver.

    Regions that could not be converted are written to a separate unMapped file.
    -minMatch comes from config["minMatch"] (default 0.001); the extra flag comes from
    config["liftover_flag"] (default '-multiple').
    """
    input:
        chain = "{}/lastZ/{}.v.{{sps}}_lastz_net.all.ok.chain".format(LIFTOVER, REFASSMBL),
        regions = "{}/{{reg}}/{{reg}}-{{sps}}-{{tissue}}.bed4".format(REGMAP_DIR)
    output:
        regions = temp("{}/{{reg}}-{{sps}}-{{tissue}}_mappedTo_{}.bed".format(LIFTOVER, REF)),
        unMapped = temp("{}/unMapped/{{reg}}-{{sps}}-{{tissue}}".format(LIFTOVER))
    params:
        minMatch = config.get("minMatch", 0.001),
        multiple = config.get('liftover_flag', '-multiple')
    conda:
        "envs/ucsc_liftover.yaml"
    shell:
        "liftOver {input.regions} {input.chain} {output.regions} {output.unMapped} "
        "-minMatch={params.minMatch} {params.multiple}"

# Maps the value of the liftover flag to the extra option handed to
# scripts/filter_by_coverage.py: nothing when '-multiple' is used, '--no_multi' when
# the flag is empty.
INV = {
    "-multiple": "",
    "": "--no_multi",
}

rule filter_by_coverage:
    """
    Lenient filter to retain liftovered regions if the size of the converted regions >= 30% of the original region.
    In case of multiple matches, the largest region is selected.

    The original regions are read from the same 4-column .bed4 file that rule to_bed4
    produces and rule liftover consumes, so that region IDs agree between the original
    and the lifted-over coordinates. The coverage cutoff comes from
    config["liftover_coverage_cutoff"] (default 0.30). The filtered output is finally
    sorted with bedtools.
    """
    input: outregions = LIFTOVER+"/{reg}-{sps}-{tissue}_mappedTo_{ref}.bed",
           inregions = REGMAP_DIR+"/{reg}/{reg}-{sps}-{tissue}.bed4"
    output: regions = temp(LIFTOVER+"/{reg}-{sps}-{tissue}_mappedTo_{ref}.filtered.bed")
    conda: "envs/bedtools_py.yaml"
    # `m` is the script option matching the liftover flag (see INV)
    params: cutoff = config.get("liftover_coverage_cutoff", 0.30), m = INV[config.get('liftover_flag', '-multiple')]
    shell:
        "python scripts/filter_by_coverage.py -q {input.inregions} -m {input.outregions} -c {params.cutoff} -o {output} {params.m} && "
        "bedtools sort -i {output} > {output}_temp && mv {output}_temp {output}"