#############################################################################
 #
 #  This file is part of Verkko, a software program that assembles
 #  whole-genome sequencing reads into telomere-to-telomere
 #  haplotype-resolved chromosomes.
 #
 #  Except as indicated otherwise, this is a 'United States Government
 #  Work', and is released in the public domain.
 #
 #  File 'README.licenses' in the root directory of this distribution
 #  contains full conditions and disclaimers.
 #
 ##

import glob


##########
#
#  Paths to Verkko components.
#
#    VERKKO - the root directory of your Verkko installation.
#
#    PYTHON - name of your preferred python interpreter, or empty to use
#             whatever is in your environment.
#
#  Tool locations are read from the Snakemake config.  VERKKO falls back to
#  a placeholder string that names the missing setting; the interpreter
#  settings fall back to the empty string.  PERL is handled exactly like
#  PYTHON (presumably the perl interpreter to use -- confirm with callers).
VERKKO = config.get('VERKKO', 'please-set-VERKKO-in-config.yml')
PYTHON, PERL = (config.get(tool, '') for tool in ('PYTHON', 'PERL'))


##########
#
#  Inputs!
#
#  While the shell should expand vars/globs in the input reads, silly
#  advanced users might add those in a hand-generated config file, in
#  which case we would need to expand wildcards ourselves with:
#
#    HIFI_READS  = glob.glob(os.path.expandvars(config['HIFI_READS']))
#    ONT_READS   = glob.glob(os.path.expandvars(config['ONT_READS']))
#
#  But from verkko.sh, we get lists of expanded filenames directly.
#
#  Read lists taken straight from the config; a key that is not present
#  yields None.
HIFI_READS, ONT_READS = config.get('HIFI_READS'), config.get('ONT_READS')
HIC_READS1, HIC_READS2 = config.get('HIC_READS1'), config.get('HIC_READS2')


##########
#
#  Outputs!
#
#  The primary output is 'consensus'.
#
#  The two coverage files are from rule create_final_coverages in the original.
#  Computed here in 5-untip.sm getFinalCoverages
#

#  Placeholder target: 'touch' creates a file named 'emptyfile' in the
#  working directory.  Rule verkko uses it in place of the 6-rukki inputs
#  when rukki is not enabled.
rule emptyfile:
    output:
        'emptyfile'
    shell:
        'touch emptyfile'

#  Final collection rule: takes results from 5-untip, 6-rukki,
#  6-layoutContigs and 7-consensus and copies them to 'assembly.*' files
#  in the working directory.
#
#  The three 6-rukki inputs (nseqgraph, pathstsv, colorscsv) are requested
#  only when config['ruk_enable'] is the string "True".  Otherwise each is
#  replaced by the output of rule emptyfile, so the '-s' test on
#  input.nseqgraph in the shell block selects the branch that derives the
#  noseq graph from the 5-untip graph.
#
rule verkko:
    input:
        graph       = '5-untip/unitig-unrolled-unitig-unrolled-popped-unitig-normal-connected-tip.gfa',
        nseqgraph   = '6-rukki/unitig-unrolled-unitig-unrolled-popped-unitig-normal-connected-tip.noseq.gfa'  if config['ruk_enable'] == "True" else {rules.emptyfile.output},
        layout      = '6-layoutContigs/unitig-popped.layout',
        scfmap      = '6-layoutContigs/unitig-popped.layout.scfmap',
        bam         = '7-consensus/unitig-popped.bam',
        cns         = '7-consensus/unitig-popped.fasta',
        cnshap1     = '7-consensus/unitig-popped.haplotype1.fasta',
        cnshap2     = '7-consensus/unitig-popped.haplotype2.fasta',
        cnsunas     = '7-consensus/unitig-popped.unassigned.fasta',
        pathstsv    = '6-rukki/unitig-unrolled-unitig-unrolled-popped-unitig-normal-connected-tip.paths.tsv'   if config['ruk_enable'] == "True" else {rules.emptyfile.output},
        colorscsv   = '6-rukki/unitig-unrolled-unitig-unrolled-popped-unitig-normal-connected-tip.colors.csv'  if config['ruk_enable'] == "True" else {rules.emptyfile.output},
       #hifi2cov    = '2-processGraph/unitig-unrolled-hifi-resolved.hifi-coverage.csv',   #  These are generated at the
       #ont2cov     = '2-processGraph/unitig-unrolled-hifi-resolved.ont-coverage.csv',    #  end of untip, but are
       #hifi4cov    = '4-processONT/unitig-unrolled-ont-resolved.hifi-coverage.csv',      #  otherwise unused.
       #ont4cov     = '4-processONT/unitig-unrolled-ont-resolved.ont-coverage.csv',
        hifi5cov    = '5-untip/unitig-unrolled-unitig-unrolled-popped-unitig-normal-connected-tip.hifi-coverage.csv',
        ont5cov     = '5-untip/unitig-unrolled-unitig-unrolled-popped-unitig-normal-connected-tip.ont-coverage.csv'
    #  Files the shell block always writes.
    output:
        graph       = 'assembly.homopolymer-compressed.gfa',
        nseqgraph   = 'assembly.homopolymer-compressed.noseq.gfa',
        layout      = 'assembly.homopolymer-compressed.layout',
        hificov     = 'assembly.hifi-coverage.csv',
        ontcov      = 'assembly.ont-coverage.csv',
        cns         = 'assembly.fasta',
        scfmap      = 'assembly.scfmap'
    #  keepinter is compared against the string "False" in the shell block.
    #  The other params are destination names for files that the shell block
    #  writes only under some conditions, which is why they are passed as
    #  params instead of being declared under 'output'.
    params:
        keepinter   = config['keep_intermediate'],
        cnshap1     = 'assembly.haplotype1.fasta',    #  Actually, these are optional outputs.
        cnshap2     = 'assembly.haplotype2.fasta',
        cnsunas     = 'assembly.unassigned.fasta',
        bam         = 'assembly.bam',
        pathstsv    = 'assembly.paths.tsv',
        colorscsv   = 'assembly.colors.csv'
    #  Shell steps, in order:
    #
    #   1. If input.nseqgraph is non-empty: copy it, the haplotype1,
    #      haplotype2 and unassigned fastas, paths.tsv and colors.csv.  If
    #      both copied haplotype fastas are then empty, delete those two
    #      copies and the paths.tsv copy.
    #
    #      Otherwise: awk rewrites each 'S' line of input.graph as
    #      "S", field 2, "*", "LN:i:" + length of field 3, passes all other
    #      lines through unchanged, and pipes the result through
    #      scripts/inject_coverage.py (given input.hifi5cov) to produce
    #      output.nseqgraph.
    #
    #   2. Copy graph, layout, consensus, both coverage files and scfmap.
    #
    #   3. Copy the BAM if it is non-empty.
    #
    #   4. Remove 3-align/split when keepinter is "False".
    #
    #  Doubled braces and doubled backslashes inside the string are escapes
    #  for Snakemake formatting and the Python string literal respectively.
    shell:
        '''
#  Copy the rukki results to the output directory if they exist, remove them
#  if they are empty (no haplotypes walked).
#
#  If rukki is not enabled, inject coverage into the graph and make a noseq version.

if [ -s {input.nseqgraph} ]; then
   cp -p {input.nseqgraph} {output.nseqgraph}
   cp -p {input.cnshap1}   {params.cnshap1}
   cp -p {input.cnshap2}   {params.cnshap2}
   cp -p {input.cnsunas}   {params.cnsunas}
   cp -p {input.pathstsv}  {params.pathstsv}
   cp -p {input.colorscsv} {params.colorscsv}

   if [ ! -s {params.cnshap1} -a ! -s {params.cnshap2} ]; then
      rm -f {params.cnshap1}
      rm -f {params.cnshap2}
      rm -f {params.pathstsv}
   fi

else
   awk < {input.graph} \\
     'BEGIN \\
      {{ \\
        FS="[ \\t]+"; OFS="\\t"; \\
      }} \\
      {{ \\
        if ($1 == "S") {{ \\
          print "S", $2, "*", "LN:i:"length($3); \\
        }} else {{ \\
          print $0; \\
        }} \\
      }}' \\
   | \\
   {PYTHON} {VERKKO}/scripts/inject_coverage.py --allow-absent \\
     {input.hifi5cov} \\
   > {output.nseqgraph}
fi

#  Copy the full assembly to the output directory.

cp -p {input.graph}     {output.graph}
cp -p {input.layout}    {output.layout}
cp -p {input.cns}       {output.cns}
cp -p {input.hifi5cov}  {output.hificov}
cp -p {input.ont5cov}   {output.ontcov}
cp -p {input.scfmap}    {output.scfmap}

if [ -s {input.bam} ]; then
   cp -p {input.bam} {params.bam}
fi

#  Clean up intermediate files.

if [ {params.keepinter} = "False" ] ; then
  rm -rf 3-align/split
fi
        '''


#  Alternative collection rule: copies results from 5-untip,
#  6-layoutContigs and 7-consensus to 'assembly.*' files.  It takes no
#  6-rukki inputs and does not produce a noseq graph.
#
rule cnspath:
    input:
        graph       = '5-untip/unitig-unrolled-unitig-unrolled-popped-unitig-normal-connected-tip.gfa',
        hifi5cov    = '5-untip/unitig-unrolled-unitig-unrolled-popped-unitig-normal-connected-tip.hifi-coverage.csv',
        ont5cov     = '5-untip/unitig-unrolled-unitig-unrolled-popped-unitig-normal-connected-tip.ont-coverage.csv',
        layout      = '6-layoutContigs/unitig-popped.layout',
        scfmap      = '6-layoutContigs/unitig-popped.layout.scfmap',
        consensus   = '7-consensus/unitig-popped.fasta',
        bam         = '7-consensus/unitig-popped.bam',
        cnshap1     = '7-consensus/unitig-popped.haplotype1.fasta',
        cnshap2     = '7-consensus/unitig-popped.haplotype2.fasta',
        cnsunas     = '7-consensus/unitig-popped.unassigned.fasta'
    #  Files the shell block always writes.
    output:
        graph       = 'assembly.homopolymer-compressed.gfa',
        hificov     = 'assembly.hifi-coverage.csv',
        ontcov      = 'assembly.ont-coverage.csv',
        layout      = 'assembly.homopolymer-compressed.layout',
        consensus   = 'assembly.fasta',
        scfmap      = 'assembly.scfmap'
    #  Destination names passed as params rather than declared outputs.
    #  Note that cnsunas is copied unconditionally by the shell block, while
    #  the haplotype fastas and the BAM are copied only when non-empty.
    params:
        cnshap1     = 'assembly.haplotype1.fasta',    #  Actually, these are optional outputs.
        cnshap2     = 'assembly.haplotype2.fasta',
        cnsunas     = 'assembly.unassigned.fasta',
        bam         = 'assembly.bam'
    #  Shell steps: copy graph, layout, both coverage files and consensus;
    #  copy the two haplotype fastas only when both are non-empty; copy the
    #  BAM only when it is non-empty; copy the unassigned fasta and scfmap.
    shell:
        '''
cp -p {input.graph}     {output.graph}
cp -p {input.layout}    {output.layout}
cp -p {input.hifi5cov}  {output.hificov}
cp -p {input.ont5cov}   {output.ontcov}
cp -p {input.consensus} {output.consensus}
if [  -s {input.cnshap1} -a -s {input.cnshap2} ]; then
  cp -p {input.cnshap1}   {params.cnshap1}
  cp -p {input.cnshap2}   {params.cnshap2}
fi
if [ -s {input.bam} ]; then
   cp -p {input.bam} {params.bam}
fi
cp -p {input.cnsunas}   {params.cnsunas}
cp -p {input.scfmap}    {output.scfmap}
        '''

##########
#
#  Local rules.
#
#  If your head node allows I/O tasks, include some of the following tasks
#  as local, to run them on the head node directly, instead of submitting
#  a job to the grid:
#
#    splitONT          - partitions ONT reads for alignment to the
#                        initial graph.
#
#    combineONT        - collects results of ONT-to-graph alignments.
#
#    buildPackages     - collects reads for contig consensus.  uses
#                        lots of memory but does no compute, except for
#                        sequence compression/decompression.
#
#    combineConsensus  - collects results of contig consensus computations.
#
#    configureOverlaps - runs a Canu binary to load read lengths and decide
#                        on batch sizes to compute overlaps.
#
#  Rules named here are executed by the Snakemake host process instead of
#  being submitted as jobs.  verkko, cnspath and emptyfile are defined in
#  this file; correctHiFi and configureFindErrors are not (presumably they
#  live in the included rule files -- confirm).
localrules: verkko, correctHiFi, configureFindErrors, cnspath, emptyfile



##########
#
#  Rules.
#
#  Helper definitions, included ahead of the rule files.
include: 'Snakefiles/functions.sm'

#  Rule files with a 'c' prefix (c1, c2, c4).
include: 'Snakefiles/c1-buildStore.sm'
include: 'Snakefiles/c2-findOverlaps.sm'
include: 'Snakefiles/c4-findErrors.sm'

#  Numbered rule files 1 through 7.  The directory prefixes used by rules
#  verkko and cnspath (5-untip, 6-rukki, 6-layoutContigs, 7-consensus)
#  correspond to these file names.
include: 'Snakefiles/1-buildGraph.sm'
include: 'Snakefiles/2-processGraph.sm'
include: 'Snakefiles/3-splitONT.sm'
include: 'Snakefiles/3-alignONT.sm'
include: 'Snakefiles/3-combineONT.sm'
include: 'Snakefiles/3-alignTips.sm'
include: 'Snakefiles/4-processONT.sm'
include: 'Snakefiles/5-untip.sm'
include: 'Snakefiles/6-rukki.sm'
include: 'Snakefiles/6-layoutContigs.sm'
include: 'Snakefiles/7-extractONT.sm'
include: 'Snakefiles/7-buildPackage.sm'
include: 'Snakefiles/7-generateConsensus.sm'
include: 'Snakefiles/7-combineConsensus.sm'

#  The Hi-C rule file is loaded only when at least one of the two config
#  flags equals the string 'True'.  The flags are checked in order and the
#  second is not looked up if the first already matches.
if any(config[flag] == 'True' for flag in ('withHIC', 'withPOREC')):
    include: 'Snakefiles/8-hicPipeline.sm'
