#!/usr/bin/make -rRf
# Correct misassemblies using Tigmint
# Written by Shaun Jackman @sjackman
# Contributions by Lauren Coombe

# Usage:
# tigmint-make draft=myassembly reads=myreads

# To run Tigmint and calculate assembly metrics:
# tigmint-make draft=myassembly reads=myreads ref=GRCh38 G=3088269832

# Name of the draft assembly, draft.fa
# The pipeline reads $(draft).fa and prefixes its output files with $(draft).
draft=draft

# Name of the reads, reads.fq.gz, after running longranger basic
# The pipeline reads $(reads).fq.gz.
reads=reads

# Reference genome, ref.fa, for calculating assembly contiguity metrics
# Optional. The "all" target includes the metrics only when both ref and G are set.
#ref=ref

# Size of the reference genome, for calculating NG50 and NGA50
# Optional. Passed as -G to abyss-fac and abyss-samtobreak.
#G=0

# Minimum molecule size
# Passed to tigmint-molecule as -s and encoded in the output file names.
minsize=2000

# Minimum AS/Read length ratio
# Passed to tigmint-molecule as -a and encoded in the output file names.
as=0.65

# Maximum number of mismatches
# Passed to tigmint-molecule as -n and encoded in the output file names.
nm=5

# Maximum distance between reads to be considered the same molecule
# Passed to tigmint-molecule as -d. It is not part of any output file name,
# so changing it alone does not cause existing files to be rebuilt.
dist=100000

# Mapping quality threshold
# Passed to tigmint-molecule as -q. It is not part of any output file name,
# so changing it alone does not cause existing files to be rebuilt.
mapq=1

# Bp of contigs to trim after cutting at error
# Passed to tigmint-cut as -t and encoded in the output file names.
trim=0

# Threshold for number of spanning molecules to consider properly assembled
# Passed to tigmint-cut as -n and encoded in the output file names.
span=20

# Window size for checking for spanning molecules
# Passed to tigmint-cut as -w and encoded in the output file names.
window=1000

# Parameters of ARCS
# c, e and r are passed to arcs as -c, -e and -r and are encoded in the names
# of the ARCS output files.
c=5
e=30000
r=0.05

# Parameters of LINKS
# a and l are passed to LINKS as -a and -l and are encoded in the names of the
# LINKS output files.
a=0.1
l=10

# Number of threads
# Used for bwa mem (-t), samtools (-@), tigmint-cut (-p) and pigz/bgzip.
# NOTE(review): 58 is an unusually large default and may oversubscribe smaller
# machines -- confirm that this value is intended.
t=58

# Run every recipe in a shell that stops at the first failing command and that
# treats a failure anywhere in a pipeline as a failure of the whole pipeline.
# bash is the baseline choice.
SHELL=bash -e -o pipefail
ifneq ($(shell command -v zsh),)
# zsh is installed: use it instead, and have it report the run time and memory
# usage of the commands that it runs.
SHELL=zsh -e -o pipefail
REPORTTIME=1
TIMEFMT=time user=%U system=%S elapsed=%E cpu=%P memory=%M job=%J
export REPORTTIME TIMEFMT
endif

# Determine the directory that contains the tigmint executables, taken to be
# the directory of this makefile, and put it at the front of PATH.
#
# bin is assigned with := so that the shell command runs exactly once, here,
# rather than on every later reference to $(bin) in a recipe. It also makes the
# "ifdef bin" test meaningful: ifdef looks at the stored value, which for a
# recursively expanded variable is the unexpanded (always non-empty) text.
# lastword selects this makefile even if others were read before it.
bin:=$(shell dirname `command -v $(lastword $(MAKEFILE_LIST))`)
ifdef bin
PATH:=$(bin):$(PATH)
endif

# Choose the compressor used to write gzipped output. pigz is preferred, then
# bgzip, both run with $t threads; plain gzip is the fallback when neither is
# installed.
ifneq ($(shell command -v pigz),)
gzip=pigz -p$t
else ifneq ($(shell command -v bgzip),)
gzip=bgzip -@$t
else
gzip=gzip
endif

# When the variable "time" is set, prefix the long-running commands with GNU
# time so that their run time and memory usage are written to <target>.time.
# The gtime executable is used when it is installed; otherwise the command is
# assumed to be available under the name "time".
ifdef time
ifeq ($(shell command -v gtime),)
gtime=command time -v -o $@.time
else
gtime=command gtime -v -o $@.time
endif
endif

# Remove a target whose recipe failed, so that a partial file is never reused.
.DELETE_ON_ERROR:
# Keep intermediate files rather than deleting them at the end of the run.
.SECONDARY:
# Targets that name a command rather than a file.
.PHONY: help version all tigmint arcs
.PHONY: metrics draft_metrics tigmint_metrics arcs_metrics

# Print a short usage message. This is the first target in the file, so it is
# also what runs when no command is given.
help:
	@printf '%s\n' \
		'Tigmint: Correct misassemblies using linked reads' \
		'Usage: tigmint-make [COMMAND]... [PARAMETER=VALUE]...' \
		'Example: tigmint-make tigmint draft=myassembly reads=myreads' \
		'For more information see https://bcgsc.github.io/tigmint/'

# Print the program version and author.
version:
	@printf '%s\n' 'Tigmint 1.1.2' 'Written by Shaun Jackman @sjackman.'

# Run Tigmint and then ARCS. The assembly metrics are added to the goal only
# when both the reference (ref) and the genome size (G) have been given.
all: tigmint arcs $(if $(ref),$(if $(G),metrics))

# Calculate the metrics of the draft, the Tigmint and the ARCS assemblies.
metrics: \
	draft_metrics \
	tigmint_metrics \
	arcs_metrics

# Stem of every file derived from the Tigmint-corrected assembly. It encodes
# the draft, the reads and the parameters that appear in the file names.
tigmint_stem=$(draft).$(reads).as$(as).nm$(nm).molecule.size$(minsize).trim$(trim).window$(window).span$(span).breaktigs

# Stem of every file derived from the ARCS and LINKS scaffolds.
arcs_stem=$(tigmint_stem).$(reads).c$c_e$e_r$r.arcs.a$a_l$l.links

# Suffixes of the three metrics tables that are made for each assembly:
# abyss-fac of the scaffolds, abyss-fac of the scaftigs, and abyss-samtobreak
# of the scaftigs aligned to the reference.
metrics_suffixes=abyss-fac.tsv scaftigs.abyss-fac.tsv scaftigs.$(ref).samtobreak.tsv

# Metrics of the draft assembly.
draft_metrics: $(addprefix $(draft).,$(metrics_suffixes))

# Correct misassemblies using Tigmint.
tigmint: $(tigmint_stem).fa $(draft).tigmint.fa

# Metrics of the assembly after Tigmint.
tigmint_metrics: $(addprefix $(tigmint_stem).,$(metrics_suffixes))

# Scaffold using ARCS.
arcs: $(arcs_stem).fa $(draft).tigmint.arcs.fa

# Metrics of the assembly after ARCS.
arcs_metrics: $(addprefix $(arcs_stem).,$(metrics_suffixes))

# Give the Tigmint result a short, parameter-free name via a symlink.
$(draft).tigmint.fa: $(tigmint_stem).fa
	ln -sf $< $@

# Give the ARCS result a short, parameter-free name via a symlink.
$(draft).tigmint.arcs.fa: $(arcs_stem).fa
	ln -sf $< $@

# BWA

# Build the BWA index of a FASTA file.
%.fa.bwt: %.fa
	bwa index $<

# Align the paired-end reads to the draft genome, drop the unmapped reads
# (-F4) and sort the alignments by their BX tag. The temporary file prefix of
# samtools sort is a fresh name derived from the target.
$(draft).%.sortbx.bam: %.fq.gz $(draft).fa.bwt
	$(gtime) bwa mem -t$t -pC $(draft).fa $< \
		| samtools view -u -F4 \
		| samtools sort -@$t -tBX -T$$(mktemp -u -t $@.XXXXXX) -o $@

# samtools

# Build the faidx index of a FASTA file.
%.fa.fai: %.fa
	samtools faidx $<

# Options shared by both invocations of tigmint-molecule.
molecule_opts=-a$(as) -n$(nm) -q$(mapq) -d$(dist) -s$(minsize)

# Write the molecule extents as a BED file sorted by sequence name, start and
# end position.
%.as$(as).nm$(nm).molecule.size$(minsize).bed: %.sortbx.bam
	$(gtime) $(bin)/tigmint-molecule $(molecule_opts) $< \
		| sort -k1,1 -k2,2n -k3,3n >$@

# Write the molecule extents as a TSV file.
%.as$(as).nm$(nm).molecule.size$(minsize).tsv: %.sortbx.bam
	$(gtime) $(bin)/tigmint-molecule --tsv $(molecule_opts) -o $@ $<

# Render the summary report of a Chromium library from a molecule TSV file.
# The report also writes <stem>.summary.tsv next to the HTML file.
# NOTE(review): the prerequisite must be named <stem>.molecule.tsv, which is
# not the name produced by the TSV rule above -- confirm the intended input.
%.molecule.summary.html: %.molecule.tsv
	Rscript -e 'rmarkdown::render("$(bin)/../summary.rmd", "html_document", "$@", params = list(input_tsv="$<", output_tsv="$*.summary.tsv"))'

# bedtools

# Compute statistics on the depth of coverage of a BED file.
# The output starts with a header row (Rname, Depth, Count, Rsize, Fraction)
# followed by the output of bedtools genomecov. Zero-length intervals (start
# equal to end) are removed by awk before they reach bedtools. The sequence
# sizes are taken from the faidx index of the draft assembly.
%.bed.genomecov.tsv: %.bed $(draft).fa.fai
	(printf "Rname\tDepth\tCount\tRsize\tFraction\n"; awk '$$2 != $$3' $< | bedtools genomecov -g $(draft).fa.fai -i -) >$@

# Calculate depth of coverage statistics from bedtools genomecov.
# Only the rows whose Rname is "genome" and whose Depth is above zero are
# used. Count is summed into Depth_count, and Depth_p25, Depth_p50 and
# Depth_p75 are the first depths at which the running sum of Fraction reaches
# 0.25, 0.50 and 0.75. Depth_IQR is Depth_p75 minus Depth_p25.
# Each $$ is a literal $ for Miller; make would otherwise expand it.
%.genomecov.stats.tsv: %.genomecov.tsv
	mlr --tsvlite \
		then filter '$$Rname == "genome" && $$Depth > 0' \
		then step -a rsum -f Fraction \
		then put -q '@Depth_count += $$Count; if (is_null(@p25) && $$Fraction_rsum >= 0.25) { @p25 = $$Depth }; if (is_null(@p50) && $$Fraction_rsum >= 0.50) { @p50 = $$Depth }; if (is_null(@p75) && $$Fraction_rsum >= 0.75) { @p75 = $$Depth } end { emitf @Depth_count, @p25, @p50, @p75 }' \
		then rename p25,Depth_p25,p50,Depth_p50,p75,Depth_p75 \
		then put '$$Depth_IQR = $$Depth_p75 - $$Depth_p25' \
		$< >$@

# Identify breakpoints

# Run tigmint-cut on the draft assembly and a molecule extents BED file to
# produce the breaktigs FASTA file. The window size, the spanning-molecule
# threshold and the trim length are passed as -w, -n and -t.
%.trim$(trim).window$(window).span$(span).breaktigs.fa: %.bed $(draft).fa $(draft).fa.fai
	$(gtime) $(bin)/tigmint-cut \
		-p$t -w$(window) -n$(span) -t$(trim) \
		-o $@ $(draft).fa $<

################################################################################
# Calculate assembly contiguity and correctness metrics.

# BWA

# Align an assembly to the reference genome with BWA-MEM and compress the SAM
# output with the compressor selected in $(gzip).
%.$(ref).sam.gz: %.fa $(ref).fa.bwt
	bwa mem -xintractg -t$t $(ref).fa $< \
		| $(gzip) >$@

# Align the paired-end reads to an assembly, drop the unmapped reads (-F4) and
# keep the alignments in the order in which bwa emits them (no sorting).
# NOTE(review): samtools view is run without -b, so the on-disk format of the
# .bam target may depend on the samtools version -- confirm that ARCS reads it.
%.$(reads).sortn.bam: %.fa.bwt $(reads).fq.gz
	bwa mem -t$t -pC $*.fa $(reads).fq.gz \
		| samtools view -@$t -h -F4 -o $@

# ARCS

# Create a graph of linked contigs using ARCS.
# One run of arcs is declared to produce all three targets. The parameters c,
# e and r are passed as -c, -e and -r; the remaining options are fixed. -b
# gives the base name of the output files (presumably arcs appends
# "_original.gv" to it -- confirm against the ARCS documentation).
# The barcode counts are written next to the BAM file; that file is not a
# target of any rule in this makefile.
%.$(reads).c$c_e$e_r$r.arcs_original.gv %.$(reads).c$c_e$e_r$r.arcs.dist.gv %.$(reads).c$c_e$e_r$r.arcs.dist.tsv: %.$(reads).sortn.bam %.fa
	arcs -s98 -c$c -l0 -z500 -m4-20000 -d0 -e$e -r$r -v \
		-f $*.fa \
		-b $*.$(reads).c$c_e$e_r$r.arcs \
		-g $*.$(reads).c$c_e$e_r$r.arcs.dist.gv \
		--tsv=$*.$(reads).c$c_e$e_r$r.arcs.dist.tsv \
		--barcode-counts=$<.barcode-counts.tsv \
		$<

# Convert the ARCS graph to LINKS TSV format.
# tigmint-arcs-tsv receives the graph, the output file and the assembly FASTA,
# in that order.
%.$(reads).c$c_e$e_r$r.arcs.links.tsv: %.$(reads).c$c_e$e_r$r.arcs_original.gv %.fa
	$(bin)/tigmint-arcs-tsv $< $@ $*.fa

# Scaffold the assembly using the ARCS graph and LINKS.
# The links TSV is first copied to <base>.tigpair_checkpoint.tsv, where <base>
# is the name given to LINKS with -b (presumably LINKS picks the checkpoint up
# by that name -- confirm against the LINKS documentation). The parameters a
# and l are passed as -a and -l; the remaining options are fixed.
%.$(reads).c$c_e$e_r$r.arcs.a$a_l$l.links.scaffolds.fa %.$(reads).c$c_e$e_r$r.arcs.a$a_l$l.links.assembly_correspondence.tsv: %.$(reads).c$c_e$e_r$r.arcs.links.tsv %.fa
	cp $< $*.$(reads).c$c_e$e_r$r.arcs.a$a_l$l.links.tigpair_checkpoint.tsv
	LINKS -k20 -l$l -t2 -a$a -x1 -s /dev/null -f $*.fa -b $*.$(reads).c$c_e$e_r$r.arcs.a$a_l$l.links

# Rename the scaffolds.
# A header of the form ">scaffoldN,rest" becomes ">N scaffoldN,rest", so the
# sequence name is the text that followed "scaffold" and the original header
# is kept as the comment.
# -E is used rather than the GNU-specific -r so that the extended regular
# expression also works with BSD/macOS sed.
%.links.fa: %.links.scaffolds.fa
	sed -E 's/^>scaffold([^,]*),(.*)/>\1 scaffold\1,\2/' $< >$@

# ABySS

# Break scaffolds into scaftigs. Underscores in the input are replaced by
# tildes before abyss-fatoagp writes the scaftigs FASTA to the target and the
# AGP description to <target>.agp.
%.scaftigs.fa: %.fa
	seqtk seq $< \
		| tr _ '~' \
		| abyss-fatoagp -f $@ >$@.agp

# Calculate assembly contiguity metrics with abyss-fac.
# -G is passed only when the genome size G has been given. With G unset the
# old "-G$G" expanded to a bare "-G" with no value, which could swallow the
# following option. This mirrors how abyss-samtobreak is invoked.
%.abyss-fac.tsv: %.fa
	abyss-fac $(if $G,-G$G) -t500 $< >$@

# Command line of abyss-samtobreak. The genome size is appended as -G only
# when G has been given.
abyss_samtobreak=abyss-samtobreak -l500
ifdef G
abyss_samtobreak+=-G$G
endif

# Calculate assembly contiguity and correctness metrics from the compressed
# alignments of the scaftigs to the reference.
%.samtobreak.tsv: %.sam.gz
	gunzip -c $< \
		| $(abyss_samtobreak) >$@
