#!/usr/bin/env bash
#!
#! Name of the job:
#SBATCH -J minigraph
#SBATCH --mail-type=END
#SBATCH -n 20 
#SBATCH -N 1
#SBATCH --mem=64GB
#SBATCH -o "./s_out/slurm-%j_minigraph.txt"
#SBATCH -p 2004

# FOR CONDA: ACTIVATE YOUR ENVIRONMENT FIRST
# single line command to run, replace as necessary
# export SBATCH_CMD="bash helloworld.sh"; mkdir -p s_out/; sbatch ~/clst/submit_sbatch_big.sh; export SBATCH_CMD=""

# -N  number of nodes to allocate (how many compute units)
# -n  number of tasks to run (how heavily each node is loaded; correlates with the number of cores used)
# -c  number of CPUs per task (how many processors each task gets; do not change)

# Strict mode: stop at the first failing command, unset variable, or failed
# pipeline stage so that later stages never run on missing or truncated output.
# This must stay below the #SBATCH header: sbatch stops reading directives at
# the first executable line.
set -euo pipefail

# Directory the graph-building step changes into; the per-assembly FASTA files
# are looked up relative to it.
readonly FASTADIR="$HOME/code/malawi_transposon/cloud/genome/longread-decompress/renamed"

# Assembly basenames added to the graph, in this order. The ".renamed.fa"
# suffix is appended to each entry when the file list is built.
readonly -a FASTALIST=(troMau2.0 astCal1.2_ensembl aulStu5.0 otoArg1.0 copChr1.0 rhaChi1.0 rhaChi2.0)

# Output directory and filename prefix for every artefact the script writes
# (<GRAPHNAME>-graph.gfa, -variants.bed, -graph.nodeinfo, -graph.nodelengths).
readonly GRAPHDIR="$HOME/code/malawi_transposon/cloud/graph_genome/malawi_haplochromines_zebra/"
readonly GRAPHNAME="malawi_haplochromines_zebra"

# Assembly handed to minigraph first, ahead of the FASTALIST assemblies.
readonly BASEGENOME="$FASTADIR/mayZeb2.0_ensembl.renamed.fa"

date
echo "BUILDING GRAPH..."

# The assembly names in FASTAFILES are relative, so minigraph must be started
# from FASTADIR. Abort if the directory is missing instead of running in
# whatever directory the job happened to start in.
cd "$FASTADIR" || { echo "ERROR: cannot cd to FASTADIR: $FASTADIR" >&2; exit 1; }

# Append ".renamed.fa" to every entry of FASTALIST.
FASTAFILES=("${FASTALIST[@]/%/.renamed.fa}")

# Fail fast on a missing or unreadable input rather than after minigraph has
# already spent time on the earlier assemblies.
for fasta in "$BASEGENOME" "${FASTAFILES[@]}"; do
    [[ -r "$fasta" ]] || { echo "ERROR: cannot read input FASTA: $fasta" >&2; exit 1; }
done

# The graph is written through a shell redirect, which does not create
# missing parent directories.
mkdir -p "$GRAPHDIR" || { echo "ERROR: cannot create GRAPHDIR: $GRAPHDIR" >&2; exit 1; }

# BASEGENOME goes first, followed by the remaining assemblies in FASTALIST
# order. The array is expanded quoted so each file is exactly one argument.
minigraph -xggs -c -t20 "$BASEGENOME" "${FASTAFILES[@]}" > "$GRAPHDIR/$GRAPHNAME-graph.gfa" \
    || { echo "ERROR: minigraph failed; $GRAPHDIR/$GRAPHNAME-graph.gfa is incomplete" >&2; exit 1; }
echo

date
echo "CALLING VARIANTS..."

# Input and output are named relative to GRAPHDIR; abort if it cannot be
# entered so gfatools never reads or writes in the wrong directory.
cd "$GRAPHDIR" || { echo "ERROR: cannot cd to GRAPHDIR: $GRAPHDIR" >&2; exit 1; }

# Write the output of `gfatools bubble` for the graph to a BED file.
gfatools bubble "$GRAPHNAME-graph.gfa" > "$GRAPHNAME-variants.bed" \
    || { echo "ERROR: gfatools bubble failed on $GRAPHNAME-graph.gfa" >&2; exit 1; }

echo "EXTRACT NODE INFO FROM GFA..."

#######################################
# Print a tab-separated table describing every segment (S-line) of a GFA file.
# Arguments: $1 - path to the GFA file
# Outputs:   header "segment chr start length rank", then one row per S-line
#            holding the segment name and the values of its SN:Z, SO:i, LN:i
#            and SR:i tags (an absent tag yields an empty column)
# Returns:   non-zero, with a message on stderr, if the file cannot be read
#######################################
gfa_node_info() {
    local gfa=$1

    if [[ ! -r "$gfa" ]]; then
        echo "ERROR: cannot read GFA file: $gfa" >&2
        return 1
    fi

    printf "segment\tchr\tstart\tlength\trank\n"

    # One pass over the file. `cut` drops column 3 (the sequence) so awk only
    # ever sees the short name and tag fields. Tags are matched by name and
    # everything after the 5-character "XX:T:" prefix is kept, so a contig
    # name that itself contains ':' is not truncated.
    grep -e '^S' -- "$gfa" \
        | cut -f2,4- \
        | awk -F'\t' -v OFS='\t' '
            {
                chr = ""; start = ""; len = ""; rank = ""
                for (i = 2; i <= NF; i++) {
                    tag = substr($i, 1, 5)
                    val = substr($i, 6)
                    if (tag == "SN:Z:") chr = val
                    else if (tag == "SO:i:") start = val
                    else if (tag == "LN:i:") len = val
                    else if (tag == "SR:i:") rank = val
                }
                print $1, chr, start, len, rank
            }'
}

gfa_node_info "$GRAPHNAME-graph.gfa" > "$GRAPHNAME-graph.nodeinfo"

echo "EXTRACT NODE LENGTH FROM GFA..."

#######################################
# Print "<segment name> TAB <length>" for every segment (S-line) of a GFA file.
# Arguments: $1 - path to the GFA file
# Outputs:   one row per S-line; the length is the value of the LN:i tag
#            (empty if the tag is absent)
# Returns:   non-zero, with a message on stderr, if the file cannot be read
#######################################
gfa_node_lengths() {
    local gfa=$1

    if [[ ! -r "$gfa" ]]; then
        echo "ERROR: cannot read GFA file: $gfa" >&2
        return 1
    fi

    # One pass over the file. `cut` drops column 3 (the sequence) so awk only
    # ever sees the short name and tag fields.
    grep -e '^S' -- "$gfa" \
        | cut -f2,4- \
        | awk -F'\t' -v OFS='\t' '
            {
                len = ""
                for (i = 2; i <= NF; i++) {
                    if (substr($i, 1, 5) == "LN:i:") len = substr($i, 6)
                }
                print $1, len
            }'
}

# Input and output are named relative to GRAPHDIR; abort if it cannot be
# entered so nothing is read from or written to the wrong directory.
cd "$GRAPHDIR" || { echo "ERROR: cannot cd to GRAPHDIR: $GRAPHDIR" >&2; exit 1; }
gfa_node_lengths "$GRAPHNAME-graph.gfa" > "$GRAPHNAME-graph.nodelengths"
date
