#!/bin/bash

set -e
# Print the command-line usage summary to stdout and terminate the script
# with exit status 0. Takes no arguments; reads $0 for the program name.
# The option names listed here must match the "case" arms of the argument
# parser further down in this file.
function print_help() {
  echo "Usage: $0 -f genomes_info -s/-l -r read.fq [option]"
  echo "       paired-end: $0 -f genomes_info -s -p -r read.fq --species-level --strain-level"
  echo ""
  echo "Strain-level metagenomic profiling using pangenome graphs with PanTax"
  echo ""
  echo "Author: Wenhai Zhang"
  echo "Date:   May 2024"
  echo ""
  echo "    General options:"
  echo "        -f, --genomesInformation file:    A list of reference genomes in specified format. (Mandatory)"
  echo "        -db dir                           Name for pantax DB (default: pantax_db)."
  echo "        -T dir                            Temporary directory (default: pantax_db_tmp)."
  echo "        -n, --next                        Keep the temporary folder for later use at the strain level (resume)."
  echo "        -t, --threads int                 Number of processes to run in parallel. (default: all available)"
  echo "        -v, --verbose                     Detailed database build log."
  echo "        --vg file                         Path to vg executable file."
  echo "        --debug                           Keep the temporary folder for any situation."
  echo "        --help, -h                        Print this help message."
  echo "        --version                         Print the version info."
  echo "    Database creation:"
  echo "        --create                          Create the database only."
#   echo "        --mode                            0 (use all genomes), 1 (use sylph filter genomes, dataset specificity)."
  echo "        --fast                            Create the database using genomes filtered by sylph query instead of all genomes."
  echo "        -g, --save                        Save species graph information."
  echo "            --lz                          Serialized zip graph file saved with lz4 format (for save option)."
  echo "            --zstd                        Serialized zip graph file saved with zstd format (for save option)."
#   echo "            --h5                          Serialized zip graph file saved with hdf5 format (only used for experimental graph parsing)."
  echo "        --force                           Force to rebuild pangenome."
  echo "        -e file                           Path to pangenome building tool executable file. (default: pggb)"
  echo "        -A, --ani float                   ANI threshold for sylph query result filter. (default: 99)"
#   echo "        --parallel str                    Multiple species at once (default: True/on)."
  echo "    Index construction(for vg giraffe):"
  echo "        --index                           Create the index only."
  echo "        --best                            Best autoindex, which corresponds to vg autoindex. (only used with -s)"
  echo "        --fast-aln                        Long read fast alignment with vg instead of Graphaligner. (only used with -l)"
  echo "    Read alignment:"
  echo "        -r, --fastq-in file               Read and align FASTQ-format reads from FILE (two are allowed with -p)."
  echo "        -s, --short-read                  Short read alignment."
  echo "        -p, --paired                      For paired-end alignment."
  echo "        -l, --long-read                   Long read alignment."
  echo "        -lt, --long-read-type str         Long read type (hifi, clr, ontr9, ontr10). Set precise-clipping based on read type."
  echo "        --precise-clipping float          clip the alignment ends with arg as the identity cutoff between correct / wrong alignments. (default: 0.66)"
  echo "    Abundance calculation:"
  echo "        --species-level | --species       Species abundance calculation."
  echo "        --strain-level | --strain         Strain abundance calculation."
  echo "        --solver str                      MLP solver. (options: gurobi, cbc, glpk, highs. default: gurobi)"
#   echo "        -m, --select                        Select output pangenome result."
#   echo "        --parallel-poa                    Parallel POA."
#   echo "        -ds, --designated_species         Only return the strain result of designated species."
# echo "        --sample int                        Sampling nodes(500000) are used for the graph with too many nodes."
  echo "        -a float                          Species with relative abundance above the threshold are used for strain abundance estimation. (default: 0.0001)"
#   echo "        -fr float                         Unique trio nodes fraction. (default: multi species 0.3/ single species 0.43)"
#   echo "        -fc float                         Unique trio nodes mean count fraction (default: 0.46)."
  echo "        -fr float                         The fraction of strain-specific triplet nodes covered by reads for one strain. The larger, the stricter."
  echo "                                          (default: short multi species 0.3/ short single species 0.43/ long multi species 0.5)"
  echo "        -fc float                         The divergence between first rough and second refined strain abundance estimates. The smaller, the stricter. (default: 0.46)"
  echo "        -sh, --shift bool                 Shifting fraction of strain-specific triplet nodes. (multi-species: off, single-species: on)"
  echo "        -sd float                         Coverage depth difference between the strain and its species, with only one strain. (default: 0.2)"
#   echo "        -sr float                         Single species strain level coverage ratio. (default: 0.85)"
  echo "        -sr float                         Rescued strain retention score. (default: 0.85)"
#   echo "        --filter str                      MAPQ-based filter (default: True/on)."
  echo "        --min_cov int                     Minimum coverage depth required per strain. (default: 0)"
  echo "        --min_depth int                   Graph nodes with sequence coverage depth more than <min_depth>. (default: 0)"
  echo "        -gt int                           Gurobi threads. (default: 1)"
  echo "        -S, --classified-out str          File for alignment output(prefix)."
  echo "        -R, --report str                  File for read classification(binning) output(prefix)."
  # The parser only recognises "--output"; the help text must spell it the same way.
  echo "        -o, --output str                  File for abundance output(prefix)."
#   echo "    Other:"
#   echo "        --sample-test                     Sampling nodes(500) are used for small model testing (set for codeocean)."
#   echo "        --test                            Experiment save file."
#   echo "        -qf                               Sylph query and filter test."
  echo ""
  exit 0
}

# Extended help used by the hidden --ht option: prints the help of the
# bundled pantaxr binary (top level and "profile" sub-command) and then the
# script's own help. Never returns, because print_help exits the script.
# Sets the globals conda_run_flag, script_path, script_dir, tools_dir, pantaxr.
function help_test() {
    # The install is treated as a conda install when the resolved launcher
    # path contains "conda".
    if [[ $(command -v "$0") == *conda* ]]; then
        conda_run_flag="True"
    else
        conda_run_flag="False"
    fi
    # set tools working path (quoted so install paths with spaces still work)
    script_path=$(readlink -f "$0")
    script_dir=$(dirname "$script_path")
    if [[ $conda_run_flag == "True" ]]; then
        tools_dir=$script_dir
    else
        tools_dir=$(dirname "$script_dir")/tools
    fi
    pantaxr=$script_dir/pantaxr

    "$pantaxr" -h
    "$pantaxr" profile -h
    print_help
}

# Log the invocation (timestamp, program name and all arguments).
timestamp=$(date "+%Y-%m-%d %H:%M:%S")
echo "$timestamp - $0 $*"

# set default value
version="v2.0.0"
# General
genomes_info=""
pantax_db="pantax_db"
tmp_dir=""
next_for_strain="False"
# Default thread count: number of "processor" entries in /proc/cpuinfo.
# The "|| true" keeps "set -e" from aborting when the file is missing or has
# no such lines; in that case fall back to nproc/getconf, and finally to 1.
threads=$(grep -c ^processor /proc/cpuinfo 2>/dev/null || true)
if ! [[ $threads =~ ^[1-9][0-9]*$ ]]; then
    threads=$(nproc 2>/dev/null || getconf _NPROCESSORS_ONLN 2>/dev/null || echo 1)
fi
verbose="False"
vg=""
debug="False"

# Database creation
create_db="False"
save="False"
lz=false
zstd=false
h5=false
force="False"
pangenome_building_tool="pggb"
parallel_build="True"
default_ani_threshold=99
# Empty means "not given with -A/--ani"; a value is chosen later from the read type.
# Initialised here so a same-named environment variable cannot leak in.
ani_threshold=""
mode="0"

# Index construction
create_index="False"
short_read_best_alignment="False"
long_read_fast_alignment="False"

# Read alignment
short_reads="False"
long_reads="False"
pair_short_read="False"
precise=""
long_read_type=""

# Abundance calculation ("None" marks options the user did not set)
species_level_flag="False"
strain_level_flag="False"
select_mode="2"
parallel_poa="True"
designated_species=None
isfilter=1
species_abund_threshold=0.0001
default_short_unique_trio_nodes_fraction=0.3
default_long_unique_trio_nodes_fraction=0.5
unique_trio_nodes_fraction=None
default_unique_trio_nodes_count=0.46
unique_trio_nodes_count=None
shift_unique_trio_nodes_fraction=None
single_cov_diff=0.2
default_single_cov_ratio=0.85
single_cov_ratio=None
min_cov=0
min_depth=0
solver=gurobi
solver_threads=1

# Experimental / testing switches
sample_test=0
sample=500000
test="False"
query_and_filter="False"


# Directory the script was started from (the script later cd's into its tmp dir).
wd=$(pwd)
interleaved_short_read="False"
input_fq=""
input_fq1=""
input_fq2=""
pantax_report=None
#Print help if no argument specified
if [[ "$1" == "" ]]; then
  print_help
fi

# Abort with the standard message when an option is missing its value.
#   $1 - option name as typed by the user
#   $2 - the word following it (empty when absent)
_require_arg() {
    if [[ -z "$2" ]]; then
        echo "Error: $1 expects an argument"
        exit 1
    fi
}

# get parameters
# Every value-taking option is validated through _require_arg before
# "shift 2"; without the check a trailing option would make "shift 2" fail
# and the script would stop under "set -e" without any message.
while [[ "$1" != "" ]]; do
    case "$1" in
        # ---- general ----
        "-f" | "--genomesInformation") _require_arg "$1" "$2"; genomes_info="$2"; shift 2 ;;
        "-db") _require_arg "$1" "$2"; pantax_db="$2"; shift 2 ;;
        "-T") _require_arg "$1" "$2"; tmp_dir="$2"; shift 2 ;;
        "-n" | "--next") next_for_strain="True"; shift 1 ;;
        "-t" | "--threads") _require_arg "$1" "$2"; threads=$2; shift 2 ;;
        "-v" | "--verbose") verbose="True"; shift 1 ;;
        "--vg") _require_arg "$1" "$2"; vg="$2"; shift 2 ;;
        "--debug") debug="True"; shift 1 ;;
        "--help" | "-h") print_help ;;
        "--ht") help_test ;;
        "--version") echo $version; exit 0 ;;

        # ---- database creation ----
        "--create") create_db="True"; shift 1 ;;
        "-g" | "--save") save="True"; shift 1 ;;
        "--lz") lz=true; shift 1 ;;
        "--zstd") zstd=true; shift 1 ;;
        "--h5") h5=true; shift 1 ;;
        "--force") force="True"; shift 1 ;;
        "-e") _require_arg "$1" "$2"; pangenome_building_tool="$2"; shift 2 ;;
        "--parallel") _require_arg "$1" "$2"; parallel_build=$2; shift 2 ;;
        "-A" | "--ani") _require_arg "$1" "$2"; ani_threshold="$2"; shift 2 ;;
        "--mode") _require_arg "$1" "$2"; mode="$2"; shift 2 ;;
        "--fast") mode="1"; shift 1 ;;

        # ---- index construction ----
        "--index") create_index="True"; shift 1 ;;
        "--best") short_read_best_alignment="True"; shift 1 ;;
        "--fast-aln") long_read_fast_alignment="True"; shift 1 ;;

        # ---- read alignment ----
        "-s" | "--short-read") short_reads="True"; shift 1 ;;
        "-p" | "--paired") pair_short_read="True"; shift 1 ;;
        "-l" | "--long-read") long_reads="True"; shift 1 ;;
        "-r" | "--fastq-in")
            _require_arg "$1" "$2"
            # Without -p seen so far a single file is accepted; once -p has
            # been seen, up to two files fill input_fq1 and then input_fq2.
            if [[ -z $input_fq && $pair_short_read == "False" ]]; then
                input_fq="$2"
            elif [[ $pair_short_read == "True" && ( -z $input_fq1 || -z $input_fq2 ) ]]; then
                if [[ -z $input_fq1 ]]; then
                    input_fq1="$2"
                else
                    input_fq2="$2"
                fi
            else
                echo "Error: Too many -r/--fastq-in arguments"
                exit 1
            fi
            shift 2
            ;;
        "-lt" | "--long-read-type") _require_arg "$1" "$2"; long_read_type="$2"; shift 2 ;;
        "--precise-clipping") _require_arg "$1" "$2"; precise="$2"; shift 2 ;;

        # ---- abundance calculation ----
        "--strain-level" | "--strain") strain_level_flag="True"; shift 1 ;;
        "--species-level" | "--species") species_level_flag="True"; shift 1 ;;
        "--solver") _require_arg "$1" "$2"; solver="$2"; shift 2 ;;
        "-m" | "--select") _require_arg "$1" "$2"; select_mode=$2; shift 2 ;;
        "--parallel-poa") _require_arg "$1" "$2"; parallel_poa=$2; shift 2 ;;
        "-ds" | "--designated_species") _require_arg "$1" "$2"; designated_species="$2"; shift 2 ;;
        "--filter") _require_arg "$1" "$2"; isfilter=$2; shift 2 ;;
        "-a") _require_arg "$1" "$2"; species_abund_threshold="$2"; shift 2 ;;
        "-fr") _require_arg "$1" "$2"; unique_trio_nodes_fraction="$2"; shift 2 ;;
        "-fc") _require_arg "$1" "$2"; unique_trio_nodes_count="$2"; shift 2 ;;
        "-sh" | "--shift") _require_arg "$1" "$2"; shift_unique_trio_nodes_fraction="$2"; shift 2 ;;
        "-sd") _require_arg "$1" "$2"; single_cov_diff="$2"; shift 2 ;;
        "-sr") _require_arg "$1" "$2"; single_cov_ratio="$2"; shift 2 ;;
        "--min_cov") _require_arg "$1" "$2"; min_cov="$2"; shift 2 ;;
        "--min_depth") _require_arg "$1" "$2"; min_depth="$2"; shift 2 ;;
        # Output options take a prefix; the extension is appended here.
        "-S" | "--classified-out") _require_arg "$1" "$2"; read_aln="$2".gaf; shift 2 ;;
        "-R" | "--report") _require_arg "$1" "$2"; pantax_report="$2".tsv; shift 2 ;;
        "-o" | "--output") _require_arg "$1" "$2"; pantax_output=$2; shift 2 ;;
        "-gt") _require_arg "$1" "$2"; solver_threads=$2; shift 2 ;;
        "--sample") _require_arg "$1" "$2"; sample=$2; shift 2 ;;

        # ---- experimental ----
        "--sample-test") sample_test=1; shift 1 ;;
        "--test") test="True"; shift 1 ;;
        "-qf") query_and_filter="True"; shift 1 ;;

        --) shift; break ;;
        *) echo "Error: invalid option \"$1\""; exit 1 ;;
    esac
done

# Make sure the database directory exists. With --force an existing
# directory is removed and recreated empty. Afterwards pantax_db holds the
# absolute path, because the script later changes its working directory.
# Expansions are quoted so a path containing whitespace is neither split
# nor, in the rm -rf case, turned into several unrelated targets.
if [[ ! -d "$pantax_db" ]]; then
    # -p also creates missing parent directories (e.g. -db out/run1/db).
    mkdir -p -- "$pantax_db"
elif [[ $force == "True" ]]; then
    rm -rf -- "$pantax_db"
    mkdir -p -- "$pantax_db"
fi
pantax_db=$(readlink -f -- "$pantax_db")

# Resolve the genomes information file to an absolute path:
#   1. the file given with -f, if it exists;
#   2. otherwise (no -f) the copy stored inside the database directory;
#   3. index-only runs (--index without --create) may proceed without one;
#   4. anything else is a fatal error.
# [[ ]] with quoted operands is used so empty values and paths containing
# whitespace are tested as single words.
if [[ -n "$genomes_info" && -f "$genomes_info" ]]; then
    genomes_info=$(readlink -f "$genomes_info")
elif [[ -z "$genomes_info" && -f "$pantax_db/genomes_info.txt" ]]; then
    genomes_info=$(readlink -f "$pantax_db/genomes_info.txt")
elif [[ $create_db == "False" && $create_index == "True" ]]; then
    :
else
    echo "genomes information file does not exist!"
    exit 1
fi

# Collapse the user-supplied --parallel value to "True"/"False":
# only False, false and off switch parallel building off.
case "$parallel_build" in
    False | false | off) parallel_build="False" ;;
    *) parallel_build="True" ;;
esac

# Reads are required unless this is a pure database/index build; in fast
# mode (mode 1) they are needed for database creation too.
# Outcomes:
#   - single file (no -p)        -> input_fq
#   - two files with -p          -> input_fq1 / input_fq2
#   - one file with -p           -> treated as interleaved, moved to input_fq
# All paths are made absolute because the script later cd's elsewhere.
# Path expansions are quoted so file names with whitespace work.
if [[ ( $mode == "0" && $create_db == "False" && $create_index == "False" ) || ( $mode == "1" && $create_index == "False" ) ]]; then
    if [[ -n "$input_fq" && -f "$input_fq" ]]; then
        input_fq=$(readlink -f "$input_fq")
    elif [[ -n "$input_fq1" && -f "$input_fq1" && -n "$input_fq2" && -f "$input_fq2" ]]; then
        input_fq1=$(readlink -f "$input_fq1")
        input_fq2=$(readlink -f "$input_fq2")
        interleaved_short_read="False"
    elif [[ -n "$input_fq1" && -f "$input_fq1" && -z "$input_fq2" ]]; then
        input_fq=$(readlink -f "$input_fq1")
        input_fq1=""
        interleaved_short_read="True"
    else
        echo "fastq file does not exist!"
        exit 1
    fi
fi

# if [ $isfilter == 0 ] || [ $isfilter == "off" ]; then
#     # isfilter_flag=0
#     isfilter_flag=""
# else 
#     # isfilter_flag=1
#     isfilter_flag="--filtered"
# fi

# For a regular profiling run (not --create, --index or -qf) echo the main
# settings and stop early when no read type or no abundance level was chosen.
if [[ $create_db == "False" && $create_index == "False" && $query_and_filter == "False" ]]; then
    printf '%s\n' \
        "Main parameters settings:" \
        "  threads: $threads" \
        "Database creation:" \
        "  genomes_info: $genomes_info" \
        "  pantax_db: $pantax_db" \
        "Index construction:"
    # Short reads take precedence when both -s and -l were given.
    if [[ $short_reads == "True" ]]; then
        echo "  short_reads: $short_reads"
    elif [[ $long_reads == "True" ]]; then
        echo "  long_reads: $long_reads"
    else
        echo "No read type!"
        exit 1
    fi
    printf '%s\n' \
        "Read alignment:" \
        "  input_fq: $input_fq" \
        "  input_fq1: $input_fq1" \
        "  input_fq2: $input_fq2" \
        "  strain_level_flag: $strain_level_flag" \
        "  species_level_flag: $species_level_flag"
    case "$strain_level_flag/$species_level_flag" in
        False/False)
            echo "No species or strain option specified. Please set --species-level or --strain-level."
            exit 1
            ;;
    esac
fi

# some tools work in different directory when install with conda or from source.
# A conda install is recognised by "conda" appearing in the resolved launcher path.
if [[ $(command -v "$0") == *conda* ]]; then
    conda_run_flag="True"
else
    conda_run_flag="False"
fi
# set tools working path (expansions quoted so install paths with spaces work)
script_path=$(readlink -f "$0")
script_dir=$(dirname "$script_path")
if [[ $conda_run_flag == "True" ]]; then
    tools_dir=$script_dir
else
    tools_dir=$(dirname "$script_dir")/tools
fi
pantaxr=$script_dir/pantaxr
# A value given with --vg is kept as-is, whether or not it names an existing
# file; only when none was given does the bundled copy become the default.
if [[ -z "$vg" ]]; then
    vg=$tools_dir/vg
fi

# tmp
# Choose the working directory for intermediate files:
#   - no -T:                 <basename of pantax_db>_tmp in the current directory
#   - -T names an existing dir: <that dir>/<basename of pantax_db>_tmp
#   - -T names a missing path:  that path itself
if [[ -z "$tmp_dir" ]]; then
    tmp_dir=$(basename -- "$pantax_db")_tmp
elif [[ -d "$tmp_dir" ]]; then
    tmp_dir=$tmp_dir/$(basename -- "$pantax_db")_tmp
fi
# A fresh run (neither --next nor --debug) starts from an empty tmp dir.
if [[ $next_for_strain == "False" && $debug == "False" ]]; then
    rm -rf -- "${tmp_dir:?}"
fi
mkdir -p -- "$tmp_dir"
# Make the path absolute in every case. The script cd's into the directory
# and keeps using "$tmp_dir/..." afterwards, so a relative -T value would
# otherwise resolve to a nested path inside the tmp dir itself.
tmp_dir=$(readlink -f -- "$tmp_dir")
cd "$tmp_dir"

# ERR-trap handler. Unless the tmp directory must be kept (--next or
# --debug), report the failure and delete the tmp directory; otherwise only
# report the failure. Reads the globals next_for_strain, debug and tmp_dir.
cleanup() {
    if [[ $next_for_strain == "False" && $debug == "False" ]]; then
        echo "$(date "+%Y-%m-%d %H:%M:%S") - PanTax encountered an exception while running and has deleted the tmp folder."
        echo "If you need to start checking not to delete the tmp directory, please add the --debug option."
        # Quoted: an unquoted path containing whitespace would be split into
        # several rm -rf targets, some of them unrelated to the tmp dir.
        rm -rf -- "$tmp_dir"
    else
        echo "$(date "+%Y-%m-%d %H:%M:%S") - PanTax encountered an exception."
    fi
}
# The handler is only installed when --debug is off.
if [[ $debug == "False" ]]; then
    trap cleanup ERR
fi

# ---------------------------- Building reference pangenome ----------------------------
# The pangenome is (re)built whenever the database has no non-empty
# reference_pangenome.gfa.
if [[ ! -s "$pantax_db/reference_pangenome.gfa" ]]; then
    # backup genomes_info for each custom database
    # genomes_info may already BE one of these files (it is taken from the
    # database copy when -f is omitted); cp refuses to copy a file onto
    # itself and would abort the run under "set -e", so skip that case.
    for backup_copy in "$pantax_db/ori_genomes_info.txt" "$pantax_db/genomes_info.txt"; do
        if [[ ! "$genomes_info" -ef "$backup_copy" ]]; then
            cp -f "$genomes_info" "$backup_copy"
        fi
    done
    reconstruction_flag="True"
else
    reconstruction_flag="False"
fi

# Informational only: an index was requested but no graph file is present.
if [[ ! -f "$pantax_db/reference_pangenome.gfa" && $create_index == "True" ]]; then
    echo "$pantax_db/reference_pangenome.gfa does not exist."
fi

# Build the reference pangenome and move the results into the database
# directory. Runs only when no usable reference_pangenome.gfa was found.
# The current directory is the tmp dir, so bare file names below live there.
if [ $reconstruction_flag == "True" ]; then
    # --fast (mode 1) or -qf: run sylph against the reads and keep only the
    # genomes selected by get_filter_genomes_info.py.
    if [ $mode == "1" ] || [ $query_and_filter == "True" ]; then
        if [ -z $ani_threshold ]; then
            # No -A given: ONT read types use 85, everything else the default.
            case "${long_read_type,,}" in
                ontr9 | ontr10) ani_threshold=85 ;;
                *) ani_threshold=$default_ani_threshold ;;
            esac
        fi
        echo "$(date "+%Y-%m-%d %H:%M:%S") - Sylph query and filter with ANI threshold $ani_threshold..."
        mkdir -p $tmp_dir/sylph_db
        if [ ! -s $tmp_dir/sylph_db/sylph_done.txt ]; then
            # Column 5 of genomes_info (header skipped) is handed to sylph as the genome list.
            awk -F '\t' 'NR > 1 {print $5}' $genomes_info > $tmp_dir/sylph_db/reference_genomes.txt
            sylph sketch -l $tmp_dir/sylph_db/reference_genomes.txt -o $tmp_dir/sylph_db/database
            if [ ! -z $input_fq ] && [ -f $input_fq ] && [ $interleaved_short_read == "False" ]; then
                sylph query $tmp_dir/sylph_db/database.syldb $input_fq -o $tmp_dir/sylph_db/query_result
            elif [ ! -z $input_fq ] && [ -f $input_fq ] && [ $interleaved_short_read == "True" ]; then
                # De-interleave: every 8 lines hold one pair; lines 1-4 go to read1, 5-8 to read2.
                if [[ $input_fq == *.gz ]]; then
                    zcat $input_fq | paste - - - - - - - - | tee >(cut -f 1-4 | tr "\t" "\n" > $tmp_dir/sylph_db/read1.fq) | cut -f 5-8 | tr "\t" "\n" > $tmp_dir/sylph_db/read2.fq
                else
                    cat $input_fq | paste - - - - - - - - | tee >(cut -f 1-4 | tr "\t" "\n" > $tmp_dir/sylph_db/read1.fq) | cut -f 5-8 | tr "\t" "\n" > $tmp_dir/sylph_db/read2.fq
                fi
                sylph query $tmp_dir/sylph_db/database.syldb -1 $tmp_dir/sylph_db/read1.fq -2 $tmp_dir/sylph_db/read2.fq -o $tmp_dir/sylph_db/query_result
                rm $tmp_dir/sylph_db/read1.fq $tmp_dir/sylph_db/read2.fq
            elif [ -f $input_fq1 ] && [ -f $input_fq2 ]; then
                # Second mate must be input_fq2 (the first mate used to be passed twice).
                sylph query $tmp_dir/sylph_db/database.syldb -1 $input_fq1 -2 $input_fq2 -o $tmp_dir/sylph_db/query_result
            fi
            python $script_dir/get_filter_genomes_info.py $genomes_info $tmp_dir/sylph_db/query_result $ani_threshold $tmp_dir/sylph_db
            cp -f $tmp_dir/sylph_db/filter_genomes_info.txt $pantax_db/genomes_info.txt
            genomes_info=$pantax_db/genomes_info.txt
            echo "done" > $tmp_dir/sylph_db/sylph_done.txt
            echo "$(date "+%Y-%m-%d %H:%M:%S") - Sylph query and filter with $ani_threshold...done"
        else
            echo "Sylph query and filter with $ani_threshold has been done. Skipping."
            # On a resumed run the unfiltered list has just been copied into
            # the database again; re-apply the stored filter result so the
            # build below uses the filtered genomes, as on the first run.
            if [ -s $tmp_dir/sylph_db/filter_genomes_info.txt ]; then
                cp -f $tmp_dir/sylph_db/filter_genomes_info.txt $pantax_db/genomes_info.txt
                genomes_info=$pantax_db/genomes_info.txt
            fi
        fi
        # -qf stops after the filter step.
        if [ $query_and_filter == "True" ]; then
            if [ $debug == "False" ]; then
                rm -rf $tmp_dir
            fi
            exit 0
        fi
    fi
    echo "$(date "+%Y-%m-%d %H:%M:%S") - Building reference pangenome ..."
    # Prepare genomes information. Return species_pangenome directory and species_eq1_genomes.txt
    # Species(in species_pangenome directory) more than 2 genomes will be used to build pangenome firstly.
    python $script_dir/prepare_building.py $genomes_info $wd $tmp_dir
    gfa_build_dir=$tmp_dir/gfa_build
    # first step: build pangenome for the species with more than two genomes
    # Translate the --lz / --zstd / --save switches into command-line flags.
    if [ $lz = true ]; then
        lz_flag="--lz"
    else
        lz_flag=""
    fi
    if [ $zstd = true ]; then
        zstd_flag="--zstd"
    else
        zstd_flag=""
    fi
    # Plain serialisation is used when --save is given without a compression choice.
    if [ $save == "True" ] && [ $lz = false ] && [ $zstd = false ]; then
        serialize_flag="--serialize"
    else
        serialize_flag=""
    fi
    python $script_dir/task_scheduling.py $gfa_build_dir $tmp_dir/species_pangenome $pantaxr $vg -t $threads -v $verbose -f $force -e $pangenome_building_tool -d $debug -p $parallel_build -g $save -o $tmp_dir/species_range.txt $lz_flag $zstd_flag

    # second step: build variation graph for the species with one genomes
    $pantaxr build -i species_eq1_genomes.txt -w $gfa_build_dir
    if [ $debug == "False" ]; then
        rm -rf $tmp_dir/species_pangenome $tmp_dir/species_eq1_genomes.txt
    fi
    # Half of the threads are used for the per-graph conversions. xargs
    # treats "-P 0" as "no limit", so keep at least one job.
    convert_jobs=$(($threads/2))
    if [ $convert_jobs -lt 1 ]; then
        convert_jobs=1
    fi
    if [ $save == "True" ]; then
        # cat $gfa_build_dir/gfa_build_done.txt | xargs -I {} -P $(($threads/2)) bash -c 'vg=$1; gfa_build_dir=$2; script_dir=$3; input=$4; name=$(basename $input .gfa); $vg convert -g $input -p > $gfa_build_dir/$name.vg; python $script_dir/get_h5_from_gfa.py $input $gfa_build_dir/$name.h5' _ $vg $gfa_build_dir $script_dir {}
        # All three format flags (previously --zstd was dropped and
        # --serialize listed twice). The string is passed as ONE quoted
        # argument so it always lands in $5 of the inner script, which then
        # word-splits it into the individual flags.
        zip_options="$serialize_flag $lz_flag $zstd_flag"
        cat $gfa_build_dir/gfa_build_done.txt | xargs -I {} -P $convert_jobs bash -c 'vg=$1; tmp_dir=$2; script_dir=$3; pantaxr=$4; zip_options=$5; input=$6; name=$(basename $input .gfa); $vg convert -g $input -p > $tmp_dir/gfa_build/$name.vg; $pantaxr zip -i $input -o $tmp_dir/gfa_build2/ -f $tmp_dir/species_range.txt $zip_options' _ $vg $tmp_dir $script_dir $pantaxr "$zip_options" {}
    elif [ $save == "False" ]; then
        cat $gfa_build_dir/gfa_build_done.txt | xargs -I {} -P $convert_jobs bash -c 'vg=$1; gfa_build_dir=$2; input=$3; name=$(basename $input .gfa); $vg convert -g $input -p > $gfa_build_dir/$name.vg' _ $vg $gfa_build_dir {}
    fi

    # Strip directory and extension from each listed path to get the graph names.
    sed 's:.*/::; s/\.[^.]*$//' $gfa_build_dir/gfa_build_done.txt > $gfa_build_dir/pangenome_eq1.txt
    echo "$(date "+%Y-%m-%d %H:%M:%S") - Completed all pangenome building commands for the species with only one genomes."
    # vg_files=$(ls $gfa_build_dir/*.vg)
    # Collect the .vg files in a fixed order: names from finished_pangenome.txt
    # first, then names from pangenome_eq1.txt.
    vg_files=""
    if [ -s $gfa_build_dir/finished_pangenome.txt ] && [ -n "$(grep -v '^[[:space:]]*$' $gfa_build_dir/finished_pangenome.txt)" ]; then
        for file_name in $(cat $gfa_build_dir/finished_pangenome.txt); do
            vg_files="$vg_files $gfa_build_dir/$file_name.vg"
        done
    fi
    if [ -s $gfa_build_dir/pangenome_eq1.txt ] && [ -n "$(grep -v '^[[:space:]]*$' $gfa_build_dir/pangenome_eq1.txt)" ]; then
        while IFS=$'\t' read -r col1 col2; do
            vg_files="$vg_files $gfa_build_dir/$col1.vg"
        done < "$gfa_build_dir/pangenome_eq1.txt"
    fi
    vg_files=${vg_files# }
    echo $vg_files > vg_files.txt
    $vg combine -p $vg_files > reference_pangenome.vg
    $vg convert -f reference_pangenome.vg > reference_pangenome.gfa

    echo "$(date "+%Y-%m-%d %H:%M:%S") - Obtain the mapping information of graph and species..."
    if [ $save == "False" ]; then
        $pantaxr range --count_only --pangenome_vg_files vg_files.txt --pangenome_ge2 $gfa_build_dir/finished_pangenome.txt --pangenome_eq1 $gfa_build_dir/pangenome_eq1.txt --vg $vg --range $tmp_dir/species_range.txt -t $threads
    else
        $pantaxr range --pangenome_vg_files vg_files.txt --range $tmp_dir/species_range.txt
    fi
    # python $script_dir/otu_range.py vg_files.txt $vg $gfa_build_dir/finished_pangenome.txt $gfa_build_dir/pangenome_eq1.txt -t $threads
    # python $script_dir/otu_range.py reference_pangenome.gfa $genomes_info $gfa_build_dir/finished_pangenome.txt $gfa_build_dir/pangenome_eq1.txt --species-level -t $threads
    echo "$(date "+%Y-%m-%d %H:%M:%S") - Obtain the mapping information of graph and species...done"

    if [ $debug == "False" ]; then
        rm reference_pangenome.vg $vg_files
    fi
    # extract gfa files for strain abundance computing
    if [ $save == "True" ]; then
        # ${gfa_build_dir}2 is the gfa_build2 directory written by "pantaxr zip" above.
        mv ${gfa_build_dir}2 $pantax_db/species_graph_info
        mv reference_pangenome* species_range.txt $pantax_db
    elif [ $save == "False" ]; then
        mkdir -p $tmp_dir/species_gfa
        find $gfa_build_dir -name "*gfa" -exec bash -c 'input=$1; tmp_dir=$2; name=$(basename $input); mv $input $tmp_dir/species_gfa/' _ {} $tmp_dir \;
        mv $tmp_dir/species_gfa reference_pangenome* species_range.txt $pantax_db
    fi
    if [ ! -f $pantax_db/species_genomes_stats.txt ]; then
        $pantaxr stat -g $pantax_db/genomes_info.txt -t $threads -o $pantax_db/species_genomes_stats.txt -w $wd
    fi
    echo "$(date "+%Y-%m-%d %H:%M:%S") - Building reference pangenome completely."
else
    echo "$(date "+%Y-%m-%d %H:%M:%S") - Reference_pangenome.gfa exists. Skipping."
fi

# for experiment format modify, this is not important
# Runs only when the optional helper script ships next to this script.
if [ -f $script_dir/mark_pangenome_eq1.py ]; then
    if [ ! -f $pantax_db/genomes_info.txt ] || [ ! -f $pantax_db/species_range.txt ]; then
        echo "$pantax_db/genomes_info.txt or $pantax_db/species_range.txt does not exist."
        exit 0
    fi
    python $script_dir/mark_pangenome_eq1.py $pantax_db/genomes_info.txt $pantax_db/species_range.txt
fi
# Create the per-species genome statistics once per database.
[ -f $pantax_db/species_genomes_stats.txt ] || $pantaxr stat -g $pantax_db/genomes_info.txt -t $threads -o $pantax_db/species_genomes_stats.txt -w $wd

# --create without --index: the database is ready, so leave the tmp dir,
# remove it unless --debug is set, and stop.
case "$create_db:$create_index" in
    True:False)
        cd ..
        if [[ $debug == "False" ]]; then
            rm -rf $tmp_dir
        fi
        exit 0
        ;;
esac

# --index: build the vg giraffe index files in the tmp dir and move them into
# the database, unless reference_pangenome.min is already there.
if [ $create_index == "True" ] && [ ! -e $pantax_db/reference_pangenome.min ]; then
    echo "$(date "+%Y-%m-%d %H:%M:%S") - Construct index...."
    if [ $short_read_best_alignment == "False" ]; then
        # # these commands all can show progress with option -p
        $vg gbwt -g reference_pangenome.giraffe.gbz --gbz-format -G $pantax_db/reference_pangenome.gfa --num-jobs $threads
        $vg index -j reference_pangenome.dist reference_pangenome.giraffe.gbz -t $threads
        $vg minimizer -d reference_pangenome.dist -o reference_pangenome.min reference_pangenome.giraffe.gbz
    else
        # --best: let vg autoindex produce the giraffe index files.
        $vg autoindex -p reference_pangenome -w giraffe -g $pantax_db/reference_pangenome.gfa -t $threads
    fi
    mv reference_pangenome* $pantax_db
    echo "$(date "+%Y-%m-%d %H:%M:%S") - Construct index.... done"
elif [ $short_reads == "True" ] && [ -e $pantax_db/reference_pangenome.min ]; then
    # Only claim the index exists when it really does; previously this line
    # was also printed on runs where no index had been built yet.
    echo "$(date "+%Y-%m-%d %H:%M:%S") - Giraffe index exists. Skipping."
fi

# --index is a build-only mode: leave the tmp dir, remove it unless --debug
# is set, and stop before any alignment.
case "$create_index" in
    True)
        cd ..
        if [[ $debug == "False" ]]; then
            rm -rf $tmp_dir
        fi
        exit 0
        ;;
esac

# ---------------------------- Reads alignment ----------------------------
# short reads alignment(the latest vg giraffe can be used for long reads)
# Runs when species- or strain-level profiling was requested and the reads are
# short reads, or long reads with --fast-aln. The alignment is written to
# gfa_mapped.gaf in the current (tmp) directory.
if [[ ( $species_level_flag == "True" || $strain_level_flag == "True" ) && ( $short_reads == "True" || $long_read_fast_alignment == "True" ) ]]; then
    echo "$(date "+%Y-%m-%d %H:%M:%S") - Reads alignment with vg giraffe."    
    # this step produce three index files: reference_pangenome.giraffe.gbz, reference_pangenome.dist, reference_pangenome.min.
    # default mode(quick)
    # The index is (re)built after a fresh pangenome build or when the .min file is missing.
    if [ $reconstruction_flag == "True" ] || [ ! -e $pantax_db/reference_pangenome.min ]; then
        echo "$(date "+%Y-%m-%d %H:%M:%S") - Construct index...."
        # NOTE(review): if reconstruction_flag is "True" while the .min file already
        # exists, this test is false and the autoindex branch runs even without
        # --best - confirm that is intended.
        if [ $short_read_best_alignment == "False" ] && [ ! -e $pantax_db/reference_pangenome.min ]; then
            # # these commands all can show progress with option -p
            $vg gbwt -g reference_pangenome.giraffe.gbz --gbz-format -G $pantax_db/reference_pangenome.gfa --num-jobs $threads
            $vg index -j reference_pangenome.dist reference_pangenome.giraffe.gbz -t $threads
            $vg minimizer -d reference_pangenome.dist -o reference_pangenome.min reference_pangenome.giraffe.gbz
        else
            $vg autoindex -p reference_pangenome -w giraffe -g $pantax_db/reference_pangenome.gfa -t $threads
        fi
        # Index files are created in the tmp dir and then moved into the database.
        mv reference_pangenome* $pantax_db
        echo "$(date "+%Y-%m-%d %H:%M:%S") - Construct index.... done"
    fi
    echo "$(date "+%Y-%m-%d %H:%M:%S") - vg giraffe mapping..."
    # A non-empty gfa_mapped.gaf left in the tmp dir is reused instead of re-aligning.
    if [ ! -s gfa_mapped.gaf ]; then
        # Three input layouts: interleaved pairs in one file (-i), two mate files, or single-end.
        if [ $pair_short_read == "True" ] && [ $interleaved_short_read == "True" ]; then
            $vg giraffe -Z $pantax_db/reference_pangenome.giraffe.gbz -m $pantax_db/reference_pangenome.min -d $pantax_db/reference_pangenome.dist -i -f $input_fq -t $threads --named-coordinates -o gaf > gfa_mapped.gaf
        elif [ $pair_short_read == "True" ] && [ $interleaved_short_read == "False" ]; then
            $vg giraffe -Z $pantax_db/reference_pangenome.giraffe.gbz -m $pantax_db/reference_pangenome.min -d $pantax_db/reference_pangenome.dist -f $input_fq1 -f $input_fq2 -t $threads --named-coordinates -o gaf > gfa_mapped.gaf
        elif [ $pair_short_read == "False" ]; then
            $vg giraffe -Z $pantax_db/reference_pangenome.giraffe.gbz -m $pantax_db/reference_pangenome.min -d $pantax_db/reference_pangenome.dist -f $input_fq -t $threads --named-coordinates -o gaf > gfa_mapped.gaf
        fi
        echo "$(date "+%Y-%m-%d %H:%M:%S") - vg giraffe mapping... done."
    else
        echo "$(date "+%Y-%m-%d %H:%M:%S") - gfa_mapped.gaf exists... Skipping."
    fi 
fi

# long reads alignment
# Align long reads with GraphAligner (skipped when --fast-aln selects vg
# giraffe instead), then filter the alignments with "pantaxr filter".
# NOTE(review): unlike the giraffe branch this only runs when --species-level
# is set; a strain-only long-read run relies on an existing gfa_mapped.gaf -
# confirm that is intended.
if [ $species_level_flag == "True" ] && [ $long_reads == "True" ] && [ $long_read_fast_alignment == "False" ]; then
    echo "$(date "+%Y-%m-%d %H:%M:%S") - Reads alignment with Graphaligner..."
    # An explicit --precise-clipping always wins. Otherwise derive the value
    # from the read type; no read type means the documented default of 0.66.
    if [[ -z "$precise" ]]; then
        case "${long_read_type,,}" in
            "") precise=0.66 ;;
            hifi) precise=0.9 ;;
            clr | ontr9) precise=0.75 ;;
            ontr10) precise=0.8 ;;
            *)
                # An unrecognised type used to leave "precise" empty, which
                # handed GraphAligner "--precise-clipping" without a value.
                echo "Warning: unknown long read type \"$long_read_type\", using --precise-clipping 0.66" >&2
                precise=0.66
                ;;
        esac
    fi
    # A non-empty gfa_mapped.gaf left in the tmp dir is reused instead of re-aligning.
    if [ ! -s gfa_mapped.gaf ]; then
        GraphAligner -g $pantax_db/reference_pangenome.gfa -f $input_fq -a gfa_mapped.gaf -x vg -t $threads --precise-clipping $precise
        # GraphAligner -g $pantax_db/reference_pangenome.gfa -f $input_fq -a gfa_mapped.gaf -x vg -t $threads --bandwidth 15 --multimap-score-fraction 0.99 --precise-clipping 0.66 --min-alignment-score 100 --clip-ambiguous-ends 100 --overlap-incompatible-cutoff 0.15 --max-trace-count 5 --mem-index-no-wavelet-tree
        $pantaxr filter -m gfa_mapped.gaf -t $threads
        # The filtered output replaces the raw alignment file.
        mv gfa_mapped_filtered.gaf gfa_mapped.gaf
        echo "$(date "+%Y-%m-%d %H:%M:%S") - Graphaligner mapping... done."
    else
        echo "$(date "+%Y-%m-%d %H:%M:%S") - gfa_mapped.gaf exists... Skipping."
    fi
fi


# Translate the parsed option values into the switches that are later placed
# on the profiling command line. An empty string means "switch not passed".

# Filtering is on unless it was disabled with 0 or "off".
case "$isfilter" in
    0 | off) isfilter_flag="" ;;
    *)       isfilter_flag="--filtered" ;;
esac

# sample_test is rewritten in place: the value 1 becomes the switch itself.
case "$sample_test" in
    1) sample_test="--sample_test" ;;
    *) sample_test="" ;;
esac

# When the run is resumable or in debug mode, give the read classification
# report a concrete file name if none was chosen.
if [[ $pantax_report == "None" ]] && [[ $next_for_strain == "True" || $debug == "True" ]]; then
    pantax_report="reads_classification.tsv"
fi

# Plain on/off switches: start empty, set when the option is "True".
species_flag=""
if [[ $species_level_flag == "True" ]]; then
    species_flag="--species"
fi

strain_flag=""
if [[ $strain_level_flag == "True" ]]; then
    strain_flag="--strain"
fi

debug_flag=""
if [[ $debug == "True" ]]; then
    debug_flag="--debug"
fi

# Graph saving: exactly one storage format may be selected; any other
# combination (including saving disabled) yields no switch.
case "$save:$lz:$zstd:$h5" in
    True:false:false:false) save_flag="--zip serialize" ;;
    True:true:false:false)  save_flag="--zip lz" ;;
    True:false:true:false)  save_flag="--zip zstd" ;;
    True:false:false:true)  save_flag="--zip h5" ;;
    *)                      save_flag="" ;;
esac

# strain paras

# Count the lines of a text file, including a last line that has no trailing
# newline (which `wc -l` does not count).
# Arguments: $1 - path to the file
# Outputs:   the number of lines on stdout
# Returns:   non-zero when the file cannot be opened
count_file_lines() {
    # Read through a redirect so awk never interprets the path as an argument.
    awk 'END { print NR }' < "$1"
}

# Number of lines in the database's species_range.txt; a count of 1 selects
# the dedicated defaults below.
line_count=$(count_file_lines "$pantax_db/species_range.txt")

# Every parameter still holding the placeholder "None" receives a default.
if [[ "$unique_trio_nodes_fraction" == "None" ]]; then
    if [[ "$line_count" -eq 1 ]]; then
        unique_trio_nodes_fraction=0.43
    elif [[ "$short_reads" == "True" ]]; then
        unique_trio_nodes_fraction=$default_short_unique_trio_nodes_fraction
    elif [[ "$long_reads" == "True" ]]; then
        unique_trio_nodes_fraction=$default_long_unique_trio_nodes_fraction
    fi
    # If neither read type is "True" the value stays "None".
fi

# The count and coverage-ratio defaults do not depend on line_count.
# unique_trio_nodes_count=0.44 (alternative single-line value, not used)
if [[ "$unique_trio_nodes_count" == "None" ]]; then
    unique_trio_nodes_count=$default_unique_trio_nodes_count
fi

# single_cov_ratio=0.90 (alternative single-line value, not used)
if [[ "$single_cov_ratio" == "None" ]]; then
    single_cov_ratio=$default_single_cov_ratio
fi

# Turn the shift option into the switch itself: "None" enables it only when
# line_count is 1; an explicit true/false decides directly. Any other value is
# passed on unchanged.
case "$shift_unique_trio_nodes_fraction" in
    None)
        if [[ "$line_count" -eq 1 ]]; then
            shift_unique_trio_nodes_fraction="--shift"
        else
            shift_unique_trio_nodes_fraction=""
        fi
        ;;
    False | false)
        shift_unique_trio_nodes_fraction=""
        ;;
    True | true)
        shift_unique_trio_nodes_fraction="--shift"
        ;;
esac

# profiling opt and cmd

# Assemble the arguments of `pantaxr profile` into the global array
# pantaxr_opt from the option variables set earlier in the script.
# Globals:   pantaxr_opt (written); the option variables listed below (read)
# Arguments: none
#
# Value-carrying options are quoted so that a value containing whitespace
# stays a single argument and an empty value cannot make the following option
# slide into its place. The *_flag / switch variables are deliberately left
# unquoted: each is either empty (contributes nothing) or holds one or more
# words, e.g. "--zip lz".
build_pantaxr_opt() {
    pantaxr_opt=(
        -d "$pantax_db" -w ./ -m gfa_mapped.gaf
        $isfilter_flag
        --min_species_abundance "$species_abund_threshold"
        -f "$unique_trio_nodes_fraction"
        -r "$unique_trio_nodes_count"
        --scr "$single_cov_ratio"
        --scd "$single_cov_diff"
        --mmcov 0
        -c "$min_cov"
        --min_depth "$min_depth"
        --binning_out "$pantax_report"
        --sample "$sample"
        --ds "$designated_species"
        --mode "$select_mode"
        --full
        -o ./
        --solver "$solver"
        --gthreads "$solver_threads"
        -t "$threads"
        $shift_unique_trio_nodes_fraction $sample_test $save_flag $species_flag $strain_flag $debug_flag
    )
}

build_pantaxr_opt
# $pantaxr is left unquoted, exactly as it is used elsewhere in the script.
$pantaxr profile "${pantaxr_opt[@]}"

# move and rename files
# Leave the current directory for its parent, then copy the abundance tables
# out of $tmp_dir: to "<pantax_output>_<level>_abundance.txt" when an output
# prefix was given, otherwise into $wd under their original names.
# All paths are quoted so that names containing whitespace are handled as a
# single operand; an unquoted prefix with a space would make the emptiness
# test fail and wrongly select the "no prefix" branch.
cd ..
if [[ "$species_level_flag" == "True" ]]; then
    if [[ -n "$pantax_output" ]]; then
        specie_abund_file="${pantax_output}_species_abundance.txt"
        cp -- "$tmp_dir/species_abundance.txt" "$specie_abund_file"
    else
        cp -- "$tmp_dir/species_abundance.txt" "$wd"
    fi
fi
if [[ "$strain_level_flag" == "True" ]]; then
    if [[ -n "$pantax_output" ]]; then
        strain_abund_file="${pantax_output}_strain_abundance.txt"
        cp -- "$tmp_dir/strain_abundance.txt" "$strain_abund_file"
    else
        cp -- "$tmp_dir/strain_abundance.txt" "$wd"
        # This check is only reached when no output prefix was given.
        # NOTE(review): confirm whether ori_strain_abundance.txt should also
        # be copied when $pantax_output is set.
        if [[ "$test" == "True" ]]; then
            cp -- "$tmp_dir/ori_strain_abundance.txt" "$wd"
        fi
    fi
fi

# Final clean-up. It runs when strain_level_flag and next_for_strain are both
# "True", or when species_level_flag is "True" and next_for_strain is "False".
# In every other case $tmp_dir is left untouched.
if [[ ( "$strain_level_flag" == "True" && "$next_for_strain" == "True" ) || ( "$species_level_flag" == "True" && "$next_for_strain" == "False" ) ]]; then
    # Hand the alignment file over to the requested location, if one was given.
    if [[ -n "$read_aln" ]]; then
        mv -- "$tmp_dir/gfa_mapped.gaf" "$read_aln"
    fi

    # Outside debug mode, move the report out (when it exists) and remove the
    # temporary directory.
    if [[ "$debug" == "False" ]]; then
        if [[ -f "$tmp_dir/pantax_report.tsv" ]]; then
            mv -- "$tmp_dir/pantax_report.tsv" "$pantax_report"
        fi
        # Quoted so that a directory name with whitespace is removed as one
        # path instead of being split into several unrelated ones; an empty
        # name removes nothing.
        if [[ -n "$tmp_dir" ]]; then
            rm -rf -- "$tmp_dir"
        fi
    fi
fi
echo "$(date "+%Y-%m-%d %H:%M:%S") - PanTax Done."
