#!/bin/bash

set -e
# Print the usage text to stdout and terminate the script with status 0.
# Takes no arguments. Every option documented here is one the argument
# parser of this script accepts.
function print_help() {
  # Quoted delimiter: the text is emitted literally, so the "\t" sequences in
  # the custom-file format line are shown to the user as written.
  cat <<'EOF'

Strain-level taxonomic classification of metagenomic data using pangenome graphs(data preprocessing)

Author: Wenhai Zhang
Date:   May 2024

    General options:
        --help, -h                        Print this help message.
        -r str                            Genomes directory.
        -o str                            Output genomes database directory.
        -s str                            Assembly summary file path.
        -g str                            Output genomes information file.
        --custom, -c str                  Specify custom genomes database(use without -r -s). A file format: genomeID\tstrain_taxid\tspecies_taxid\torganism_name\tid.
    Three main functions:
        --remove                          Whether to remove plasmid.
        --compute                         ANI Estimation for species cluster. (for graph-based clustering)
        --cluster                         Whether to cluster and remove redundancy. (for graph-based clustering)
    Genomes choice options:
        -sc, --species_cluster str        Specified process species(multi with ',' join, eg. 562,1282).
        -l, --genome-assembly-lvl str     Genome assembly level.(all/complete).
        -rl int                           Scaffold length setting handed to genomes_process.py as -rl. (default:0)
    Cluster options:
        --cluster-method str              Clustering method. Option: graph, hcls. (default: graph).
                                          The hcls method is currently only applicable to a single species.
        --hcls-method str                 Hierarchical clustering method. Option: single/complete/average. (default:complete)
            -e, --cutoff float            Hierarchical clustering cutoff threshold. (default:99)
            --matrix file                 ANI matrix file from FastANI.
            -mf, --matrix-filter          Filter matrix if genomes in matrix not all in genomes information file.
        -m int                            Max genomes number used to cluster every species. (max:-1, default:100)
        -n int                            Max genomes number used to build pangenome every species. (max:-1, default:10 for graph / all for hcls)
        -p int                            Number of parallel processes used for ANI calculation. (default:2)
        -j int                            Number of parallel processes used for fastANI. (default:32)
    GTDB options:
        --db str                          Database source(default: None, rs: RefSeq, gtdb: GTDB).
        --gm str                          GTDB metadata file(Specify that this file will default to the GTDB database).
EOF
  exit 0
}

# ---- Default settings; the command-line parser below may override them ----
# The literal string "None" is what gets forwarded when an option is not given.

# Input / output locations
genomes_database="None"
custom_db="None"
summary_file="assembly_summary_bacteria.txt"
output_database="db"

# The three main actions, all off by default
remove="False" compute="False" cluster="False"

# Genome-count limits and parallelism
cluster_max_genome="100"
pan_max_genome="10"
species_cal_parllel_num="2"
fastani_process="32"

# Clustering method and the hierarchical-clustering (hcls) settings
cluster_method="graph"
hcls_method="complete"
hcls_cutoff="99"
hcls_genome_num="None"
matrix_file="None"
matrix_filter="False"

# Database source and genome selection
db="None"
gtdb_metadata="None"
genome_asm_lvl="all"
species_cluster="None"

# Value forwarded to genomes_process.py as -rl
remove_scaffold_len="0"

# Record the start time and echo the full command line for the log.
timestamp="$(date '+%Y-%m-%d %H:%M:%S')"
echo "$timestamp - $0 $*"

# No first argument (or an empty one): show the usage text instead.
[[ -n "$1" ]] || print_help

# Abort with a usage error when a value-taking option has no value.
# Arguments: $1 - the option as typed; $2 - the word following it (may be
#            absent or empty, both count as "missing").
# Exits with status 1 after printing the error on stderr.
require_option_value() {
    if [[ -z "${2-}" ]]; then
        echo "Error: $1 expects an argument" >&2
        exit 1
    fi
}

# Parse the command line into the global option variables.
# Arguments: the script's arguments ("$@").
# Globals written: genomes_database, output_database, summary_file,
#   genomes_info, custom_db, remove, compute, cluster, cluster_method,
#   hcls_method, hcls_cutoff, matrix_file, matrix_filter, cluster_max_genome,
#   pan_max_genome, hcls_genome_num, species_cal_parllel_num, fastani_process,
#   db, gtdb_metadata, species_cluster, genome_asm_lvl, remove_scaffold_len.
# Exits with status 1 on an unknown option or a missing option value;
# --help/-h hands over to print_help.
parse_args() {
    # Parsing stops at the first empty argument, as it always has.
    while [[ -n "${1-}" ]]; do
        case "$1" in
            --help | -h)
                print_help
                ;;
            -r)
                require_option_value "$@"; genomes_database="$2"; shift 2 ;;
            -o)
                require_option_value "$@"; output_database="$2"; shift 2 ;;
            -s)
                require_option_value "$@"; summary_file="$2"; shift 2 ;;
            -g)
                # NOTE(review): genomes_info is stored here but not read by any
                # code visible in this script - confirm whether -g is still needed.
                require_option_value "$@"; genomes_info="$2"; shift 2 ;;
            -c | --custom)
                require_option_value "$@"; custom_db="$2"; shift 2 ;;
            --remove)
                remove="True"; shift ;;
            --compute)
                compute="True"; shift ;;
            --cluster)
                cluster="True"; shift ;;
            --cluster-method)
                require_option_value "$@"; cluster_method="$2"; shift 2 ;;
            --hcls-method)
                require_option_value "$@"; hcls_method="$2"; shift 2 ;;
            -e | --cutoff)
                require_option_value "$@"; hcls_cutoff="$2"; shift 2 ;;
            --matrix)
                require_option_value "$@"; matrix_file="$2"; shift 2 ;;
            -mf | --matrix-filter)
                matrix_filter="True"; shift ;;
            -m)
                require_option_value "$@"; cluster_max_genome="$2"; shift 2 ;;
            -n)
                # One option feeds both the graph limit and the hcls limit.
                require_option_value "$@"
                pan_max_genome="$2"
                hcls_genome_num="$2"
                shift 2
                ;;
            -p)
                require_option_value "$@"; species_cal_parllel_num="$2"; shift 2 ;;
            -j)
                require_option_value "$@"; fastani_process="$2"; shift 2 ;;
            --db)
                require_option_value "$@"; db="$2"; shift 2 ;;
            --gm)
                # Supplying GTDB metadata also selects the GTDB database.
                require_option_value "$@"
                gtdb_metadata="$2"
                db="gtdb"
                shift 2
                ;;
            -sc | --species_cluster)
                require_option_value "$@"; species_cluster="$2"; shift 2 ;;
            -l | --genome-assembly-lvl)
                require_option_value "$@"; genome_asm_lvl="$2"; shift 2 ;;
            -rl)
                require_option_value "$@"; remove_scaffold_len="$2"; shift 2 ;;
            --)
                shift
                break
                ;;
            *)
                echo "Error: invalid option \"$1\"" >&2
                exit 1
                ;;
        esac
    done
}

parse_args "$@"

# Create the output directory tree and turn output_database into an
# absolute path.
# Globals: output_database (read and rewritten).
# Creates: <output_database>, <output_database>/library and
#          <output_database>/output_cluster.
prepare_output_dirs() {
    # Fall back to the script's default location if nothing is configured.
    output_database="${output_database:-db}"
    # Create first, resolve second: `readlink -f` fails when an intermediate
    # directory of the path does not exist yet, which used to abort the script
    # without any message for a path such as "-o new_parent/db".
    mkdir -p -- "$output_database/library" "$output_database/output_cluster"
    output_database="$(readlink -f -- "$output_database")"
}

prepare_output_dirs

# Directory holding this script; the Python helpers are looked up there.
script_path="$(readlink -f -- "$0")"
script_dir="$(dirname -- "$script_path")"

# Optional flags forwarded to the Python helpers; each stays empty unless the
# matching option was selected.
gtdb_flag=""
if [[ "$db" == "gtdb" ]]; then
    gtdb_flag="--gtdb"
fi

remove_flag=""
if [[ "$remove" == "True" ]]; then
    remove_flag="--remove"
fi

matrix_filter_flag=""
if [[ "$matrix_filter" == "True" ]]; then
    matrix_filter_flag="-mf"
fi

# Invoke genomes_process.py with the configured inputs.
# Globals read: script_dir, genomes_database, output_database, custom_db,
#   gtdb_metadata, genome_asm_lvl, species_cluster, db, summary_file,
#   remove_scaffold_len, remove_flag.
# Returns: the exit status of the Python script.
run_genomes_process() {
    # Build the argv as an array so values containing whitespace (typically
    # paths) reach Python as single arguments.
    local -a cmd=(
        python "$script_dir/genomes_process.py"
        -i "$genomes_database"
        -o "$output_database/library"
        -c "$custom_db"
        -f "$gtdb_metadata"
        -l "$genome_asm_lvl"
        -p "$species_cluster"
        -d "$db"
        -s "$summary_file"
        -rl "$remove_scaffold_len"
    )
    # remove_flag is empty unless --remove was requested; append it only when
    # set so that no empty argument is passed.
    if [[ -n "$remove_flag" ]]; then
        cmd+=("$remove_flag")
    fi
    "${cmd[@]}"
}

# Invoke genomes_cluster.py on the processed library.
# Arguments: $1 - mode flags as ONE whitespace-separated string, e.g.
#            "--compute", "--cluster" or "--compute --cluster".
# Globals read: script_dir, output_database, genomes_info_provided_origin,
#   genome_statics, cluster_max_genome, pan_max_genome,
#   species_cal_parllel_num, fastani_process, gtdb_flag.
# Returns: the exit status of the Python script.
run_genomes_cluster() {
    # Callers pack several flags into $1, so split that one string into words
    # deliberately; everything else is passed as a single argument.
    local -a mode_flags=()
    read -r -a mode_flags <<< "${1-}"

    local -a cmd=(
        python "$script_dir/genomes_cluster.py"
        -i "$output_database/library/$genomes_info_provided_origin"
        -d "$output_database/library"
        -gs "$genome_statics"
        -o "$output_database/output_cluster"
        -m "$cluster_max_genome"
        -n "$pan_max_genome"
        -p "$species_cal_parllel_num"
        -j "$fastani_process"
    )
    if (( ${#mode_flags[@]} > 0 )); then
        cmd+=("${mode_flags[@]}")
    fi
    # gtdb_flag is empty unless the GTDB database is selected.
    if [[ -n "$gtdb_flag" ]]; then
        cmd+=("$gtdb_flag")
    fi
    "${cmd[@]}"
}

# Invoke get_genomes_info.py for the library directory.
# Globals read: script_dir, output_database, genomes_info_provided_origin,
#   gtdb_flag.
# Returns: the exit status of the Python script.
run_get_genomes_info() {
    # Array-built argv keeps paths with whitespace intact.
    local -a cmd=(
        python "$script_dir/get_genomes_info.py"
        -c "$output_database/library/$genomes_info_provided_origin"
        -o "$output_database/library"
    )
    # gtdb_flag is empty unless the GTDB database is selected.
    if [[ -n "$gtdb_flag" ]]; then
        cmd+=("$gtdb_flag")
    fi
    "${cmd[@]}"
}

# Invoke hcls_select_rep.py (hierarchical clustering path).
# Arguments: $1 - genomes information file, passed to the script as -f.
# Globals read: script_dir, matrix_file, hcls_genome_num, hcls_method,
#   hcls_cutoff, output_database, fastani_process, gtdb_flag,
#   matrix_filter_flag.
# Returns: the exit status of the Python script.
run_hcls_select_rep() {
    # Array-built argv keeps paths with whitespace intact.
    local -a cmd=(
        python "$script_dir/hcls_select_rep.py"
        -m "$matrix_file"
        -n "$hcls_genome_num"
        -f "${1-}"
        -hm "$hcls_method"
        -e "$hcls_cutoff"
        -o "$output_database/hcls"
        -t "$fastani_process"
    )
    # Both flags are empty unless the matching option was selected; append
    # them only when set so no empty argument is passed.
    if [[ -n "$gtdb_flag" ]]; then
        cmd+=("$gtdb_flag")
    fi
    if [[ -n "$matrix_filter_flag" ]]; then
        cmd+=("$matrix_filter_flag")
    fi
    "${cmd[@]}"
}

# File names shared by the steps below. "genomes.txt" and "used_genomes.txt"
# are relative, i.e. they live in the directory the script is run from.
genomes="genomes.txt"
genome_statics="$output_database/genome_statics.txt"
genomes_info_provided_origin="genomes_info_provided_origin.txt"

# Print an INFO line stamped with the current time (not the start-up time).
_log_info() {
    echo "$(date "+%Y-%m-%d %H:%M:%S") - $0 - INFO: $*"
}

# Print a WARNING line on stderr, stamped with the current time.
_log_warn() {
    echo "$(date "+%Y-%m-%d %H:%M:%S") - $0 - WARNING: $*" >&2
}

# Run genomes_process.py once; a marker file makes later runs skip the step.
_ensure_genomes_processed() {
    if [[ ! -f "$output_database/genomes_process_done" ]]; then
        run_genomes_process
        touch -- "$output_database/genomes_process_done"
    fi
}

# Build the genome statistics file when the genome list exists and the
# statistics have not been produced yet.
_ensure_genome_statics() {
    if [[ -e "$genomes" && ! -e "$genome_statics" ]]; then
        python "$script_dir/staticsData.py" --filelist "$genomes" \
            -t "$fastani_process" -o "$genome_statics"
    fi
}

# Copy the final genomes_info.txt from the library into the output directory.
_publish_genomes_info() {
    cp -- "$output_database/library/genomes_info.txt" "$output_database"
}

# After clustering: list the kept FASTA files, regenerate genomes_info.txt
# from them and publish it.
_finish_clustering() {
    find "$output_database/output_cluster" -name "*fna*" > used_genomes.txt
    run_get_genomes_info
    _publish_genomes_info
}

# No clustering requested: derive genomes_info.txt straight from the provided
# genome information and publish it.
_write_unclustered_genomes_info() {
    local src="$output_database/library/$genomes_info_provided_origin"
    local dst="$output_database/library/genomes_info.txt"
    if [[ "$db" == "gtdb" ]]; then
        # Replace spaces in column 3 by underscores on every data row.
        # NOTE(review): changing $3 makes awk rebuild the row with its default
        # output separator (a space), so modified rows are no longer
        # tab-separated - confirm that downstream readers expect this.
        awk -F'\t' 'NR > 1 {gsub(/ /, "_", $3); print $0} NR == 1 {print $0}' "$src" > "$dst"
    else
        cp -- "$src" "$dst"
    fi
    _publish_genomes_info
}

# Graph-based clustering path (cluster_method == graph).
_run_graph_clustering() {
    _log_info "Using graph-based clustering..."
    if [[ "$custom_db" != "None" || "$db" == "gtdb" ]]; then
        # this step will produce two files in running path
        _ensure_genomes_processed
        if [[ "$compute" == "True" || "$cluster" == "True" ]]; then
            _ensure_genome_statics
        fi
        case "$compute/$cluster" in
            True/False)
                run_genomes_cluster "--compute"
                ;;
            False/True)
                run_genomes_cluster "--cluster"
                # Publishes genomes_info.txt as well, like every other
                # clustering branch; previously this branch left the copy out.
                _finish_clustering
                ;;
            True/True)
                run_genomes_cluster "--compute --cluster"
                _finish_clustering
                ;;
            False/False)
                _write_unclustered_genomes_info
                ;;
        esac
    elif [[ "$db" == "rs" ]]; then
        if [[ ! -f "$summary_file" ]]; then
            echo "Assembly summary file does not exist, prepare to download..."
            wget -c https://ftp.ncbi.nlm.nih.gov/genomes/refseq/bacteria/assembly_summary.txt \
                -O "$output_database/library/assembly_summary_bacteria.txt"
            # Global on purpose: run_genomes_process reads summary_file.
            summary_file="$output_database/library/assembly_summary_bacteria.txt"
        fi
        _ensure_genomes_processed
        if [[ "$cluster" == "True" ]]; then
            _ensure_genome_statics
            run_genomes_cluster "--compute --cluster"
            _finish_clustering
        elif [[ "$cluster" == "False" ]]; then
            # db is "rs" here, so this always takes the plain-copy route.
            _write_unclustered_genomes_info
        fi
    else
        _log_warn "no database source selected (use --custom, --db gtdb/--gm or --db rs); nothing was processed"
    fi
}

# Hierarchical clustering path (cluster_method == hcls).
_run_hcls_clustering() {
    _log_info "Using hierarchical clustering $hcls_method $hcls_cutoff"
    if [[ -f "$custom_db" && -f "$matrix_file" ]]; then
        # Both the genome table and the ANI matrix were supplied by the user.
        run_hcls_select_rep "$custom_db"
    else
        run_genomes_process
        run_hcls_select_rep "$output_database/library/$genomes_info_provided_origin"
    fi
    cp -- "$output_database/hcls/hcls_filtered_genomes_info.txt" "$output_database/genomes_info.txt"
    _log_info "Clustering done"
}

# Entry point of the preprocessing work: dispatch on the clustering method.
# Globals read: cluster_method plus everything the selected path uses.
run_preprocess_pipeline() {
    case "$cluster_method" in
        graph) _run_graph_clustering ;;
        hcls) _run_hcls_clustering ;;
        *) _log_warn "unknown clustering method \"$cluster_method\" (expected graph or hcls); nothing was processed" ;;
    esac
}

run_preprocess_pipeline

# if [ $cluster == "True" ]; then
#     rm $genomes $genome_statics used_genomes.txt
# fi
# Take a fresh timestamp so the closing message carries the finish time.
timestamp="$(date '+%Y-%m-%d %H:%M:%S')"
printf '%s\n' "$timestamp - $0 - INFO: Data preprocessing completely"
