#!/usr/bin/env perl

use strict;
use warnings;
use Carp;
use Getopt::Long qw(:config no_ignore_case bundling pass_through);
use FindBin;
use Cwd qw(abs_path);
use lib ("$FindBin::Bin/PerlLib");
use Pipeliner;
use Process_cmd;
use File::Basename;

# Release string; printed by --version and interpolated into the usage text.
my $VERSION = "v1.0.0";
#my $VERSION = "DEVEL";

# Thread count handed to ctat-minimap2 via -t.
my $CPU = 4;
# Output directory; created if missing and used as the working directory.
my $output_directory = "ctat_LR_fusion_outdir";


#  long read filters
#  (forwarded to filter_LR_fusions_by_evidence_abundance.py as
#   --min_num_LR and --min_LR_novel_junction_support)
my $MIN_NUM_LR = 1;
my $MIN_LR_NOVEL = 2;


# short read filters
#  (forwarded to filter_LR_fusions_by_evidence_abundance.py as
#   --min_J, --min_sumJS and --min_novel_junction_support)
my $MIN_J = 1;
my $MIN_SUM_JS = 1;
my $MIN_NOVEL_J = 1;

# Minimum percent identity; forwarded to genome_gff3_to_chim_summary.pl as --min_per_id.
my $MIN_PER_ID = 70;

# CTAT genome lib location; falls back to the CTAT_GENOME_LIB environment variable.
my $genome_lib_dir = $ENV{CTAT_GENOME_LIB} || "";

my $help_flag;
# Long-read input (fasta/fastq, optionally gzipped).
my $transcripts_file;
# Optional Illumina reads; the literal string "NA" is the "not provided" sentinel.
my $left_fq = "NA";
my $right_fq = "NA";

my $DEBUG = 0;


my $SHOW_VERSION = 0;
# When set, candidates are not passed through filter_by_annotation_rules.pl.
my $NO_ANNOTATION_FILTER = 0;

# Build the minimap2 genome index / splice bed before running (or build and stop).
my $PREP_REFERENCE = 0;
my $PREP_REFERENCE_ONLY = 0;

# NOTE(review): documented as the max intron length for the minimap2 search,
# but no command string built in this file references this variable.
my $max_intron_length = 100000;

# Introns in the per-fusion mini-genome contigs are shrunk to this length
# unless intron shrinking is disabled.
my $NO_SHRINK_INTRONS = 0;
my $shrink_intron_max_length = 1000;

# When set, the final evidence-abundance filter step is skipped.
my $NO_ABUNDANCE_FILTER = 0;

# When set, the igv-reports html visualization is generated.
my $IGV_REPORTS = 0;

# Forwarded to retrieve_fusion_transcript_candidates.pl as --max_exon_delta.
my $MAX_EXON_DELTA = 50;

# Forwarded to LR_sam_fusion_read_extractor.pl as --max_alignments_per_fusion.
my $max_IGV_LR_per_fusion = 100;

# Forwarded to LR-FI_fusion_align_extractor.pl as --snap_dist.
my $SNAP_dist = 3;

# Dominant-isoform fraction filter; the filter step only runs when this is > 0.
my $MIN_FRACTION_DOMINANT_ISO = 0.05;

my $CHIM_CANDIDATES_ONLY = 0; # force stop after phase 1

# Minimum FFPM; used both for candidate retrieval (phase 1) and the final abundance filter.
my $MIN_FFPM = 0.1;

# When set, fusion_to_coding_region_effect.pl is run on the final predictions.
my $EXAMINE_CODING_EFFECT = 0;

# Extra text appended verbatim to the FusionInspector command line.
my $FI_extra_params = "";

#my $min_mapping_quality = 20;

# Forwarded to LR-FI_fusion_align_extractor.pl as --min_trans_overlap_length.
my $min_trans_overlap_length = 100;


# Help text shown for --help and whenever required options are missing.
# The heredoc interpolates the current default values, so it must be built
# after the defaults above are declared and before GetOptions() runs.
my $usage = <<__EOUSAGE__;

############################################################################################################
#
#  Required:
#
#  --transcripts|T <string>        :transcript fasta or fastq file
#
#  --genome_lib_dir <string>       directory containing genome lib (see https://github.com/NCIP/ctat-genome-lib-builder/wiki for details)
#                                      (defaults to env CTAT_GENOME_LIB if set. -- currently: [ $genome_lib_dir ] )
#  Optional:
#
#  --vis                           :include igv-report visualization html
#
#  --CPU <int>                     :number threads (default $CPU)
#
#  --left_fq <string>              :Illumina paired-end reads /1
#
#  --right_fq <string>             :Illumina paired-end reads /2
#
#  --min_per_id <int>              :minimum percent identity (default: $MIN_PER_ID) minimap2 alignments < --min_per_id are discarded.
#
#  --min_FFPM <float>              :min fusion expression (default: $MIN_FFPM FFPM - meaning at least 1 fusion read per 10 M total reads )
#
#  --output|o <string>             :output directory name (default: $output_directory)
#
#  ##############################
#  Filtering by evidence support:
#
#   ###############################################################################################################################################
#   # Fusions are reported if they meet the following requirements:
#   #
#   # Long reads:  (min_num_LR & canonical splice) or (min_LR_novel_junction_support & non canonical splicing)
#   #    (or)
#   # Short reads: (min_J & min_sumJS & canonical splice & min_FFPM) or (min_novel_junction_support & min_sumJS & non canonical splicing & min_FFPM)
#   #################################################################################################################################################
#
#  Long read filtering:
#
#  --min_num_LR <int>            : min number of long reads supporting fusion including canonical splicing at breakpoint (default: $MIN_NUM_LR)
#
#  --min_LR_novel_junction_support <int>   : min number of long reads with support at noncanonical splice breakpoint (eg. maybe not spliced!) (default: $MIN_LR_NOVEL)
#
#  --min_trans_overlap_length <int>           : minimum read overlap length for each gene in the fusion pair, default: $min_trans_overlap_length
#
# Short read abundance filters:
#
#  --min_J <int>                 :minimum number of junction frags (default: $MIN_J)
#
#  --min_sumJS <int>             :minimum sum (junction + spanning) frags (default: $MIN_SUM_JS)
#
#  --min_novel_junction_support <int>   :minimum number of junction reads required for novel (non-reference) exon-exon junction support.
#                                        (default: $MIN_NOVEL_J)
#
#  --min_frac_dom_iso_expr <float>     : minimum expression of dominant isoform (default: $MIN_FRACTION_DOMINANT_ISO)
#
#  #############
#  Misc options:
#
#  --max_exon_delta <int>          : maximum allowed distance of fusion breakpoint from reference exon boundary in initial candidate search. (default: $MAX_EXON_DELTA)
#
#  --max_intron_length <int>       : maximum intron length during minimap2 search (default: $max_intron_length)
#
#  --shrink_intron_max_length <int> : length for shrinking long introns to during the FusionInspector-style alignments. (default: $shrink_intron_max_length)
#  --no_shrink_introns             : disable intron shrinking during FusionInspector-style alignments.
#
#  --snap_dist <int>               :  if breakpoint is at most this distance from a reference exon boundary, position gets snapped to the splice site. (default: $SNAP_dist)
#
#  --no_annot_filter               : disable filtering outputs based on fusion annotations.
#
#  --no_abundance_filter           : disable final phase-2 filtering based on read support (does not impact phase-1 min_FFPM based filtering!)
#
#  --prep_reference                : prepares minimap2 genome index
#  --prep_reference_only           : only prepares genome index and then stops.
#
#  --max_IGV_LR_per_fusion <int>   : for the --vis, select a max number of LR per fusion for the html. (default: $max_IGV_LR_per_fusion)
#                                          (more reads means longer browser load times)
#
#  --examine_coding_effect         : explore impact of fusions on coding sequences (guesses only based on the fusion breakpoint and
#                                                                                   ref annotations, prioritizing in-frame over out-of-frame
#                                                                                   coding isoform fusion pairs)
#
#
#  --extract_fusion_LR_fasta <string>  : extract the long reads that serve as evidence to the fusion and write them to the corresponding file.
#                                        The fusion name is prefixed to the read name like so:  fusion_name|read_name
#
#  --chim_candidates_only             : stop after first phase of identifying candidates before evaluating them in the second phase.
#                                                     see chimera candidate listing as: output_dir/fusion_intermediates_dir/chimeric_read_candidates.FI_listing
#
#  --FI_extra_params <string>         : extra parameters to give to FusionInspector (eg. "--STAR_xtra_params '--limitBAMsortRAM 61419850732' ")
#
#
#  --version                             report version ($VERSION)
#
##############################################################################################################



__EOUSAGE__

    ;

# deprecated for now
#  --min_mapping_quality <int>             : minimum mapping quality for phase-2 (FI-like), default: $min_mapping_quality
#

my $top_candidates_only = 0; # hidden opt for troubleshooting igv-reports
my $extract_fusion_LR_fasta = "";

# Parse the command line into the settings declared above.
# Every option advertised in the usage text must be registered here: the
# Getopt::Long 'pass_through' configuration leaves unknown options in @ARGV
# without raising an error, so an unregistered option is silently ignored.
&GetOptions ( 'help|h' => \$help_flag,

              ## Required
              'transcripts|T=s' => \$transcripts_file,
              'genome_lib_dir=s' => \$genome_lib_dir,

              'left_fq=s' => \$left_fq,
              'right_fq=s' => \$right_fq,

              ## optional
              'CPU=i' => \$CPU,
              'output|o=s' => \$output_directory,

              ## long read evidence filters
              'min_num_LR=i' => \$MIN_NUM_LR,
              'min_LR_novel_junction_support=i' => \$MIN_LR_NOVEL,
              'min_trans_overlap_length=i' => \$min_trans_overlap_length,

              ## short read evidence filters
              'min_J=i' => \$MIN_J,
              'min_sumJS=i' => \$MIN_SUM_JS,
              'min_per_id=i' => \$MIN_PER_ID,
              'min_novel_junction_support=i' => \$MIN_NOVEL_J,
              'version' => \$SHOW_VERSION,

              'no_annot_filter' => \$NO_ANNOTATION_FILTER,
              'DEBUG' => \$DEBUG,

              'prep_reference' => \$PREP_REFERENCE,
              'prep_reference_only' => \$PREP_REFERENCE_ONLY,

              'max_intron_length=i' => \$max_intron_length,
              'max_exon_delta=i' => \$MAX_EXON_DELTA,

              'min_frac_dom_iso_expr=f' => \$MIN_FRACTION_DOMINANT_ISO,

              'snap_dist=i' => \$SNAP_dist,

              'no_shrink_introns' => \$NO_SHRINK_INTRONS, # introns are shrunk by default
              # 'shrink_intron_length' is accepted as an alias because that is the
              # spelling the usage text has historically advertised.
              'shrink_intron_max_length|shrink_intron_length=i' => \$shrink_intron_max_length,

              'vis' => \$IGV_REPORTS,
              'max_IGV_LR_per_fusion=i' => \$max_IGV_LR_per_fusion,

              'no_abundance_filter' => \$NO_ABUNDANCE_FILTER,

              'top_candidates_only=i' => \$top_candidates_only,

              'extract_fusion_LR_fasta=s' => \$extract_fusion_LR_fasta,

              'min_FFPM=f' => \$MIN_FFPM,

              'chim_candidates_only' => \$CHIM_CANDIDATES_ONLY,

              'examine_coding_effect' => \$EXAMINE_CODING_EFFECT,

              'FI_extra_params=s' => \$FI_extra_params,

              # 'min_mapping_quality=i' => \$min_mapping_quality,
);

# With pass_through, anything GetOptions did not recognize is still sitting in
# @ARGV. Report it so that a mistyped option does not go unnoticed; this is a
# warning rather than a fatal error to stay compatible with existing wrappers.
if (@ARGV) {
    print STDERR "WARNING: ignoring unrecognized command-line parameters: @ARGV\n";
}



################################################################
## Validate options, resolve paths, and prepare the work area.
################################################################

if ($help_flag) {
    die $usage;
}

if ($SHOW_VERSION) {
    print "\n\nctat-LR-fusion $VERSION\n\n";
    exit(0);
}


# The genome lib is always required. The long reads are required for a real
# run, but not when only the reference index is being prepared: that mode
# exits below before the reads are ever looked at.
unless ($genome_lib_dir && ($transcripts_file || $PREP_REFERENCE_ONLY)) {
    die $usage;
}

# Resolve all user-supplied paths up front, because the script later changes
# its working directory to the output directory.
$genome_lib_dir = &ensure_full_path($genome_lib_dir);

if ($transcripts_file) {
    $transcripts_file = &ensure_full_path($transcripts_file);
}
if ($left_fq ne "NA") {
    $left_fq = &ensure_full_path($left_fq);
}
if ($right_fq ne "NA") {
    $right_fq = &ensure_full_path($right_fq);
}


my $long_reads_only_flag = ($left_fq eq "NA" && $right_fq eq "NA") ? 1:0;


# Locations of the bundled helper tools, relative to this script.
my $UTILDIR = "$FindBin::RealBin/util";

my $FI_DIR = "$FindBin::RealBin/FusionInspector";
my $FI_UTILDIR = "$FI_DIR/util";

my $CTAT_MINIMAP2_DIR = "$FindBin::RealBin/ctat-minimap2";

# The minimap2 genome index and splice-junction bed live inside the genome lib.
my $MM2_DB_DIR = $genome_lib_dir;
my $MM2_DB_NAME = "$MM2_DB_DIR/ref_genome.fa.mm2";

my $REF_GTF = "$genome_lib_dir/ref_annot.gtf";
unless (-s $REF_GTF) {
    die "Error, cannot locate reference annotation file: $REF_GTF";
}

my $MM2_splice_file = "$REF_GTF.mm2.splice.bed";
my $genome_fa = "$genome_lib_dir/ref_genome.fa";

if ($PREP_REFERENCE || $PREP_REFERENCE_ONLY) {
    &prep_minimap2_reference($genome_fa, $MM2_DB_DIR, $MM2_DB_NAME, $MM2_splice_file, $REF_GTF);
    if ($PREP_REFERENCE_ONLY) {
        print STDERR "-option --prep_reference_only flag set. Stopping now.\n";
        exit(0);
    }
}

my $MM2_idx = $MM2_DB_NAME;
unless (-e $MM2_idx) {
    die "Error, cannot locate minimap2 database: $MM2_idx";
}

unless (-e $MM2_splice_file) {
    die "Error, cannot locate file: $MM2_splice_file";
}


# make output directory
$output_directory = &ensure_full_path($output_directory);

unless (-d $output_directory) {
    mkdir $output_directory or die "Error, cannot mkdir $output_directory: $!";
}
# Everything below runs from inside the output directory; several commands
# rely on this by using paths relative to it.
chdir $output_directory or die "Error, cannot cd to $output_directory: $!";


my $intermediates_dir = &ensure_full_path("fusion_intermediates_dir");
unless (-d $intermediates_dir) {
    mkdir $intermediates_dir or die "Error, cannot mkdir $intermediates_dir: $!";
}

# Main pipeline driver.
#
# Phase 1: align the long reads to the reference genome with ctat-minimap2
#          (chimeric alignments only), summarize the chimeric alignments
#          against the reference annotation, and derive a listing of
#          candidate fusion gene pairs plus their supporting reads.
# Phase 2: build per-fusion mini-genome contigs, realign the candidate reads
#          to them, extract fusion breakpoints, optionally merge in
#          FusionInspector short-read evidence, then annotate, filter and
#          copy the final reports into $output_directory.
#
# Commands are queued on a Pipeliner object together with a checkpoint file
# name and are only executed when ->run() is called, so file-existence tests
# in this block must come after the ->run() that produces the file.
main: {

    my $pipeliner = new Pipeliner(-verbose => 2,
                                  -checkpoint_dir => "$intermediates_dir/__checkpts",
        );

    # make fasta file from fastq


    # first, check if gzipped.
    # (the uncompressed copy is written by basename into the current working
    #  directory, i.e. the output directory)
    if ($transcripts_file =~ /\.gz$/) {
        my $uncompressed_transcripts_file = basename($transcripts_file);
        $uncompressed_transcripts_file =~ s/\.gz$//;
        my $cmd = "gunzip -c $transcripts_file > $uncompressed_transcripts_file";
        $pipeliner->add_commands(new Command($cmd, "gunzip_transcripts.ok"));
        $transcripts_file = "$uncompressed_transcripts_file";
    }

    if ($transcripts_file =~ /(fastq|fq)$/i) {
        # convert from fastq to fasta
        my $cmd = "seqtk seq -a $transcripts_file > $transcripts_file.fasta";
        $pipeliner->add_commands(new Command($cmd, "fastq_to_fasta_seqtk.ok"));

        $transcripts_file = "$transcripts_file.fasta";
    }

    # Note: when the input was gzipped and/or fastq, the derived file named
    # here does not exist yet at this point; the commands creating it are
    # only queued and run at the first $pipeliner->run() below.
    $transcripts_file = abs_path($transcripts_file);

    ####################################################
    # run minimap2 to identify potential fusion transcripts
    ####################################################

    my $mm2_intermediate_output_file_prefix = "$intermediates_dir/" . basename($transcripts_file) . ".mm2";

    # NOTE(review): $max_intron_length (--max_intron_length) is not included in
    # this command line, although the usage text describes it as applying to
    # the minimap2 search -- confirm whether that is intended.
    my $cmd = "bash -c \"set -eou pipefail && $CTAT_MINIMAP2_DIR/ctat-minimap2 --only_chimeric --sam-hit-only  --junc-bed $MM2_splice_file -ax splice -u b -t $CPU $MM2_idx $transcripts_file | samtools view -Sb -o $mm2_intermediate_output_file_prefix.bam\" ";


    $pipeliner->add_commands(new Command($cmd, "run_mm2.ok"));

    $cmd = "$UTILDIR/SAM_to_gxf.pl  --sam $mm2_intermediate_output_file_prefix.bam --format gff3 >  $mm2_intermediate_output_file_prefix.gff3";
    $pipeliner->add_commands(new Command($cmd, "mm2_sam_to_gff3.ok"));

    ###############################
    ## generate initial chim report
    ###############################

    my $chims_described_outfile = "$mm2_intermediate_output_file_prefix.chims_described";
    $cmd = "$UTILDIR/genome_gff3_to_chim_summary.pl --align_gff3 $mm2_intermediate_output_file_prefix.gff3 --annot_gtf $REF_GTF --min_per_id $MIN_PER_ID  > $chims_described_outfile";

    $pipeliner->add_commands(new Command($cmd, "chims_described.ok"));

    # Extract candidate chimeric reads and create FI list
    my $chim_candidates_output_prefix = "$intermediates_dir/chimeric_read_candidates";
    my $chim_candidates_fasta = "$chim_candidates_output_prefix.transcripts.fa";
    my $FI_listing = "$chim_candidates_output_prefix.FI_listing";
    my $FI_listing_with_reads = "$chim_candidates_output_prefix.FI_listing.with_reads";
    $cmd = "$UTILDIR/retrieve_fusion_transcript_candidates.pl "
        . " --trans_fasta $transcripts_file "
        . " --chims_described $chims_described_outfile "
        . " --max_exon_delta $MAX_EXON_DELTA "
        . " --min_FFPM $MIN_FFPM "
        . " --output_prefix $chim_candidates_output_prefix";
    if ($CHIM_CANDIDATES_ONLY) {
        $cmd .= " --skip_read_extraction ";
    }

    # The checkpoint name embeds the flag value, so a run with and a run
    # without --chim_candidates_only use different checkpoint files.
    $pipeliner->add_commands(new Command($cmd, "chim_candidates_fasta.skip_read_extraction=${CHIM_CANDIDATES_ONLY}.ok"));


    # annotate candidates
    $cmd = "$FindBin::Bin/FusionAnnotator/FusionAnnotator --genome_lib_dir $genome_lib_dir --annotate $FI_listing > ${FI_listing}.wAnnot";
    $pipeliner->add_commands(new Command($cmd, "chim_candidates_fasta.FI_listing.annotate.ok"));

    $FI_listing = "$FI_listing.wAnnot";



    unless ($NO_ANNOTATION_FILTER) {

        $cmd = "$FindBin::Bin/FusionFilter/util/filter_by_annotation_rules.pl --fusions  $FI_listing --genome_lib_dir $genome_lib_dir --custom_exclusions NEIGHBORS_OVERLAP ";
        $pipeliner->add_commands(new Command($cmd, "filter_by_annot_rules.ok"));

        # presumably the filter script writes its passing entries to
        # "<input>.annot_filter.pass" -- the name is assumed here, not passed in.
        $FI_listing = "$FI_listing.annot_filter.pass";

    }

    # execute everything queued so far (end of candidate discovery)
    $pipeliner->run();

    if ($CHIM_CANDIDATES_ONLY) {
        print STDERR "** --chim_candidates_only flag set, stopping here now. See: $FI_listing \n\n";
        exit(0);
    }


    # Stop early (successfully) when the listing exists but has no entries.
    if (-e $FI_listing && (&count_num_fusions($FI_listing) == 0)) {
        print STDERR "Sorry, no chimeric candidates to pursue. Stopping here.\n";
        exit(0);
    }


    ## update reads fasta
    $cmd = "$FindBin::Bin/util/revise_fusion_reads_fasta.pl $FI_listing $FI_listing_with_reads $chim_candidates_fasta > $chim_candidates_fasta.revised.fasta";
    $pipeliner->add_commands(new Command($cmd, "revise_chimeric_reads_fasta.ok"));

    $chim_candidates_fasta = "$chim_candidates_fasta.revised.fasta";


    $pipeliner->run();


    ##############################################################################################################
    ## start new checkpoints for phase 2 involving FusionInspector like approach for precise breakpoint resolution

    $pipeliner = new Pipeliner(-verbose => 2,
                                  -checkpoint_dir => "$intermediates_dir/__checkpts_phase2",
        );

    # create FI contigs.
    $cmd = "$FI_UTILDIR/fusion_pair_to_mini_genome_join.pl "
        . " --fusions $FI_listing "
        . " --gtf $REF_GTF"
        . " --genome_fa $genome_lib_dir/ref_genome.fa"
        . " --out_prefix $intermediates_dir/LR-FI_targets";

    unless ($NO_SHRINK_INTRONS) {
        $cmd .= " --shrink_introns --max_intron_length $shrink_intron_max_length ";
    }

    if ($top_candidates_only) {
        $cmd .= " --top_candidates_only $top_candidates_only ";
    }

    $pipeliner->add_commands(new Command($cmd, "FI_contigs.ok"));

    # The contigs must exist before the minimap2 index for them can be built,
    # so run now rather than queueing further.
    $pipeliner->run();

    ## prep for mm2
    my $FI_splice_bed = "$intermediates_dir/LR-FI_targets.gtf.mm2.splice.bed";
    my $FI_mm2 = "$intermediates_dir/LR-FI_targets.fa.mm2";

    my $FI_contigs_file = "$intermediates_dir/LR-FI_targets.fa";
    my $FI_annots_gtf = "$intermediates_dir/LR-FI_targets.gtf";

    &prep_minimap2_reference($FI_contigs_file, $intermediates_dir, $FI_mm2, $FI_splice_bed, $FI_annots_gtf);

    ## run mm2 using the chimeric candidates:
    $cmd = "bash -c \"set -eou pipefail && $CTAT_MINIMAP2_DIR/ctat-minimap2 --sam-hit-only  -ax splice -u b --junc-bed $FI_splice_bed -t $CPU $FI_mm2 $chim_candidates_fasta | samtools view -Sb -o $intermediates_dir/LR-FI.mm2.bam\" ";


    $pipeliner->add_commands(new Command($cmd, "LR-FI.mm2.ok"));

    ####
    # filter low quality alignments
    # (currently disabled: the name-sort and quality-filter commands below are
    #  commented out, so no *.namesorted.qfiltered.bam file is produced here)


    # first sort by read name
    # $cmd = "samtools sort -n  $intermediates_dir/LR-FI.mm2.bam -o $intermediates_dir/LR-FI.mm2.namesorted.bam";
    #$pipeliner->add_commands(new Command($cmd, "LR-FI.mm2.namesorted.ok"));
    #$cmd = "$UTILDIR/filter_low_quality_alignments.py  --input_namesorted_bam $intermediates_dir/LR-FI.mm2.namesorted.bam --output_bam $intermediates_dir/LR-FI.mm2.namesorted.qfiltered.bam  -Q $min_mapping_quality";
    #$pipeliner->add_commands(new Command($cmd, "LR-FI.mm2.namesorted.qfiltered.ok"));


    # important, capture secondary alignments so paralogs accounted for here w/ full read and single cell representation
    #$cmd = "$UTILDIR/SAM_to_gxf.pl --sam $intermediates_dir/LR-FI.mm2.namesorted.qfiltered.bam --format gff3 --allow_non_primary > $intermediates_dir/LR-FI.mm2.gff3";
    $cmd = "$UTILDIR/SAM_to_gxf.pl --sam $intermediates_dir/LR-FI.mm2.bam --format gff3 --allow_non_primary > $intermediates_dir/LR-FI.mm2.gff3";
    $pipeliner->add_commands(new Command($cmd, "LR-FI.mm2.sam_to_gff3.ok"));

    # get seq-similar regions to help in filtering alignment evidence.
    $cmd = "$FI_UTILDIR/get_seq_similar_region_FI_coordinates.pl "
        . " --finspector_gtf $intermediates_dir/LR-FI_targets.gtf "
        . " --genome_lib_dir $genome_lib_dir "
        . " >  $intermediates_dir/LR-FI_targets.seqsimilar_regions.gff3 ";

    $pipeliner->add_commands(new Command($cmd, "FI_targets_seqsim_gff3.ok") );



    # extract fusion-supporting alignments and breakpoint info
    $cmd = "$UTILDIR/LR-FI_fusion_align_extractor.pl "
        . " --FI_gtf $intermediates_dir/LR-FI_targets.gtf "
        . " --LR_gff3 $intermediates_dir/LR-FI.mm2.gff3 "
        . " --seq_similar_gff3  $intermediates_dir/LR-FI_targets.seqsimilar_regions.gff3 "
        . " --output_prefix $intermediates_dir/LR-FI.mm2.fusion_transcripts "
        . " --snap_dist $SNAP_dist "
        . " --min_trans_overlap_length $min_trans_overlap_length "
        . " >  $intermediates_dir/LR-FI.mm2.fusion_transcripts";
    $pipeliner->add_commands(new Command($cmd, "LR-FI.mm2.sam_to_gff3.extract_fusions.ok"));

    $pipeliner->run();

    # NOTE(review): no command string built in this file writes this count
    # file; presumably an earlier helper (given --trans_fasta) creates it next
    # to the reads fasta -- confirm. The call dies if the file is missing.
    my $LR_total_count = &get_total_read_count("$transcripts_file.LR_read_count.txt");

    my $fusions_filename = "$intermediates_dir/LR-FI.mm2.fusion_transcripts.breakpoint_info.tsv";

    $cmd = "$UTILDIR/incorporate_LR_FFPM.pl --fusions $fusions_filename --num_LR_total $LR_total_count --output_file $fusions_filename.w_LR_FFPM";
    $pipeliner->add_commands(new Command($cmd, "added_LR_FFPM.ok"));
    $fusions_filename = "$fusions_filename.w_LR_FFPM";

    ###############################################
    ## Add regular FI for short reads if available.

    # Short-read processing is keyed on --left_fq only; --right_fq is added
    # to the command when it was also given.
    if ($left_fq ne "NA") {
        $cmd = "$FI_DIR/FusionInspector --fusions $FI_listing --genome_lib_dir $genome_lib_dir "
            . " --FI_contigs_fa $intermediates_dir/LR-FI_targets.fa --FI_contigs_gtf $intermediates_dir/LR-FI_targets.gtf "
            . " --left_fq $left_fq ";
        if ($right_fq ne "NA") {
            $cmd .= " --right_fq $right_fq ";
        }

        if ($FI_extra_params ) {
            $cmd .= " $FI_extra_params";
        }

        $pipeliner->add_commands(new Command($cmd, "FI_short_reads.ok"));

        my $merged_fusions_filename = "$intermediates_dir/mm2_and_FI_fusions_merged.tsv";

        # merge FI with the mm2 fusions
        # (the FI/ path is relative to the current working directory, which is
        #  the output directory)
        $cmd = "$UTILDIR/merge_mm2fusion_FI.py --mm2_fusions $fusions_filename  --FI_fusions FI/finspector.FusionInspector.fusions.tsv --output_file $merged_fusions_filename";
        $pipeliner->add_commands(new Command($cmd, "merge_mm2_FI.ok"));

        $fusions_filename = $merged_fusions_filename;

    }


    #################################
    ## Fusion Annotator
    #################################

    $cmd = "$FindBin::Bin/FusionAnnotator/FusionAnnotator --genome_lib_dir $genome_lib_dir --annotate $fusions_filename > $fusions_filename.wAnnot";
    $pipeliner->add_commands(new Command($cmd, "annotate_fusions.ok"));

    $fusions_filename = "$fusions_filename.wAnnot";

    ###############################
    ## Preliminary fusion reporting
    ###############################

    ## consider this the pre-filtered preliminary report
    my $preliminary_report_file = "$output_directory/ctat-LR-fusion.fusion_predictions.preliminary.tsv";
    $cmd = "cp $fusions_filename $preliminary_report_file";
    $pipeliner->add_commands(new Command($cmd, "cp_to_prelim.ok"));


    ## add abridged version w/o all the evidence read names.
    ## (the input is named relative to the current working directory, which is
    ##  the output directory, so it is the file copied just above)
    $cmd = "$FI_UTILDIR/column_exclusions.pl ctat-LR-fusion.fusion_predictions.preliminary.tsv "
        . " LR_accessions,JunctionReads,SpanningFrags,CounterFusionLeftReads,CounterFusionRightReads "
        . " > $output_directory/ctat-LR-fusion.fusion_predictions.preliminary.abridged.tsv";
    $pipeliner->add_commands(new Command($cmd, "abridged_prelim_preds.ok"));


    ## Fusion filtering

    $cmd = "$FindBin::Bin/FusionFilter/blast_and_promiscuity_filter.pl --fusion_preds $fusions_filename --out_prefix $fusions_filename --genome_lib_dir $genome_lib_dir";
    $pipeliner->add_commands(new Command($cmd, "blast_promisc_filter.ok"));


    $fusions_filename = "$fusions_filename.post_blast_and_promiscuity_filter";


    # perform final evidence abundance-based filtering.
    unless ($NO_ABUNDANCE_FILTER) {

        $cmd = "$UTILDIR/filter_LR_fusions_by_evidence_abundance.py "
            . " --min_num_LR $MIN_NUM_LR "
            . " --min_FFPM $MIN_FFPM "
            . " --min_LR_novel_junction_support $MIN_LR_NOVEL "
            . " --min_J $MIN_J "
            . " --min_sumJS $MIN_SUM_JS "
            . " --min_novel_junction_support $MIN_NOVEL_J "
            . " --fusions_input $fusions_filename "
            . " --filtered_fusions_output $fusions_filename.filt_by_min_reads";

        $pipeliner->add_commands(new Command($cmd, "filter_by_min_reads.ok"));

        $fusions_filename = "$fusions_filename.filt_by_min_reads";

    }

    # dominant-isoform fraction filter; skipped entirely when the threshold is 0
    if ($MIN_FRACTION_DOMINANT_ISO > 0) {

        $cmd = "$UTILDIR/filter_low_pct_dom_iso.py "
            . " --min_frac_dom_iso $MIN_FRACTION_DOMINANT_ISO "
            . " --fusions_input $fusions_filename "
            . " --filtered_fusions_output $fusions_filename.filt_by_min_dom_iso_frac";

        $pipeliner->add_commands(new Command($cmd, "filter_by_min_dom_iso_frac.ok"));

        $fusions_filename = "$fusions_filename.filt_by_min_dom_iso_frac";
    }


    if ($EXAMINE_CODING_EFFECT) {

        my $fusions_w_coding_effect_file = "$fusions_filename.w_coding_effect";

        $cmd = "$FindBin::Bin/FusionAnnotator/util/fusion_to_coding_region_effect.pl  --fusions $fusions_filename --genome_lib_dir $genome_lib_dir > $fusions_w_coding_effect_file";
        $pipeliner->add_commands(new Command($cmd, "coding_eff.ok"));

        $fusions_filename = $fusions_w_coding_effect_file;

    }


    ############################################
    # Copy final fusions file as the deliverable:
    # (checkpoint names embed the --examine_coding_effect flag value, so the
    #  copy uses a different checkpoint when that flag changes between runs)

    $cmd = "cp $fusions_filename $output_directory/ctat-LR-fusion.fusion_predictions.tsv";
    $pipeliner->add_commands(new Command($cmd, "copy_final_predictions_to_deliverable.$EXAMINE_CODING_EFFECT.ok"));


    ## add abridged version w/o all the evidence read names.
    $cmd = "$FI_UTILDIR/column_exclusions.pl $output_directory/ctat-LR-fusion.fusion_predictions.tsv "
        . " LR_accessions,JunctionReads,SpanningFrags,CounterFusionLeftReads,CounterFusionRightReads "
        . " > $output_directory/ctat-LR-fusion.fusion_predictions.abridged.tsv";
    $pipeliner->add_commands(new Command($cmd, "abridged_final_preds.$EXAMINE_CODING_EFFECT.ok"));



    # Note: $extract_fusion_LR_fasta is used as given; a relative path is
    # therefore interpreted relative to the output directory (the cwd).
    if ($extract_fusion_LR_fasta) {
        $cmd = "$UTILDIR/extract_fusion_evidence_reads.pl "
            . " --fusions $output_directory/ctat-LR-fusion.fusion_predictions.tsv "
            . " --reads_fasta $chim_candidates_fasta "
            . " --reads_output $extract_fusion_LR_fasta";
        $pipeliner->add_commands(new Command($cmd, "extract_fusion_reads.ok"));
    }


    $pipeliner->run();





    # The visualization commands are queued only after the run above, because
    # include_IGV_REPORTS tests for FusionInspector output files on disk while
    # it is building its command list.
    if ($IGV_REPORTS) {
        &include_IGV_REPORTS($pipeliner, $FI_contigs_file, $FI_annots_gtf, $fusions_filename, $max_IGV_LR_per_fusion);
        $pipeliner->run();
    }


    print STDERR "\n\n\tDone. See fusion predictions at: $output_directory/ctat-LR-fusion.fusion_predictions.tsv\n\n\n";

    exit(0); ### stopping here now.



}



####
# Build the minimap2 index and the splice-junction bed file for a reference.
#
# Parameters:
#   $genome_fa          fasta file to index
#   $intermediates_dir  directory that receives the Pipeliner checkpoint
#                       subdirectory (__mm2_prep_chkpts)
#   $MM2_DB_NAME        path of the minimap2 index to create
#   $MM2_splice_file    path of the splice bed file to create
#   $REF_GTF            annotation gtf the splice bed is derived from
#
# The work is skipped when "$MM2_DB_NAME.build.ok" already exists; that marker
# is written only after both build commands have been run.
# Reads the file-level $CTAT_MINIMAP2_DIR. Returns nothing; dies if the marker
# file cannot be created.
sub prep_minimap2_reference {
    my ($genome_fa, $intermediates_dir, $MM2_DB_NAME, $MM2_splice_file, $REF_GTF) = @_;

    my $mm2_build_ok_checkpoint = "$MM2_DB_NAME.build.ok";

    if (-e $mm2_build_ok_checkpoint) {
        print STDERR "-checkpoint exists: $mm2_build_ok_checkpoint, so skipping.\n";
        return;
    }

    my $mm2_prep_pipeliner = new Pipeliner(-verbose => 2,
                                           -checkpoint_dir => "$intermediates_dir/__mm2_prep_chkpts");

    # index the genome
    my $cmd = "$CTAT_MINIMAP2_DIR/ctat-minimap2 -d $MM2_DB_NAME $genome_fa";
    $mm2_prep_pipeliner->add_commands(new Command($cmd, "mm2_prep_genome.ok"));

    # derive the splice-junction bed from the annotation
    $cmd = "$CTAT_MINIMAP2_DIR/misc/paftools.ctat.js gff2bed $REF_GTF > $MM2_splice_file";
    $mm2_prep_pipeliner->add_commands(new Command($cmd, "mm2_prep_splices.ok"));

    $mm2_prep_pipeliner->run();

    # Record completion. Done in-process (rather than via an unchecked
    # `touch` through the shell) so that a failure to write the marker is
    # reported instead of silently causing a rebuild on every run.
    open(my $checkpoint_fh, '>>', $mm2_build_ok_checkpoint)
        or die "Error, cannot write checkpoint file: $mm2_build_ok_checkpoint ($!)";
    close($checkpoint_fh)
        or die "Error, cannot close checkpoint file: $mm2_build_ok_checkpoint ($!)";

    return;
}

####
# Queue (but do not run) the commands that build the igv-reports html page.
#
# Parameters:
#   $pipeliner              Pipeliner object the commands are added to; the
#                           caller is responsible for calling ->run()
#   $FI_contigs_file        fasta of the fusion mini-genome contigs
#   $FI_annots_gtf          gtf annotation for those contigs
#   $fusions_file           fusion report used to select the long reads to show
#   $max_IGV_LR_per_fusion  cap on long-read alignments included per fusion
#
# Reads the file-level $intermediates_dir, $output_directory, $genome_lib_dir,
# $UTILDIR and $FI_UTILDIR. Creates $intermediates_dir/IGV_prep immediately.
#
# The short-read tracks are included only if the FusionInspector evidence sam
# files already exist when this sub is called, so it must be called after the
# FusionInspector step has actually been run.
sub include_IGV_REPORTS {
    my ($pipeliner, $FI_contigs_file, $FI_annots_gtf, $fusions_file, $max_IGV_LR_per_fusion) = @_;

    my $igv_prep_dir = "$intermediates_dir/IGV_prep";
    if (! -d $igv_prep_dir) {
        mkdir($igv_prep_dir) or die "Error, cannot mkdir $igv_prep_dir";
    }


    ## include the fusion gene targets and annotations:

    my $cmd = "ln -sf $FI_contigs_file $igv_prep_dir/igv.genome.fa";
    $pipeliner->add_commands(new Command($cmd, "symlink_igv_genome_fa.ok"));

    $cmd = "ln -sf $FI_contigs_file.fai $igv_prep_dir/igv.genome.fa.fai";
    $pipeliner->add_commands(new Command($cmd, "symlink_igv_genome_fa.fai.ok"));

    $cmd = "ln -sf $FI_annots_gtf $igv_prep_dir/igv.annot.gtf";
    $pipeliner->add_commands(new Command($cmd, "symlink_igv_genome_annot.ok"));

    $cmd = "$FI_UTILDIR/gtf_gene_to_bed.pl $igv_prep_dir/igv.annot.gtf > $igv_prep_dir/igv.annot.bed";
    $pipeliner->add_commands(new Command($cmd, "prep_igv_annot_bed.ok"));


    ## get the long read alignments
    # Uses the phase-2 minimap2 alignments (LR-FI.mm2.bam). The name-sorted,
    # quality-filtered bam that used to be read here is no longer generated by
    # the main pipeline, so referencing it made --vis fail on a missing file.
    my $LR_alignments_bam = "$intermediates_dir/LR-FI.mm2.bam";
    my $LR_selected_sam = "$intermediates_dir/LR-FI.mm2.max_per_fusion-$max_IGV_LR_per_fusion.sam";

    $cmd = "$UTILDIR/LR_sam_fusion_read_extractor.pl --FI_LR_sam $LR_alignments_bam  --LR_fusion_report $fusions_file --max_alignments_per_fusion $max_IGV_LR_per_fusion > $LR_selected_sam";
    $pipeliner->add_commands(new Command($cmd, "IGV_select_max_LR_per_fusion.ok"));

    $cmd = "samtools view -Sb $LR_selected_sam -o $igv_prep_dir/igv.LR.bam && samtools sort $igv_prep_dir/igv.LR.bam -o $igv_prep_dir/igv.LR.sorted.bam && samtools index $igv_prep_dir/igv.LR.sorted.bam";
    $pipeliner->add_commands(new Command($cmd, "igv.LR-FI.mm2.bam.ok"));


    ## get the FI short read alignment evidence if it exists.
    my $FI_junction_sam = "$output_directory/FI/fi_workdir/finspector.star.cSorted.dupsMarked.bam.fusion_junc_reads.sam";
    if (-e $FI_junction_sam) {
        $cmd = "samtools view -Sb $FI_junction_sam -T $FI_contigs_file -o $igv_prep_dir/igv.illumina.junction_reads.bam && samtools sort $igv_prep_dir/igv.illumina.junction_reads.bam -o $igv_prep_dir/igv.illumina.junction_reads.sorted.bam && samtools index $igv_prep_dir/igv.illumina.junction_reads.sorted.bam && rm -f $igv_prep_dir/igv.illumina.junction_reads.bam";
        $pipeliner->add_commands(new Command($cmd, "igv.illumina.junction_reads.ok"));
    }

    my $FI_spanning_sam = "$output_directory/FI/fi_workdir/finspector.star.cSorted.dupsMarked.bam.fusion_span_reads.sam";
    if (-e $FI_spanning_sam) {
        $cmd = "samtools view -Sb $FI_spanning_sam -T $FI_contigs_file -o $igv_prep_dir/igv.illumina.spanning_frags.bam && samtools sort $igv_prep_dir/igv.illumina.spanning_frags.bam -o $igv_prep_dir/igv.illumina.spanning_frags.sorted.bam && samtools index $igv_prep_dir/igv.illumina.spanning_frags.sorted.bam && rm -f $igv_prep_dir/igv.illumina.spanning_frags.bam ";
        $pipeliner->add_commands(new Command($cmd, "igv.illumina.spanning_frags.ok"));
    }



    ########## Pfam Matches
    ## add pfam matches in genome coordinates:

    my $pfam_igv_gff3_filename = "$igv_prep_dir/igv.pfam.gff3";

    $cmd = "$FI_UTILDIR/get_pfam_domain_info.pl "
        . " --finspector_gtf $FI_annots_gtf "
        . " --genome_lib_dir $genome_lib_dir "
        . " > $pfam_igv_gff3_filename" ;

    $pipeliner->add_commands(new Command($cmd, "prep_igv_pfam_gff3.ok"));

    ## must convert to bed for viewing
    my $pfam_igv_bed_filename = "$igv_prep_dir/igv.pfam.bed";

    $cmd = "$FI_UTILDIR/transcript_gff3_to_bed.pl $pfam_igv_gff3_filename > $pfam_igv_bed_filename";

    $pipeliner->add_commands(new Command($cmd, "prep_igv_pfam_bed.ok"));

    ######## Seq Similar Regions
    ## add seq-similar region info
    my $seqsimilar_igv_gff3_filename = "$igv_prep_dir/igv.seqsimilar.gff3";

    $cmd = "$FI_UTILDIR/get_seq_similar_region_FI_coordinates.pl "
        . " --finspector_gtf $FI_annots_gtf "
        . " --genome_lib_dir $genome_lib_dir "
        . " > $seqsimilar_igv_gff3_filename ";

    $pipeliner->add_commands(new Command($cmd, "prep_igv_seqsim_gff3.ok") );

    ## must convert to bed for viewing
    my $seqsimilar_igv_bed_filename = "$igv_prep_dir/igv.seqsimilar.bed";

    $cmd = "$FI_UTILDIR/transcript_gff3_to_bed.pl $seqsimilar_igv_gff3_filename > $seqsimilar_igv_bed_filename";

    $pipeliner->add_commands(new Command($cmd, "prep_igv_seqsim_bed.ok") );


    ################################
    ## generate the fusion json table

    my $fusions_json_file = "$igv_prep_dir/igv.fusion_inspector_web.json";
    my $roi_outfile = "$igv_prep_dir/igv.LR.breakoint.roi.bed";

    $cmd = "$UTILDIR/create_ctat-LR-fusion_inspector_igvjs.py"
        . " --fusion_inspector_directory $output_directory "
        . " --json_outfile $fusions_json_file"
        . " --roi_outfile $roi_outfile"
        . " --file_prefix ctat-LR-fusion";


    $pipeliner->add_commands(new Command($cmd, "ctat-LR-fusion_report_json.ok") );

    ###############
    # make the html
    # (the earlier fusion-reports/create_fusion_report.py approach is
    #  deprecated; the page is now produced with igv-reports' create_report)

    # Use the track configuration that includes the Illumina tracks only when
    # both short-read evidence files are present.
    my $tracks_json_file = "$UTILDIR/fusion_report_html_template/tracks.json";
    if ( -e $FI_junction_sam  && -e $FI_spanning_sam ) {
        $tracks_json_file = "$UTILDIR/fusion_report_html_template/tracks.wIllumina.json";
    }

    my $tracks_json_cp_file = "$igv_prep_dir/tracks.json";
    $cmd = "cp $tracks_json_file $tracks_json_cp_file";
    $pipeliner->add_commands(new Command($cmd, "copy_tracks_json.ok"));

    # create_report is run from inside the prep dir so that the relative
    # igv.genome.fa argument resolves to the symlink created above.
    $cmd = "cd $igv_prep_dir && "
        . " create_report "
        . " $fusions_json_file "
        . " igv.genome.fa "
        . " --type fusion "
        . " --track-config $tracks_json_cp_file "
        . " --output $output_directory/ctat-LR-fusion.fusion_inspector_web.html";


    $pipeliner->add_commands(new Command($cmd, "fusion_igv_reports_html.ok"));

    return;
}

####
# Read the total long-read count from a one-line text file.
#
# Parameter:
#   $total_read_count_filename  path of a file whose first line holds a
#                               non-negative integer
#
# Returns the count. Leading/trailing whitespace (including a CR from
# Windows-style line endings) around the number is tolerated.
# Dies if the file is missing, unreadable, empty, or its first line is not an
# integer.
sub get_total_read_count {
    my ($total_read_count_filename) = @_;

    unless (-e $total_read_count_filename) {
        die "Error, cannot locate file: $total_read_count_filename";
    }

    # three-argument open: the file name is never interpreted as an open mode
    open(my $fh, '<', $total_read_count_filename)
        or die "Error, cannot open file: $total_read_count_filename ($!)";
    my $total_read_count = <$fh>;
    close($fh);

    # an empty file yields undef; report that explicitly instead of tripping
    # uninitialized-value warnings below
    unless (defined $total_read_count) {
        die "Error, no read count found in empty file: $total_read_count_filename";
    }

    $total_read_count =~ s/^\s+//;
    $total_read_count =~ s/\s+$//;

    unless ($total_read_count =~ /^\d+$/) {
        die "Error, not interpreting total read count [$total_read_count] as integer value";
    }

    return($total_read_count);
}


####
# Count the fusion entries in a listing file.
#
# Parameter:
#   $fusions_filename  path of the text file to scan
#
# Returns the number of lines that do not start with '#' and contain at least
# one word character (so header/comment lines and blank lines are not counted).
# Dies if the file cannot be opened.
sub count_num_fusions {
    my ($fusions_filename) = @_;

    # three-argument open: the file name is taken literally, so names with
    # surrounding whitespace or leading mode characters are handled correctly
    open(my $fh, '<', $fusions_filename)
        or die "Error, cannot open file: $fusions_filename ($!)";

    my $count = 0;
    while (my $line = <$fh>) {
        next if $line =~ /^\#/;
        $count++ if $line =~ /\w/;
    }
    close($fh);

    return($count);
}
