#!/usr/bin/perl

#==============================================================
# Script Name: Assign_D4Z4_Reads.pl
# Description: This Perl script processes a BAM file, performs
#              ssecondary mapping, and outputs the results.
# Author: Bob Weiss
# Date: July 26, 2023
#==============================================================

=head1

07-2023
This is the script for classifying D4Z4-targeted ONT reads:
1. mapping to chm13v2.0_HG002-1-JAHKSE010000016.mmi with minimap2 2.24-r1122
2. extracting chr4qA, 4qB and chr 10 reads from BAM file
3. classifying reads by chromosome, location, and size

=cut

=head1 DESCRIPTION

This script assigns chromosome, location, and size to Oxford Nanopore reads targeted to the chr4q and chr10q D4Z4 regions.

=head1 REQUIREMENTS

=head2 Perl Version

This script requires Perl 5.16 or higher.

=head2 Operating System Compatibility

Tested and expected to work on Linux.

=head2 External Perl Modules

The following external Perl modules are required. You can install them using CPAN or your preferred package manager.

=over 4

=item * POSIX

=item * Cwd

=item Getopt::Std

=back

=head2 Third-Party Software Dependencies

This script relies on the following external software:

=over 4

=item * samtools

=item * cross_match 

=back

=head2 Input Data Format

The input data should be a BAM file generated from minimap2 alignment to the T2T CHM13v2.0/hs1 reference genome, that includes the HG002-1-JAHKSE010000016_1-200000_rev.fasta contig  (200kb contig that includes chr4qB D4Z4 array) 

=head2 Output Data Format

The script generates BAM, fasta, and text files with D4Z4-classified reads 

=head1 CONFIGURATION

No specific configuration file is required for this script.

=head1 USAGE

Usage: Assign_D4Z4_Reads.pl -f BAM_INPUT -d Output_Directory -o OutputFile_prefix

=head1 EXAMPLE

Assign_D4Z4_Reads.pl -f Test_D4Z4-targeted-reads_SUP_5mC_chm13v2.0_HG002-1-JAHKSE010000016.sort.bam -d Output_Test_D4Z4-reads -o Output_Test_D4Z4-targeted-reads

=head1 ENVIRONMENT VARIABLES

No specific environment variables are used in this script.

=head1 PERMISSIONS

The script requires read access to the input BAM file and write access to the directory where the output files will be saved.

=head1 DATABASE REQUIREMENTS

This script does not interact with any databases.

=head1 LICENSE

This script is distributed under the MIT License.

=cut

########### Check for executable external software:

my $command1 = 'cross_match';  # http://www.phrap.org/consed/consed.html#howToGet
my $exit_code1 = system("which $command1 > /dev/null 2>&1");
if ($exit_code1 == 0) {
    print "The program '$command1' is executable and found in the system's PATH.\n";
} else {
    print "The program '$command1' is either not executable or not in the system's PATH.\n";
}

my $command2 = 'samtools';  # http://www.htslib.org/download/
my $exit_code2 = system("which $command2 > /dev/null 2>&1");
if ($exit_code2 == 0) {
    print "The program '$command2' is executable and found in the system's PATH.\n";
} else {
    print "The program '$command2' is either not executable or not in the system's PATH.\n";
}


############

use POSIX qw/strftime/;
use Cwd;
use Getopt::Std;
getopt('f:d:o:');

unless (defined $opt_o && defined $opt_d && defined $opt_f) {
    die "Usage: $0 -f Test_D4Z4-targeted-reads_SUP_5mC_chm13v2.0_HG002-1-JAHKSE010000016.sort.bam -d Output_Test_D4Z4-reads -o Output_Test_D4Z4-targeted-reads \n",
	"-f = input BAM file\n",
	"-d = output directory\n",
	"-o = output files prefix\n",
	"Run this script from a directory that contains the DATA_FILES directory or create a path to fasta files in the DATA_FILES directory\n";
}

$| = 1;

my $BAM = $opt_f;
my $OUT_DIR = $opt_d;
my $PRE = $opt_o;

my $MAIN_DIR = cwd();

my $CONS_DIR = "$MAIN_DIR/DATA_FILES";
mkdir($OUT_DIR,0755) unless (-d $OUT_DIR);
system("cp $BAM $OUT_DIR");

chdir($OUT_DIR);
system("samtools index $BAM");

my $date = strftime('%d-%b-%Y',localtime);
my $time1 = strftime('%H:%M:%S',localtime);

my $LOG = $PRE . '_AssignHaplotype_to_D4Z4_Reads.log';

open(LOG,">$LOG") or die $!;
print LOG "$0 -f $BAM -d OUT_DIR -o $PRE \ndate = $date\ntime start = $time1\n";

################################# FIND 3.3kb reads and generate fast5 lists 

my $BAM4qA = $PRE . '.chr4qA.bam';
my $BAM4qB = $PRE . '.chr4qB.bam';
my $BAM10 = $PRE . '.chr10.bam';

my $BAM4qA_fa = $PRE . '.chr4qA.fasta';
my $BAM4qB_fa = $PRE . '.chr4qB.fasta';
my $BAM10_fa = $PRE . '.chr10.fasta';

my $BAM4qA_size = $PRE . '.chr4qA.size.txt';
my $BAM4qB_size = $PRE . '.chr4qB.size.txt';
my $BAM10_size = $PRE . '.chr10.size.txt';

##### 3.3kb chr-specific D4Z4 KpnI fasta, p13_E-11/pLAM query fasta, and D4Z4 array flanking fasta 

my $FASTA_BE = "$CONS_DIR/ALL_D4Z4-BE_03-19-23_cons.fasta";
my $FASTA_FLANK2 = "$CONS_DIR/ALL_Chr4-10_proximal_distal_3353.fasta";
my $FASTA_p13_pLAM = "$CONS_DIR/p13-pLAM-D4Z4.fasta";
my $FASTA_FLANK1 = "$CONS_DIR/chr4_ALL_flank_10kb.fasta";

##### extract reads from minimap2 BAM file, entire D4Z4 arrays on chr4 and chr10
system("samtools view -h $BAM chr4:193000001-198000000 -o $BAM4qA");
system("samtools index $BAM4qA");
system("samtools fasta $BAM4qA > $BAM4qA_fa");
system("fasta_sizes2.pl $BAM4qA_fa > $BAM4qA_size");

system("samtools view -h $BAM HG002-1-JAHKSE010000016_1-200000:1-200000 -o $BAM4qB");
system("samtools index $BAM4qB");
system("samtools fasta $BAM4qB > $BAM4qB_fa");
system("fasta_sizes2.pl $BAM4qB_fa > $BAM4qB_size");

system("samtools view -h $BAM chr10:134510001-145000000 -o $BAM10");
system("samtools index $BAM10");
system("samtools fasta $BAM10 > $BAM10_fa");
system("fasta_sizes2.pl $BAM10_fa > $BAM10_size");


##### Load FASTA 
my ($SEQ4qA_ref,$SIZE4qA_ref) = fasta_size_load($BAM4qA_fa,$BAM4qA_size);
my %SEQ4qA = %$SEQ4qA_ref;
my %SIZE4qA = %$SIZE4qA_ref;

my ($SEQ4qB_ref,$SIZE4qB_ref) = fasta_size_load($BAM4qB_fa,$BAM4qB_size);
my %SEQ4qB = %$SEQ4qB_ref;
my %SIZE4qB = %$SIZE4qB_ref;
    
my ($SEQ10_ref,$SIZE10_ref) = fasta_size_load($BAM10_fa,$BAM10_size);
my %SEQ10 = %$SEQ10_ref;
my %SIZE10 = %$SIZE10_ref;


#### Classify reads as TYPE1 = p13_pLAM targeted reads, TYPE2 = D4Z4 array flanking, TYPE3 = 3.3 kb D4Z4-cut_3353BE targeted reads
my $CM_SWITCH = 'ON';

type_READS($PRE,$BAM4qA_fa,'4qA',\%SIZE4qA,\%SEQ4qA);
type_READS($PRE,$BAM4qB_fa,'4qB',\%SIZE4qB,\%SEQ4qB);
type_READS($PRE,$BAM10_fa,'10',\%SIZE10,\%SEQ10);

   
close LOG;


##################################################################
###     SUBROUTINES
##################################################################

#### classify chr4 and chr10 reads

sub type_READS {
    my $name = shift;
    my $fasta1 = shift;
    my $chr = shift;
    my $size_ref = shift;
    my $fasta_ref = shift;
    my %size = %$size_ref;
    my %fasta = %$fasta_ref;

    #### find p13-E11 and pLAM targeted reads
    my $cm1 = $name . '_' . $chr . '_p13-pLAM-RU3.cm';
    system("cross_match -tags -masklevel 101 -gap_ext 0 -minscore 100 $fasta1 $FASTA_p13_pLAM > $cm1") if ($CM_SWITCH eq 'ON');
    my @cm1 = `grep ALIGN $cm1`;
    my (%reads,%READ_CM,$ct1,%CM_CT1,%TYPE1,%RANGE_RU);
    $RANGE_RU{'na'}++;
    foreach (@cm1) { #### p13_pLAM 
	chomp;
	my @tmp = split;
	my $read = $tmp[5];
	my $SDI = $tmp[2] + $tmp[3] + $tmp[4];
	$CM_CT1{$read}++;
	if (/D4Z4/) { ### Count full-length RU3
	    my $len = $tmp[11] - $tmp[10] + 1;
	    $len = $tmp[12] - $tmp[13] + 1 if ($tmp[9] eq 'C');
	    next unless ($len > 3100);
	    $reads{$read}++;
	    $RANGE_RU{$tmp[5]}{$_} = $reads{$read} foreach ($tmp[6] .. $tmp[7]);
	}
	if ($CM_CT1{$read} == 1) { ### Look at first match for p13 or pLAM
	    if ($tmp[6] < 200 && $tmp[9] eq 'chr4_p13E-11' && $tmp[10] < 60 && $SDI < 30) {
		$TYPE1{$read} = 'p13';
	    }
	    elsif ($tmp[6] < 200 && $tmp[9] eq 'C' && $tmp[10] eq 'chr10_pLAM' && $tmp[13] < 60 && $SDI < 30) {
		$TYPE1{$read} = 'pLAM';
	    }
	}
	push(@{$READ_CM{$read}},$_);
    }
    foreach my $read (sort keys %CM_CT1) {
	my @cm8 = @{$READ_CM{$read}};
	my @last = split /\s+/, $cm8[$#cm8];
	my $last_end = $& if ($last[8] =~ /\d+/);
	if ($TYPE1{$read} eq 'p13') {
	    if ($last[9] eq 'chr10_pLAM' && $last_end < 75) {
		$TYPE1{$read} = 'p13_pLAM' . '_' . $reads{$read} . 'U';
	    }
	    else {
		$TYPE1{$read} = 'p13' . '_' . $reads{$read} . 'U';
	    }
	}
	elsif ($TYPE1{$read} eq 'pLAM') {
	    if ($last[10] eq 'chr4_p13E-11' && $last_end < 75) {
		$TYPE1{$read} = 'pLAM_p13' . '_' . $reads{$read} . 'U';
	    }
	    else {
		$TYPE1{$read} = 'pLAM' . '_' . $reads{$read} . 'U';
	    }
	}
	elsif ($reads{$read} > 2) {
	    if ($last[10] eq 'chr4_p13E-11' && $last_end < 75) {
		$TYPE1{$read} = 'pLAM_p13_trunc' . '_' . $reads{$read} . 'U';
	    }
	    elsif ($last[9] eq 'chr10_pLAM' && $last_end < 75) {
		$TYPE1{$read} = 'p13_pLAM_trunc' . '_' . $reads{$read} . 'U';
	    }
	}
    }
  
    #### find reads in the flanking direction from the D4Z4 arrays
    my $cm2A = $name . '_' . $chr . '_FLANK.cm';
    my $cm2B = $name . '_' . $chr . '_3353-FLANK.cm';
    system("cross_match -tags -gap_ext 0 -minscore 2500 $fasta1 $FASTA_FLANK1 > $cm2A") if ($CM_SWITCH eq 'ON');
    system("cross_match -tags -gap_ext 0 -minscore 2500 $fasta1 $FASTA_FLANK2 > $cm2B") if ($CM_SWITCH eq 'ON');
    my @cm2 = `grep ALIGN $cm2A`;
    my @distal = `grep distal $cm2B | grep ALIGN`;
    my @proximal = `grep proximal $cm2B | grep ALIGN`;
    my (%CM_CT2,%TYPE2);
    foreach (@cm2) { #### _FLANK
	chomp;
	my @tmp = split;
	my $read = $tmp[5];
	$CM_CT2{$read}++;
	if ($CM_CT2{$read} == 1) { ### Look at first match for p13 or cen or telo match
	    if ($tmp[12] > 9900 && $tmp[9] eq 'C' && $tmp[10] eq 'chr4qA_cen_10kb') {
		$TYPE2{$read} = 'centromeric';
	    }
	    elsif ($tmp[10] < 100 && $tmp[9] =~ /telo/) {
		$TYPE2{$read} = 'telomeric';
	    }
	}
    }

    my %RANGE_DIST;
    $RANGE_DIST{'na'}++;
    my %RANGE_PROX;
    $RANGE_PROX{'na'}++;
    my %READS2;
    $READS2{'na'}++;
    foreach (@distal) {	    
	chomp;
	my @cm = split;
	next if ($cm[9] eq 'C');
	my $dist_end = $cm[6] + 3000;
	$RANGE_DIST{$chr}{$cm[5]}{$_}++ foreach ($cm[6] .. $dist_end);
	$READS2{$cm[5]} = $cm[9];
    }
    foreach (@proximal) {
	chomp;
	my @cm = split;
	next unless ($cm[9] eq 'C');
	my $prox_end = $cm[6] + 1700;
	$RANGE_PROX{$chr}{$cm[5]}{$_}++ foreach ($cm[6] .. $prox_end);
	$READS2{$cm[5]} = $cm[10];
    }

    
    #### find 3.3 kb D4Z4-cut reads (CD.Cas9.YQFF3353.BE guide RNA, 5'-TTCCTCCGGGACAAAAGACC-3')  

    my $cm3 = $name . '_' . $chr . '_3353BE.cm';
    system("cross_match -tags -gap_ext 0 -minscore 1500 $fasta1 $FASTA_BE > $cm3") if ($CM_SWITCH eq 'ON');
    my @cm3 = `grep ALIGN $cm3`;
    my (%CM_CT3,%TYPE3);
    foreach (@cm3) {
	chomp;
	my @tmp = split;
	my $read = $tmp[5];
	$CM_CT3{$read}++;
	if ($CM_CT3{$read} == 1) { ### Look at first match for p13 or cen or telo match
	    my $last_end = $& if ($tmp[8] =~ /\d+/);
	    if ($tmp[6] <= 45 && $tmp[7] >= 3250 && $last_end <= 45) {
		$TYPE3{$read} = '3353BE';
	    }
	}
    }

    #### output summary files for read types:
    
    my $type1 = $name . '_' . $chr . '_p13-pLAM-D4Z4.reads.txt';
    my $type2A = $name . '_' . $chr . '_FLANK.reads.txt';
    my $type2B = $name . '_' . $chr . '_3353-FLANK.reads.txt';
    my $type3 = $name . '_' . $chr . '_3353BE.reads.txt';
    my $type4 = $name . '_' . $chr . '_Unassigned.reads.txt';

    my $type3_fa = $name . '_' . $chr . '_3353BE.reads.fasta';
    open(TYPE3_FA,">$type3_fa") or die $! or die $!;
    foreach my $read (keys %TYPE3) {
	print TYPE3_FA ">$read\n$fasta{$read}\n";
    }
    close TYPE3_FA;
    
    open(TYPE1,">$type1") or die $!;
    print TYPE1 "ReadName\tOrientation_D4Z4-count\tsize_bp\n";
    open(TYPE2A,">$type2A") or die $!;
    print TYPE2A "ReadName\tOrientation_D4Z4-count\tsize_bp\n";
    open(TYPE2B,">$type2B") or die $!;
    print TYPE2B "ReadName\tOrientation_D4Z4-count\tsize_bp\n";
    open(TYPE3,">$type3") or die $!;
    print TYPE3 "ReadName\tReadType\tsize_bp\n";
    open(TYPE4,">$type4") or die $!;
    print TYPE4 "ReadName\tReadType\tsize_bp\n";

    my ($TYPE1_CT,$TYPE2_CT,$TYPE3_CT,$TYPE4_CT);
    
    foreach my $read (sort {$size{$b} <=> $size{$a}} keys %size) {
	if ($TYPE1{$read}) { ## p13_pLAM
	    print TYPE1 "$read\t$TYPE1{$read}\t$size{$read}\n";
	    $TYPE1_CT++;
	}
	elsif ($READS2{$read}) { ## FLANKS
	    print TYPE2B "$read\t$READS2{$read}\t$size{$read}\n";
	    $TYPE2_CT++;
	}
	elsif ($TYPE2{$read}) { ## FLANKS
	    print TYPE2A "$read\t$TYPE2{$read}\t$size{$read}\n";
	    $TYPE2_CT++;
	}
	elsif ($TYPE3{$read}) { ## 3353
	    print TYPE3 "$read\t$TYPE3{$read}\t$size{$read}\n";
	    $TYPE3_CT++;
	}
	else {
	    print TYPE4 "$read\tna\t$size{$read}\n";
	    $TYPE4_CT++;
	}
    }
    close TYPE1;
    close TYPE2A;
    close TYPE2B;
    close TYPE3;
    close TYPE4;	

    print LOG "Read totals chr$chr:\n",
	"\tp13-pLAM-D4Z4 read count = $TYPE1_CT; output file = $type1, \n",
	"\tp13-pLAM-FLANK read count = $TYPE2_CT; output files = $type2A\n",
	"\tD4Z4-cut 3.3 kb read count = $TYPE3_CT; output files = $type3, $type3_fa\n",
	"\tunassigned read count = $TYPE4_CT; output files = $type4\n\n";
}


################################# Import FASTA file to hash
sub fasta_load {
    my $file = shift;
    my %fasta;
    my $NAME;
    open(IN,$file) or die "Cannot open $file in fasta_load $!\n";
    while(<IN>) {
	chomp;
	if (/^>(\S+)/) {
	    $NAME = $1;
	}
	else {
	    $fasta{$NAME} .= $_;
	}
    }
    close IN;
    return(\%fasta);
}

#################################
sub fasta_size_load {
    my $file1 = shift;
    my (%fasta,%size);
    my $NAME;
    open(IN,$file1) or die "Cannot open $file1 in fasta_size_load $!\n";
    while(<IN>) {
	chomp;
	if (/^>(\S+)/) {
	    $NAME = $1;
	}
	else {
	    $fasta{$NAME} .= $_;
	}
    }
    close IN;
    foreach my $name (sort keys %fasta) {
	$size{$name} = length($fasta{$name});
    }
    return(\%fasta,\%size);
}
