#!/usr/bin/perl
use strict;

# Input/Output Files
# Positional arguments, in order: random tags file, FASTQ file #1, FASTQ file #2,
# SAM alignment file (all four must be existing regular files) and the name of
# the output file. Missing arguments interpolate to the empty string.
my $random_tags_in="$ARGV[0]";
my $fastq_R1_in="$ARGV[1]";
my $fastq_R2_in="$ARGV[2]";
my $alignments_in="$ARGV[3]";
my $alignments_out="$ARGV[4]";

# Argument problems are reported through die() so that the message goes to
# STDERR and the script ends with a non-zero exit status (a calling pipeline
# can then detect the failure instead of continuing with no output).
if(! -f "$random_tags_in"){ die "Random tags file not found.\n"; }
if(! -f "$fastq_R1_in"){ die "Fastq file #1 not found.\n"; }
if(! -f "$fastq_R2_in"){ die "Fastq file #2 not found.\n"; }
if(! -f "$alignments_in"){ die "SAM alignment results not found.\n"; }
if($alignments_out eq ""){ die "Output file name is required.\n"; }

# List of Valid Chromosomes
# Lookup table: chromosome name -> "1" for the sixteen accepted names
# (chrI .. chrXVI). Any name absent from this hash is treated as invalid.
print "\n";
my %valid_chr=map { $_ => "1" } qw(
  chrI   chrII  chrIII chrIV   chrV   chrVI  chrVII chrVIII
  chrIX  chrX   chrXI  chrXII  chrXIII chrXIV chrXV  chrXVI
);

# Opening Input/Output Files
# Three-argument open() keeps the file name from being interpreted as an open
# mode, and every open is checked so that an unreadable input or an unwritable
# output stops the script instead of silently yielding empty results.
open(TAGS,"<",$random_tags_in) or die "Cannot open random tags file '$random_tags_in': $!\n";
open(FASTQR1,"<",$fastq_R1_in) or die "Cannot open fastq file #1 '$fastq_R1_in': $!\n";
open(FASTQR2,"<",$fastq_R2_in) or die "Cannot open fastq file #2 '$fastq_R2_in': $!\n";
open(OUTALN,">",$alignments_out) or die "Cannot create output file '$alignments_out': $!\n";

# Batch Settings & Global Counters
my $max_batch_size=10000000;  # maximum number of reads loaded per batch
my $processing_done=0;        # set to 1 once a batch comes back empty
my $num_reads_in=0;           # reads loaded from the tag/FASTQ files
my $num_reads_out=0;          # reads with at least one printed alignment
my $num_reads_single=0;       # reads with exactly one printed alignment
my $num_reads_multi=0;        # reads with several printed alignments
my $num_mapping_out=0;        # total number of printed alignments

# Main Batch Loop
# Reads are processed in batches of at most $max_batch_size entries. For each
# batch: (1) read data is loaded from the tag and FASTQ files, (2) the whole SAM
# file is scanned for alignments of the reads in the batch, (3) the retained
# R1/R2 mapping pairs are checked and written to the output file. The loop ends
# when a batch comes back empty. Any inconsistency in the input data stops the
# script through die() (message on STDERR, non-zero exit status).

# Initializing READ Dataset For Current Batch
while($processing_done==0){
  my $num_READ=0;
  my %READ_index;   # read identifier -> index in @READ_basics / @READ_matchs
  my @READ_basics;  # "id \t tag \t R1_seq \t R1_len \t R2_seq \t R2_len"
  my @READ_matchs;  # "num_matchs \t R1 mappings \t R2 mappings", mappings are
                    # ";"-separated "chr,strand,pos,cigar,len,mate_pos" entries

  # Loading Next Read Data in TAG + FASTQ Files
  # (the size test comes first so that no tag line is consumed once the batch
  # is full; defined() makes the end-of-file test explicit)
  while(($num_READ<$max_batch_size)&&(defined(my $tag_line=<TAGS>))){
    chomp($tag_line);
    my @TAG=split("\t",$tag_line);
    my $read_id=$TAG[0];
    my $random_tag=$TAG[1];

    # Four lines are consumed per read in each FASTQ file: the header, the
    # sequence, and two further lines that are read and discarded.
    my $header1=<FASTQR1>;
    my $header2=<FASTQR2>;
    my @HEADER1=split(" ",$header1);
    my @HEADER2=split(" ",$header2);
    my $R1_seq=<FASTQR1>; chomp($R1_seq);
    my $R2_seq=<FASTQR2>; chomp($R2_seq);
    my $R1_len=length($R1_seq);
    my $R2_len=length($R2_seq);
    my $next=<FASTQR1>; $next=<FASTQR1>;
    $next=<FASTQR2>; $next=<FASTQR2>;

    # Checking Retrieved Data & Updating READ Dataset
    # (duplicated identifiers are only detected within the current batch)
    if($#TAG!=1){ die "Inconsistent tag file format.\n"; }
    if($READ_index{"$read_id"} ne ""){ die "Duplicated read identifiers.\n"; }
    if($random_tag!~/^[ACGT]{8}$/){ die "Inconsistent tag file format.\n"; }
    # The first word of each FASTQ header, minus its leading character, must
    # be the read identifier found in the tag file.
    if((substr($HEADER1[0],1) ne "$read_id")||(substr($HEADER2[0],1) ne "$read_id")){
      die "Mismatch between fastq and tag files, check script arguments.\n"; }
    $READ_basics[$num_READ]="$read_id\t$random_tag\t$R1_seq\t$R1_len\t$R2_seq\t$R2_len";
    $READ_index{"$read_id"}="$num_READ";
    $READ_matchs[$num_READ]="0\t\t";
    $num_READ++;
  }

  # Parsing SAM Input File & Selecting Viable Entries
  if($num_READ>0){
    $num_reads_in+=$num_READ;
    # The SAM file is re-read from the start for every batch.
    open(IN,"<",$alignments_in) or die "Cannot open SAM alignment results '$alignments_in': $!\n";
    while(my $sam_line=<IN>){
      next if($sam_line=~/^@/);  # header lines
      chomp($sam_line);
      my @SAM=split("\t",$sam_line);
      my $read_id=$SAM[0];
      my $read_index=$READ_index{"$read_id"};

      # Keeping only reads of the current batch, aligned on a valid chromosome,
      # whose mate is reported on the same chromosome.
      next if(($read_index eq "")||($valid_chr{"$SAM[2]"} eq ""));
      next if(($SAM[6] ne "=")&&($SAM[6] ne $SAM[2]));

      # Extracting Detailed Mapping Results For Selected Entry
      my $read_chr=$SAM[2]; my $read_pos=$SAM[3]; my $read_cig=$SAM[5];
      my $read_len=length($SAM[10]);
      my $mate_pos=$SAM[7];
      # Decoding the SAM flag: 128 = second read of the pair, 64 = first read
      # of the pair, 16 = reverse strand. All other bits are ignored.
      my $flag=int($SAM[1]);
      my $read_num=""; my $read_str="+";
      if($flag & 128){ $read_num.="R2"; }
      if($flag & 64){ $read_num.="R1"; }
      if($flag & 16){ $read_str="-"; }
      if(($read_pos!~/^[0-9]+$/)||($read_pos==0)||($mate_pos!~/^[0-9]+$/)||($mate_pos==0)){
        die "Inconsistent mapping positions : $read_pos / $mate_pos\n"; }
      # Exactly one of the R1/R2 bits must be set.
      if($read_num!~/^R[12]$/){ die "Inconsistent SAM file format\n"; }
      my $mapping="$read_chr,$read_str,$read_pos,$read_cig,$read_len,$mate_pos";

      # Retrieving Existing Data For Corresponding Read
      # %missing_entries lists the match slots holding only one mate, keyed by
      # the description of the mate that would complete them:
      # "read number,chromosome,position,mate position".
      my @MATCHS=split("\t",$READ_matchs[$read_index]);
      my $num_matchs=$MATCHS[0]; my %missing_entries;
      my @MATCHS_R1=split(";",$MATCHS[1]);
      my @MATCHS_R2=split(";",$MATCHS[2]);
      for(my $match=0;$match<$num_matchs;$match++){
        if(($MATCHS_R1[$match] ne "")&&($MATCHS_R2[$match] eq "")){
          my @INFO=split(",",$MATCHS_R1[$match]);
          $missing_entries{"R2,$INFO[0],$INFO[5],$INFO[2]"}="$match"; }
        elsif(($MATCHS_R1[$match] eq "")&&($MATCHS_R2[$match] ne "")){
          my @INFO=split(",",$MATCHS_R2[$match]);
          $missing_entries{"R1,$INFO[0],$INFO[5],$INFO[2]"}="$match"; }
      }

      # Adding Mapping Results to Existing Data
      # (the mapping completes an existing slot when possible, otherwise it
      # opens a new one)
      my $match_index=$missing_entries{"$read_num,$read_chr,$read_pos,$mate_pos"};
      if($match_index eq ""){ $match_index=$num_matchs; $num_matchs++; }
      if($read_num eq "R1"){ $MATCHS_R1[$match_index]="$mapping"; }
      else{ $MATCHS_R2[$match_index]="$mapping"; }
      $READ_matchs[$read_index]="$num_matchs\t".join(";",@MATCHS_R1)."\t".join(";",@MATCHS_R2);
    }
    close(IN);

    # Retrieving All Extracted Data For Current Read
    for(my $read_index=0;$read_index<$num_READ;$read_index++){
      my @BASICS=split("\t",$READ_basics[$read_index]);
      my @MATCHS=split("\t",$READ_matchs[$read_index]);
      my $num_MATCHS=$MATCHS[0]; my $num_printed=0;
      my @MATCHS_R1=split(";",$MATCHS[1]);
      my @MATCHS_R2=split(";",$MATCHS[2]);
      for(my $match_index=0;$match_index<$num_MATCHS;$match_index++){
        my @INFO_R1=split(",",$MATCHS_R1[$match_index]);
        my @INFO_R2=split(",",$MATCHS_R2[$match_index]);

        # Fixing Missing Entries For Multiple Hits Full Overlap
        # When only one mate is stored and its position equals its mate
        # position, the missing mate is rebuilt at the same position, on the
        # opposite strand, as a full-length match of the FASTQ read length.
        if(($MATCHS_R1[$match_index] ne "")&&($MATCHS_R2[$match_index] eq "")){
          if($INFO_R1[2]==$INFO_R1[5]){
            my $strand="+"; if($INFO_R1[1] eq "+"){ $strand="-"; }
            my $match="$INFO_R1[0],$strand,$INFO_R1[2],$BASICS[5]M,$BASICS[5],$INFO_R1[2]";
            $MATCHS_R2[$match_index]="$match"; @INFO_R2=split(",",$match); } }
        # Mirror case (R1 missing). The test uses "&&": with "||" an existing
        # R1 mapping was replaced by a rebuilt one whenever R2 sat at its mate
        # position, which discarded the real R1 CIGAR.
        if(($MATCHS_R1[$match_index] eq "")&&($MATCHS_R2[$match_index] ne "")){
          if($INFO_R2[2]==$INFO_R2[5]){
            my $strand="+"; if($INFO_R2[1] eq "+"){ $strand="-"; }
            my $match="$INFO_R2[0],$strand,$INFO_R2[2],$BASICS[3]M,$BASICS[3],$INFO_R2[2]";
            $MATCHS_R1[$match_index]="$match"; @INFO_R1=split(",",$match); } }

        # Only complete R1/R2 pairs are considered for output.
        if(($MATCHS_R1[$match_index] ne "")&&($MATCHS_R2[$match_index] ne "")){

          # Checking Mapping Results For Current Read
          # (the number in each message identifies the failed check)
          if(($valid_chr{"$INFO_R1[0]"} eq "")||($INFO_R1[0] ne $INFO_R2[0])){
            die "Failed to extract consistent alignment results  1  $BASICS[0]\n"; }
          if(($INFO_R1[1] ne "+")&&($INFO_R1[1] ne "-")){
            die "Failed to extract consistent alignment results  2  $BASICS[0]\n"; }
          if(($INFO_R2[1] ne "+")&&($INFO_R2[1] ne "-")){
            die "Failed to extract consistent alignment results  3  $BASICS[0]\n"; }
          if(($INFO_R1[2]!~/^[0-9]+$/)||($INFO_R2[2]!~/^[0-9]+$/)){
            die "Failed to extract consistent alignment results  4  $BASICS[0]\n"; }
          if(($INFO_R1[5]!~/^[0-9]+$/)||($INFO_R2[5]!~/^[0-9]+$/)){
            die "Failed to extract consistent alignment results  5  $BASICS[0]\n"; }
          # Each mate must point at the position of the other one.
          if(($INFO_R1[2]!=$INFO_R2[5])||($INFO_R1[5]!=$INFO_R2[2])){
            die "Failed to extract consistent alignment results  6  $BASICS[0]\n"; }

          # Checking Mapping CIGARS & Removing Incomplete Mapping Results
          # A "*" CIGAR on mates sharing the same position is replaced by a
          # full-length match built from the stored sequence length.
          if($INFO_R1[2]==$INFO_R2[2]){
            if($INFO_R1[3] eq "*"){ $INFO_R1[3]="$INFO_R1[4]M"; }
            if($INFO_R2[3] eq "*"){ $INFO_R2[3]="$INFO_R2[4]M"; } }
          if(($INFO_R1[3]!~/^[0-9MIDNSHP]+$/)||($INFO_R2[3]!~/^[0-9MIDNSHP]+$/)){
            die "Couldn't parse mapping CIGARS : $INFO_R1[3] / $INFO_R2[3] ($BASICS[0])\n"; }
          # Pairs whose CIGARs use operations other than M/I/D are skipped.
          if(($INFO_R1[3]=~/^[0-9MID]+$/)&&($INFO_R2[3]=~/^[0-9MID]+$/)){
            # The SAM sequence lengths must match the FASTQ read lengths.
            if(($INFO_R1[4] ne $BASICS[3])||($INFO_R2[4] ne $BASICS[5])){
              die "Failed to extract consistent alignment results  7  $BASICS[0]\n"; }

            # Reformatting CIGARS For Compatibility With Next Scripts
            # Output syntax: "<n>M" becomes "<n>", "<n>I" becomes "^<n>$" and
            # "<n>D" becomes "^" followed by <n> "N" characters and "$".
            my @CIGARS=("$INFO_R1[3]","$INFO_R2[3]"); my @CIGARS_OUT;
            foreach my $cigar (@CIGARS){
              my @CIGAR_val=split(/[MID]/,$cigar);
              # Splitting on the numbers yields a leading empty field, removed
              # by shift() so that values and operations line up.
              my @CIGAR_pat=split(/[0-9]+/,$cigar); shift(@CIGAR_pat);
              my $output_cigar="";
              if($#CIGAR_val!=$#CIGAR_pat){ die "CIGAR conversion failed\n"; }
              for(my $i=0;$i<=$#CIGAR_val;$i++){
                my $pat=$CIGAR_pat[$i]; my $val=$CIGAR_val[$i];
                if(($pat!~/^[MID]$/)||($val!~/^[0-9]+$/)){ die "CIGAR conversion failed\n"; }
                if($pat eq "M"){ $output_cigar.="$val"; }
                elsif($pat eq "I"){ $output_cigar.="^$val\$"; }
                else{ $output_cigar.="^".("N" x $val)."\$"; }
              }
              push(@CIGARS_OUT,"$output_cigar");
            }
            $INFO_R1[3]=$CIGARS_OUT[0]; $INFO_R2[3]=$CIGARS_OUT[1];

            # Adding Current Mapping Results to Output Files
            # Columns: read id, random tag, R1 sequence, R1 chr/strand/pos/CIGAR,
            # R2 sequence, R2 chr/strand/pos/CIGAR.
            print OUTALN "$BASICS[0]\t$BASICS[1]\t$BASICS[2]\t$INFO_R1[0]\t$INFO_R1[1]\t";
            print OUTALN "$INFO_R1[2]\t$INFO_R1[3]\t$BASICS[4]\t$INFO_R2[0]\t";
            print OUTALN "$INFO_R2[1]\t$INFO_R2[2]\t$INFO_R2[3]\n";
            $num_printed++;
          }
        }
      }

      # Updating Global Counters For Current Read
      if($num_printed>0){
        $num_reads_out++; $num_mapping_out+=$num_printed;
        if($num_printed==1){ $num_reads_single++; } else{ $num_reads_multi++; }
      }
    }
    print "  Processed $num_reads_in sequencing reads...\n";
  }
  else{ $processing_done=1; }
}

# Closing Input/Output Files + Stats
close(TAGS); close(FASTQR1); close(FASTQR2);
# Buffered write errors (e.g. a full disk) only surface when the output handle
# is closed, so this close is checked.
close(OUTALN) or die "Failed to write output file '$alignments_out': $!\n";

# Percentages default to zero when their denominator is zero.
# p1: share of input reads with printed alignments
my $p1="0.00"; if($num_reads_in>0){
  $p1=sprintf("%.2f",($num_reads_out/$num_reads_in)*100); }
# p2 / p3: share of output reads printed at a single / at multiple locations
my $p2="0.00"; if($num_reads_out>0){
  $p2=sprintf("%.2f",($num_reads_single/$num_reads_out)*100); }
my $p3="0.00"; if($num_reads_out>0){
  $p3=sprintf("%.2f",($num_reads_multi/$num_reads_out)*100); }
# p4: average number of printed alignments per output read
my $p4="0.0000"; if($num_reads_out>0){
  $p4=sprintf("%.4f",$num_mapping_out/$num_reads_out); }
print "\n  Input Paired-Reads     : $num_reads_in reads\n";
print "  Output Paired-Reads    : $num_reads_out reads ($p1%)\n\n";
print "     Single Location     : $num_reads_single reads ($p2%)\n";
print "     Multiple Locations  : $num_reads_multi reads ($p3%)\n\n";
print "  Num Alignments Printed : $num_mapping_out ($p4 per read)\n\n";

