#!/usr/bin/perl
use strict;

# Input/Output Files
#
# Positional command-line arguments:
#   1. Ty3 sequence file (input)
#   2. Fastq file #1 (input)
#   3. Fastq file #2 (input)
#   4. Fastq file #1 (output)
#   5. Fastq file #2 (output)
#   6. Random tags file (output)
# A missing argument is treated as an empty string so that it fails the
# corresponding check below instead of propagating an undefined value.
my ($Ty3_LTR_in, $fastq_R1_in, $fastq_R2_in,
    $fastq_R1_out, $fastq_R2_out, $random_tags_out)
  = map { defined($ARGV[$_]) ? $ARGV[$_] : "" } (0..5);

# Validation failures are reported on STDERR with a non-zero exit status
# (via die) so that a calling shell or pipeline can detect them.  The
# trailing newline keeps Perl from appending "at ... line N".
unless(-f $Ty3_LTR_in){ die "Ty3 sequence not found.\n"; }
unless(-f $fastq_R1_in){ die "Fastq file #1 not found.\n"; }
unless(-f $fastq_R2_in){ die "Fastq file #2 not found.\n"; }
for my $out_name ($fastq_R1_out, $fastq_R2_out, $random_tags_out){
  if($out_name eq ""){ die "Output file names required.\n"; }
}

# Ty3 Target Sequence
#
# Reads the first two lines of the Ty3 file: a header that must start with
# ">Ty3", followed by the sequence itself, which must consist only of the
# letters A, C, G and T.  Produces:
#   $LTR_seq - the sequence as a string
#   @LTR     - the sequence split into single characters
#   $LTR_len - the sequence length
# Any failure is reported on STDERR with a non-zero exit status.
my $LTR_seq;
{
  open(my $ltr_fh,"<",$Ty3_LTR_in)
    or die "Cannot open Ty3 sequence file '$Ty3_LTR_in': $!\n";
  my $ltr_header=<$ltr_fh>;
  $LTR_seq=<$ltr_fh>;
  close($ltr_fh);

  # An empty or one-line file leaves these undefined; use empty strings so
  # they are rejected by the consistency checks rather than used as undef.
  if(!defined($ltr_header)){ $ltr_header=""; }
  if(!defined($LTR_seq)){ $LTR_seq=""; }
  chomp($ltr_header); chomp($LTR_seq);

  if($ltr_header!~/^>Ty3/){ die "Inconsistent Ty3 sequence file.\n"; }
}
if($LTR_seq!~/^[ACGT]+$/){ die "Inconsistent Ty3 sequence.\n"; }
my @LTR=split("",$LTR_seq); my $LTR_len=length($LTR_seq);

# Opening Input/Output Files
#
# Every open is checked: an unreadable input or unwritable output aborts the
# run immediately instead of silently producing empty or missing results.
# The three-argument form keeps special characters in a file name from being
# interpreted as an open mode.  The bareword handle names are kept because
# the remainder of the script refers to them directly.
open(IN1,"<",$fastq_R1_in)
  or die "Cannot open '$fastq_R1_in' for reading: $!\n";
open(IN2,"<",$fastq_R2_in)
  or die "Cannot open '$fastq_R2_in' for reading: $!\n";
open(OUT1,">",$fastq_R1_out)
  or die "Cannot open '$fastq_R1_out' for writing: $!\n";
open(OUT2,">",$fastq_R2_out)
  or die "Cannot open '$fastq_R2_out' for writing: $!\n";
open(TAGS,">",$random_tags_out)
  or die "Cannot open '$random_tags_out' for writing: $!\n";

# Statistics
my $num_fastq_in=0;   # read pairs parsed from the input files
my $num_fastq_out=0;  # read pairs written to the output files
my $num_fastq_rej=0;  # read pairs rejected (sum of the four counters below)
my $num_nocall=0;     # rejected: sequence matches but the 8bp tag contains an N
my $num_mismat=0;     # rejected: 1 to 8 mismatches against the Ty3 sequence
my $num_frames=0;     # rejected: Ty3 sequence found shifted by one base
my $num_others=0;     # rejected: more than 8 mismatches and no one-base shift

# Parsing Input Files
#
# Reads the two Fastq files in lock-step, four lines per read (header, base
# calls, second header, qualities).  Read 1 is expected to start with a
# random tag of $tag_len bases immediately followed by the Ty3 sequence.
#   - Exact Ty3 match and no N in the tag: the pair is written out, with
#     the tag and Ty3 sequence removed from read 1 (bases and qualities),
#     and the read identifier plus tag are written to the tags file.
#   - Anything else: the pair is rejected and counted by cause.
# Aborts (STDERR, non-zero exit status) when the two files do not hold the
# same number of reads, when a record is truncated, or when the identifiers
# of a pair differ.
my $tag_len=8;                    # length of the random tag at the start of read 1
my $trim_len=$tag_len+$LTR_len;   # number of leading bases removed from read 1
while(1)
{
  # Input Paired-Read
  my $head11=<IN1>; my $head12=<IN2>;
  # A blank line where a header is expected is treated as end of data, so
  # trailing empty lines are not counted as a read.
  if(defined($head11) && $head11!~/\S/){ $head11=undef; }
  if(defined($head12) && $head12!~/\S/){ $head12=undef; }
  if(!defined($head11) && !defined($head12)){ last; }
  if(!defined($head11) || !defined($head12)){
    die "Fastq files do not contain the same number of reads.\n"; }
  $num_fastq_in++;
  my $calls1=<IN1>; my $calls2=<IN2>;
  my $head21=<IN1>; my $head22=<IN2>;
  my $phred1=<IN1>; my $phred2=<IN2>;
  for my $line ($calls1,$calls2,$head21,$head22,$phred1,$phred2){
    if(!defined($line)){
      die "Truncated Fastq record (read pair #$num_fastq_in).\n"; } }
  chomp($head11,$head12,$calls1,$calls2,$head21,$head22,$phred1,$phred2);

  # The identifier is the first whitespace-delimited field of the header
  # line, without its leading character.
  my @HEADER1=split(" ",$head11); my $id_R1=substr($HEADER1[0],1);
  my @HEADER2=split(" ",$head12); my $id_R2=substr($HEADER2[0],1);
  if($id_R1 ne $id_R2){
    die "Mismatch read identifiers: '$id_R1' / '$id_R2'\n"; }

  # Ty3 Sequence: count positions that differ from the reference.  Bases
  # missing because the read is too short count as mismatches.
  my $segment=substr($calls1,$tag_len,$LTR_len);
  my @SUB=defined($segment) ? split("",$segment) : ();
  my $d=grep { !defined($SUB[$_]) || $SUB[$_] ne $LTR[$_] } (0..$#LTR);

  # Output Files / Statistics
  my $tag=substr($calls1,0,$tag_len);
  if($d==0){
    if($tag!~/N/){
      $num_fastq_out++;
      # The output records are only assembled for accepted pairs.
      print OUT1 "$head11\n".substr($calls1,$trim_len)."\n$head21\n".substr($phred1,$trim_len)."\n";
      print OUT2 "$head12\n$calls2\n$head22\n$phred2\n";
      print TAGS "$id_R1\t$tag\n"; }
    else{ $num_fastq_rej++; $num_nocall++; } }
  else{
    $num_fastq_rej++;
    if($d<=8){ $num_mismat++; }
    # Frameshift: exact Ty3 sequence found one base before or after the
    # expected position.
    elsif(substr($calls1,$tag_len-1,$LTR_len) eq $LTR_seq){ $num_frames++; }
    elsif(substr($calls1,$tag_len+1,$LTR_len) eq $LTR_seq){ $num_frames++; }
    else{ $num_others++; } }
}

# Closing Input/Output Files + Stats
close(IN1); close(IN2);
# Buffered write errors (e.g. a full disk) only surface when the handle is
# closed, so the output handles are checked.
close(OUT1) or die "Error closing '$fastq_R1_out': $!\n";
close(OUT2) or die "Error closing '$fastq_R2_out': $!\n";
close(TAGS) or die "Error closing '$random_tags_out': $!\n";

# Percentage of the input read pairs, formatted with two decimals.  When no
# read pair was parsed the percentage is reported as 0.00 instead of dying
# with "Illegal division by zero".
my $percent=sub{
  my ($count)=@_;
  my $ratio=$num_fastq_in ? $count/$num_fastq_in : 0;
  return sprintf("%.2f",$ratio*100);
};
my $pb_out=$percent->($num_fastq_out);
my $pb_rej=$percent->($num_fastq_rej);
my $pb_mismat=$percent->($num_mismat);
my $pb_frames=$percent->($num_frames);
my $pb_nocall=$percent->($num_nocall);
my $pb_others=$percent->($num_others);
print "\nInput Reads    : $num_fastq_in\n";
print "Output Reads   : $num_fastq_out ($pb_out %)\n";
print "Rejected Reads : $num_fastq_rej ($pb_rej %)\n\n";
print "NoCall 8bp Tag : $num_nocall ($pb_nocall %)\n";
print "Frameshifts    : $num_frames ($pb_frames %)\n";
print "1 to 8 Mismat. : $num_mismat ($pb_mismat %)\n";
print ">8 Mismat.     : $num_others ($pb_others %)\n\n";

