#!/usr/bin/perl
use strict;

# Input/Output Files
# Positional command-line arguments, in order: native LTR list, tRNA gene
# list, PolIII transcribed genes, SGD gene annotations, sample dataset and
# the prefix used to name the four CSV output files.
my $reference_LTRs="$ARGV[0]";
my $reference_tRNA="$ARGV[1]";
my $reference_POL3="$ARGV[2]";
my $reference_SGDB="$ARGV[3]";
my $sample_dataset="$ARGV[4]";
my $outputs_prefix="$ARGV[5]";
# Intermediate file written while parsing the sample dataset.
my $collected_data="$sample_dataset.dat";
my $all_sites_out="${outputs_prefix}_all_sites.csv";
my $all_pairs_out="${outputs_prefix}_all_pairs.csv";
my $sel_sites_out="${outputs_prefix}_sel_sites.csv";
my $sel_pairs_out="${outputs_prefix}_sel_pairs.csv";

# Every input must be an existing regular file. A missing argument is an
# empty string here, so it fails the same test. Problems are reported on
# STDERR and the script stops with a non-zero status so that calling
# scripts can detect the failure.
my @required_inputs=(
  [$reference_LTRs,"List of native LTRs not found.\n"],
  [$reference_tRNA,"List of tRNA genes not found.\n"],
  [$reference_POL3,"PolIII transcribed genes not found.\n"],
  [$reference_SGDB,"SGD gene annotations not found.\n"],
  [$sample_dataset,"Input sample dataset not found.\n"],
);
foreach my $required (@required_inputs){
  my ($path,$complaint)=@{$required};
  if(! -f $path){ print STDERR $complaint; exit 1; }
}
if($outputs_prefix eq ""){
  print STDERR "Output files prefix not provided.\n"; exit 1;
}

# Reference Chromosomes
# @CHR lists the sixteen accepted chromosome identifiers in processing
# order; %CHR_index maps each identifier to its zero-based rank in @CHR.
my @CHR=map { "chr$_" } qw(I II III IV V VI VII VIII IX X XI XII XIII XIV XV XVI);
my $num_CHR=scalar(@CHR);
my %CHR_index;
foreach my $chr_rank (0..$#CHR){
  $CHR_index{$CHR[$chr_rank]}="$chr_rank";
}

# Gene annotation dataset (tRNAs + POLIII + SGD)
# Genes from the three annotation files are stored in parallel arrays that
# share one numeric gene index (0 .. $num_GENE-1). %GENE_index maps both
# the gene id and the database id of a gene to that index.
my $num_GENE=0;
my %GENE_index;
my (@GENE_id,@GENE_chr,@GENE_strand);   # identifier, chromosome, strand
my (@GENE_start,@GENE_stop);            # start and stop coordinates
my (@GENE_dbid,@GENE_name);             # database identifier and name

# Retrieving the list of native LTRs
# The file is comma-separated and its first line is a header. Columns
# used: 0 = LTR id (must start with "NLTR"), 1 = chromosome, 2 = strand,
# 3 = start, 4 = stop, 7 = tag. Each LTR is registered in %LTR_index under
# its id and under the site string "chr,strand,pos", where pos is stop+1
# on the "+" strand and start-1 otherwise; the annotation step later looks
# insertion sites up by that same string.
# Any I/O or format problem is reported on STDERR with a non-zero exit.
my $num_LTR=0; my %LTR_index; my @LTR_id; my @LTR_site; my @LTR_tag;
open(my $LTR_in,"<",$reference_LTRs)
  or die "Cannot open $reference_LTRs: $!\n";
my $LTR_header=<$LTR_in>;
while(my $line=<$LTR_in>){
  chomp($line);
  my @d=split(",",$line);
  my ($id,$chr,$strand,$start,$stop,$tag)=@d[0..4,7];
  my $pos=($strand eq "+") ? $stop+1 : $start-1;
  my $site="$chr,$strand,$pos";
  if($id!~/^NLTR/){ print STDERR "Unrecognized LTR file format\n"; exit 1; }
  if(!exists($CHR_index{"$chr"})){ print STDERR "Unknown CHR ID\n"; exit 1; }
  if(exists($LTR_index{"$id"})){ print STDERR "Duplicated ID\n"; exit 1; }
  if(exists($LTR_index{"$site"})){ print STDERR "Duplicated Site\n"; exit 1; }
  $LTR_index{"$id"}=$num_LTR;
  $LTR_index{"$site"}=$num_LTR;
  $LTR_id[$num_LTR]="$id";
  $LTR_site[$num_LTR]="$site";
  $LTR_tag[$num_LTR]="$tag";
  $num_LTR++;
}
close($LTR_in);
print "\n  Successfully retrieved $num_LTR native LTRs...\n";

# Retrieving the tRNA, PolIII and SGD gene annotations
# The three annotation files share one layout (comma-separated, header on
# the first line; columns 0 = gene id, 1 = chromosome, 2 = strand,
# 3 = start, 4 = stop, 5 = database id, 6 = name), so one helper reads
# them all and appends to the shared GENE dataset.

# Appends the genes listed in $file to the GENE dataset.
#   $file : path of the annotation file
#   $kind : prefix every gene id of this file must start with; also used
#           in the "Unrecognized ... file format" message
# Returns (number of genes read from this file, header line).
# Stops the script with a message on STDERR and a non-zero status when
# the file cannot be opened, an id has the wrong prefix, the chromosome
# is unknown, or an id / database id was already registered.
sub load_gene_annotations{
  my ($file,$kind)=@_;
  open(my $in,"<",$file) or die "Cannot open $file: $!\n";
  my $header=<$in>;
  my $loaded=0;
  while(my $line=<$in>){
    chomp($line);
    my @d=split(",",$line);
    my ($id,$chr,$strand,$start,$stop,$dbid,$name)=@d[0..6];
    if(index($id,$kind)!=0){
      print STDERR "Unrecognized $kind file format\n"; exit 1; }
    if(!exists($CHR_index{"$chr"})){
      print STDERR "Unknown CHR ID\n"; exit 1; }
    if(exists($GENE_index{"$id"})||exists($GENE_index{"$dbid"})){
      print STDERR "Duplicated Gene ID\n"; exit 1; }
    # Both identifiers resolve to the same gene index.
    $GENE_index{"$id"}="$num_GENE";
    $GENE_index{"$dbid"}="$num_GENE";
    $GENE_id[$num_GENE]="$id";
    $GENE_chr[$num_GENE]="$chr";
    $GENE_strand[$num_GENE]="$strand";
    $GENE_start[$num_GENE]="$start";
    $GENE_stop[$num_GENE]="$stop";
    $GENE_dbid[$num_GENE]="$dbid";
    $GENE_name[$num_GENE]="$name";
    $num_GENE++;
    $loaded++;
  }
  close($in);
  return ($loaded,$header);
}

# Retrieving the list of tRNA genes
my ($num_tRNA,$tRNA_header)=load_gene_annotations($reference_tRNA,"tRNA");
print "  Successfully retrieved $num_tRNA tRNA genes...\n";

# Retrieving the PolIII transcribed genes
my ($num_PolIII,$PolIII_header)=load_gene_annotations($reference_POL3,"PolIII");
print "  Successfully retrieved $num_PolIII PolIII genes...\n";

# Retrieving the SGD gene annotations
my ($num_SGD,$SGD_header)=load_gene_annotations($reference_SGDB,"SGD");
print "  Successfully retrieved $num_SGD SGD gene entries...\n";
print "  Done retrieving gene annotations ($num_GENE total)\n\n";

# Parsing mapping results and collecting data
# Reads the tab-separated sample dataset and writes one line
# "chr,strand,position,tag" per entry to the intermediate file.
# Columns used: 1 = tag (exactly 8 letters from ACGT), 2 = read sequence
# (only its length is used), 3 = chromosome, 4 = strand, 5 = start,
# 6 = edit string, 8 = must repeat the chromosome of column 3.
# The position written is the start for "+" entries and the computed stop
# for all others.
# Any I/O or format problem is reported on STDERR with a non-zero exit.
print "  Parsing mapping results and collecting data...\n";
my $dataset_size=0;
open(my $map_in,"<",$sample_dataset)
  or die "Cannot open $sample_dataset: $!\n";
open(my $dat_out,">",$collected_data)
  or die "Cannot write $collected_data: $!\n";
while(my $line=<$map_in>){
  chomp($line);
  my @d=split("\t",$line);
  my ($tag,$chr,$strand,$start)=@d[1,3,4,5];
  if($tag!~/^[ACGT]{8}$/){ print STDERR "Inconsistent TAG sequences\n"; exit 1; }
  if(!exists($CHR_index{"$chr"})){ print STDERR "Unknown CHR ID\n"; exit 1; }
  if($d[8] ne "$chr"){ print STDERR "Inconsistent mapping results\n"; exit 1; }

  # Each "^...$" group in the edit string shifts the stop coordinate: a
  # group starting with a base letter (A, C, G, T or N) adds its length,
  # any other group is read as a number and subtracted.
  # NOTE(review): presumably these groups encode gaps in the alignment -
  # confirm against the mapper's output format.
  my $edits=defined($d[6]) ? $d[6] : "";
  my $offset=0;
  my $parsed_upto=0;
  while($edits=~/\^([^\$]*)\$/g){
    my $group=$1;
    $parsed_upto=pos($edits);
    if($group=~/^[ACGTN]/){ $offset+=length($group); }
    else{ $offset-=$group; }
  }
  # A "^" left after the last complete group has no closing "$". The
  # previous character-by-character scan never terminated on such input.
  if(index($edits,"^",$parsed_upto)>=0){
    print STDERR "Malformed edit string in mapping results\n"; exit 1; }

  my $stop=$start+length($d[2])-1+$offset;
  $dataset_size++;
  if($strand eq "+"){ print $dat_out "$chr,$strand,$start,$tag\n"; }
  else{ print $dat_out "$chr,$strand,$stop,$tag\n"; }
}
close($map_in);
close($dat_out) or die "Cannot write $collected_data: $!\n";
print "  Done! $dataset_size entries successfully processed...\n";

# Extracting list of insertion sites and random tags
# Re-reads the intermediate file (lines "chr,strand,position,tag") and
# counts the entries per site ("chr,strand,position"), per tag, and per
# site/tag pair (key "site-tag"). The intermediate file is removed once
# it has been read.
print "  Extracting list of insertion sites and tags...\n";
my %collected_sites; my %collected_pairs; my %collected_tags;
open(my $dat_in,"<",$collected_data)
  or die "Cannot open $collected_data: $!\n";
while(my $line=<$dat_in>){
  chomp($line);
  my @d=split(",",$line);
  my $site="$d[0],$d[1],$d[2]";
  my $tag="$d[3]";
  my $pair="$site-$tag";
  $collected_sites{"$site"}++;
  $collected_pairs{"$pair"}++;
  $collected_tags{"$tag"}++;
}
close($dat_in);
# unlink() instead of a shell "rm -f": the file name is never handed to a
# shell, so names with spaces or metacharacters are handled correctly.
unlink($collected_data)
  or print STDERR "Warning: could not remove $collected_data: $!\n";

# Insertion sites and random tags dataset
# Running totals: sites, site/tag pairs and reads, for the whole dataset
# and for the filtered subset.
my ($num_SITE,$num_PAIR,$num_READ)=(0,0,0);
my ($num_FILT_SITE,$num_FILT_PAIR,$num_FILT_READ)=(0,0,0);

# Per-site records, all indexed by the same numeric site index.
# %SITE_index resolves a site string or a generated site identifier to
# that index.
my %SITE_index;
my (@SITE_id,@SITE_gid);                            # site string and generated identifier
my (@SITE_chr,@SITE_str,@SITE_pos);                 # chromosome, strand, position

# Unfiltered pairs of each site: pair count, total and average count per
# pair, then per-pair key, tag and count.
my (@SITE_num_PAIR,@SITE_tot_PAIR,@SITE_avg_PAIR);
my (@SITE_PAIR_id,@SITE_PAIR_tag,@SITE_PAIR_tot);

# Same layout for the pairs that pass the filter.
my (@SITE_num_FILT,@SITE_tot_FILT,@SITE_avg_FILT);
my (@SITE_FILT_id,@SITE_FILT_tag,@SITE_FILT_tot);

# Sorting the list of collected insertion sites
# Sites are processed chromosome by chromosome (order of @CHR), by
# ascending position, with the "+" strand first when two sites share a
# position. Each site receives a zero-padded identifier "SITE<n>" and its
# site/tag pairs are stored twice: all pairs, and the pairs whose count
# reaches $tag_threshold (the filtered dataset).
print "  Sorting and filtering insertion sites & tags...\n";
my @ALL_SITES=keys(%collected_sites); my $num_collected_sites=$#ALL_SITES+1;
my @ALL_TAGS=keys(%collected_tags); my $tag_threshold=50;

# Group the sites by chromosome once instead of rescanning the whole list
# for every chromosome. Each entry is [site string, strand, position].
my %SITES_on_chr;
foreach my $site (@ALL_SITES){
  my @d=split(",",$site);
  push(@{$SITES_on_chr{"$d[0]"}},[$site,$d[1],$d[2]]);
}

# Group the tags by site once instead of probing every known tag for
# every site. Pair keys have the form "site-tag" and tags were validated
# as 8 letters from ACGT when the dataset was parsed, so the last "-" of
# a key always separates the site from the tag.
my %TAGS_of_site;
foreach my $pair (keys(%collected_pairs)){
  my $cut=rindex($pair,"-");
  push(@{$TAGS_of_site{substr($pair,0,$cut)}},substr($pair,$cut+1));
}

# Width of the numeric part of the generated site identifiers.
my $gid_width=length($num_collected_sites);

foreach my $chr_id (@CHR){
  my @UNSORTED=@{$SITES_on_chr{$chr_id} || []};

  # Same sanity conditions as the previous hand-written sort: a site
  # needs a strand and a position below 10000000000.
  foreach my $entry (@UNSORTED){
    if(($entry->[1] eq "")||!($entry->[2]<10000000000)){
      print STDERR "Failed to sort the list of insertion sites\n"; exit 1; }
  }
  my @SORTED=map { $_->[0] } sort {
      $a->[2] <=> $b->[2]
      || ($b->[1] eq "+") <=> ($a->[1] eq "+")
      || $a->[0] cmp $b->[0]
    } @UNSORTED;

  # Extracting all data for sorted insertion sites
  foreach my $site_id (@SORTED){
    my ($site_chr,$site_str,$site_pos)=(split(",",$site_id))[0,1,2];
    my $site_index=$num_SITE;
    my $site_gid=sprintf("SITE%0*d",$gid_width,$num_SITE+1);
    my (@site_PAIR_id,@site_PAIR_tag,@site_PAIR_tot);
    my (@site_FILT_id,@site_FILT_tag,@site_FILT_tot);
    my $site_tot_PAIR=0; my $site_tot_FILT=0;

    # Tags are visited in sorted order so that the pair tables come out
    # in the same order on every run.
    foreach my $tag (sort(@{$TAGS_of_site{$site_id} || []})){
      my $pair="$site_id-$tag";
      my $count=$collected_pairs{"$pair"};
      push(@site_PAIR_id,"$pair");
      push(@site_PAIR_tag,"$tag");
      push(@site_PAIR_tot,"$count");
      $site_tot_PAIR+=$count;
      if($count>=$tag_threshold){
        push(@site_FILT_id,"$pair");
        push(@site_FILT_tag,"$tag");
        push(@site_FILT_tot,"$count");
        $site_tot_FILT+=$count;
      }
    }
    my $site_num_PAIR=scalar(@site_PAIR_id);
    my $site_num_FILT=scalar(@site_FILT_id);
    my $site_avg_PAIR=$site_num_PAIR ? sprintf("%.2f",$site_tot_PAIR/$site_num_PAIR) : 0;
    my $site_avg_FILT=$site_num_FILT ? sprintf("%.2f",$site_tot_FILT/$site_num_FILT) : 0;
    my $site_filtered=$site_num_FILT ? "1" : 0;

    # Updating the SITE dataset with extracted data
    if(exists($SITE_index{"$site_id"})||exists($SITE_index{"$site_gid"})){
      print STDERR "Duplicated site ID\n"; exit 1; }
    $num_SITE++; $num_PAIR+=$site_num_PAIR; $num_READ+=$site_tot_PAIR;
    $num_FILT_SITE+=$site_filtered; $num_FILT_PAIR+=$site_num_FILT;
    $num_FILT_READ+=$site_tot_FILT;
    $SITE_index{"$site_id"}="$site_index";
    $SITE_index{"$site_gid"}="$site_index";
    $SITE_id[$site_index]="$site_id";
    $SITE_gid[$site_index]="$site_gid";
    $SITE_chr[$site_index]="$site_chr";
    $SITE_str[$site_index]="$site_str";
    $SITE_pos[$site_index]="$site_pos";
    $SITE_num_PAIR[$site_index]="$site_num_PAIR";
    $SITE_tot_PAIR[$site_index]="$site_tot_PAIR";
    $SITE_avg_PAIR[$site_index]="$site_avg_PAIR";
    if($site_num_PAIR>0){
      $SITE_PAIR_id[$site_index]=[@site_PAIR_id];
      $SITE_PAIR_tag[$site_index]=[@site_PAIR_tag];
      $SITE_PAIR_tot[$site_index]=[@site_PAIR_tot];
    }
    $SITE_num_FILT[$site_index]="$site_num_FILT";
    $SITE_tot_FILT[$site_index]="$site_tot_FILT";
    $SITE_avg_FILT[$site_index]="$site_avg_FILT";
    if($site_num_FILT>0){
      $SITE_FILT_id[$site_index]=[@site_FILT_id];
      $SITE_FILT_tag[$site_index]=[@site_FILT_tag];
      $SITE_FILT_tot[$site_index]=[@site_FILT_tot];
    }
  }
}

# Printing statistics on the final lists of insertion sites
# One overview per dataset: number of sites, number of site/tag pairs and
# total read count.
print "  Done! Filtered and unfiltered datasets ready...\n\n";
foreach my $overview (
    ["Unfiltered",$num_SITE,$num_PAIR,$num_READ],
    ["Filtered",$num_FILT_SITE,$num_FILT_PAIR,$num_FILT_READ]){
  my ($title,$sites,$pairs,$reads)=@{$overview};
  print "  $title dataset overview :\n\n",
        "    Total number of insertion sites   : $sites sites\n",
        "    Number of pairs site / random tag : $pairs pairs\n",
        "    Total coverage of insertion sites : $reads reads\n\n";
}

# Annotations to be extracted for all insertion sites
# All arrays are indexed by site index. Gene annotations are stored as
# "gene id,distance,strand,database id,name", or "none,NA,NA,NA,NA" when
# no gene qualifies.
my @SITE_native_LTR;
my @SITE_native_tag;
my @SITE_closest_any;
my @SITE_closest_ups;
my @SITE_closest_dow;
my @SITE_is_ups_of_1;
my @SITE_is_ups_of_2;
my @SITE_is_ups_of_3;

# Returns up to $limit [gene id, distance] pairs taken from the parallel
# lists @$genes / @$dists, ordered by increasing distance. Candidates at
# equal distance keep the order in which they were collected.
sub rank_closest_candidates{
  my ($genes,$dists,$limit)=@_;
  my @order=sort { $dists->[$a] <=> $dists->[$b] || $a <=> $b } (0..$#{$dists});
  if(@order>$limit){ @order=@order[0..$limit-1]; }
  return map { [$genes->[$_],$dists->[$_]] } @order;
}

# Formats entry number $rank (0-based) of a ranked candidate list as
# "gene id,distance,strand,database id,name"; returns the placeholder
# "none,NA,NA,NA,NA" when the list has no such entry.
sub describe_ranked_candidate{
  my ($ranked,$rank)=@_;
  if($rank>$#{$ranked}){ return "none,NA,NA,NA,NA"; }
  my ($gene_id,$distance)=@{$ranked->[$rank]};
  my $gene_index=$GENE_index{"$gene_id"};
  my $gene_str=$GENE_strand[$gene_index];
  my $gene_dbid=$GENE_dbid[$gene_index];
  my $gene_name=$GENE_name[$gene_index];
  return "$gene_id,$distance,$gene_str,$gene_dbid,$gene_name";
}

# Extracting full lists of candidate genes
print "  Now annotating all extracted insertion sites...\n";
for(my $site_index=0;$site_index<$num_SITE;$site_index++){
  my $site_chr=$SITE_chr[$site_index];
  my $site_str=$SITE_str[$site_index];
  my $site_pos=$SITE_pos[$site_index];
  my (@CAND_GENE_closest_of,@CAND_DIST_closest_of);   # any gene outside which the site lies
  my (@CAND_GENE_ups_of_ins,@CAND_DIST_ups_of_ins);   # genes upstream of the insertion
  my (@CAND_GENE_dow_of_ins,@CAND_DIST_dow_of_ins);   # genes downstream of the insertion
  my (@CAND_GENE_ins_is_ups,@CAND_DIST_ins_is_ups);   # genes the insertion is upstream of

  for(my $gene_index=0;$gene_index<$num_GENE;$gene_index++){
    if($GENE_chr[$gene_index] ne "$site_chr"){ next; }
    my $gene_id=$GENE_id[$gene_index];
    my $gene_str=$GENE_strand[$gene_index];
    my $gene_beg=$GENE_start[$gene_index];
    my $gene_end=$GENE_stop[$gene_index];

    # $ups_of_ins / $dow_of_ins : the gene lies upstream / downstream of
    #                             the insertion, relative to the site strand.
    # $ins_is_ups               : the insertion lies upstream of the gene,
    #                             relative to the gene strand.
    # $distance stays -1 when none of the tests below applies; the gene
    # is then not a candidate for this site.
    my $ups_of_ins=0; my $dow_of_ins=0; my $ins_is_ups=0; my $distance=-1;
    if($gene_str eq "+"){
      if($site_str eq "+"){
        if($site_pos<=$gene_beg){
          $ins_is_ups=1; $dow_of_ins=1; $distance=$gene_beg-$site_pos; }
        if($site_pos>$gene_end){
          $ups_of_ins=1; $distance=$site_pos-$gene_end-1; }
      }
      else{
        if($site_pos<$gene_beg){
          $ins_is_ups=1; $ups_of_ins=1; $distance=$gene_beg-$site_pos-1; }
        if($site_pos>=$gene_end){
          $dow_of_ins=1; $distance=$site_pos-$gene_end; }
      }
    }
    else{
      if($site_str eq "+"){
        if($site_pos>$gene_end){
          $ins_is_ups=1; $ups_of_ins=1; $distance=$site_pos-$gene_end-1; }
        if($site_pos<=$gene_beg){
          $dow_of_ins=1; $distance=$gene_beg-$site_pos; }
      }
      else{
        if($site_pos>=$gene_end){
          $ins_is_ups=1; $dow_of_ins=1; $distance=$site_pos-$gene_end; }
        if($site_pos<$gene_beg){
          $ups_of_ins=1; $distance=$gene_beg-$site_pos-1; }
      }
    }
    if($distance==-1){ next; }

    push(@CAND_GENE_closest_of,"$gene_id");
    push(@CAND_DIST_closest_of,$distance);
    if($ups_of_ins==1){
      push(@CAND_GENE_ups_of_ins,"$gene_id");
      push(@CAND_DIST_ups_of_ins,$distance); }
    if($dow_of_ins==1){
      push(@CAND_GENE_dow_of_ins,"$gene_id");
      push(@CAND_DIST_dow_of_ins,$distance); }
    if($ins_is_ups==1){
      push(@CAND_GENE_ins_is_ups,"$gene_id");
      push(@CAND_DIST_ins_is_ups,$distance); }
  }

  # Sorting full lists of candidate genes
  # Only the closest gene is reported for the first three categories;
  # the three closest genes are reported for "insertion is upstream of".
  my @RANK_closest_of=rank_closest_candidates(\@CAND_GENE_closest_of,\@CAND_DIST_closest_of,1);
  my @RANK_ups_of_ins=rank_closest_candidates(\@CAND_GENE_ups_of_ins,\@CAND_DIST_ups_of_ins,1);
  my @RANK_dow_of_ins=rank_closest_candidates(\@CAND_GENE_dow_of_ins,\@CAND_DIST_dow_of_ins,1);
  my @RANK_ins_is_ups=rank_closest_candidates(\@CAND_GENE_ins_is_ups,\@CAND_DIST_ins_is_ups,3);

  # Finalizing annotations for current insertion site
  # A site whose "chr,strand,pos" string is registered in %LTR_index
  # coincides with a native LTR.
  if(exists($LTR_index{"$SITE_id[$site_index]"})){
    my $ltr_index=$LTR_index{"$SITE_id[$site_index]"};
    $SITE_native_LTR[$site_index]="$LTR_id[$ltr_index]";
    $SITE_native_tag[$site_index]="$LTR_tag[$ltr_index]";
  }
  else{
    $SITE_native_LTR[$site_index]="no";
    $SITE_native_tag[$site_index]="none";
  }
  $SITE_closest_any[$site_index]=describe_ranked_candidate(\@RANK_closest_of,0);
  $SITE_closest_ups[$site_index]=describe_ranked_candidate(\@RANK_ups_of_ins,0);
  $SITE_closest_dow[$site_index]=describe_ranked_candidate(\@RANK_dow_of_ins,0);
  $SITE_is_ups_of_1[$site_index]=describe_ranked_candidate(\@RANK_ins_is_ups,0);
  $SITE_is_ups_of_2[$site_index]=describe_ranked_candidate(\@RANK_ins_is_ups,1);
  $SITE_is_ups_of_3[$site_index]=describe_ranked_candidate(\@RANK_ins_is_ups,2);
}

# Writing output files
# The unfiltered and filtered datasets share one table layout, so both
# are written by the same helper.

# Writes one "sites" CSV (one row per insertion site with its
# annotations) and one "pairs" CSV (one row per site/tag pair).
#   $sites_file, $pairs_file : output paths (overwritten)
#   $num, $tot               : per-site pair count and total read count
#   $tags, $counts           : per-site lists of tags and of their counts
#   $skip_empty              : when true, sites without any pair are left
#                              out of both files
# Stops the script when a file cannot be opened or completely written.
sub write_site_tables{
  my ($sites_file,$pairs_file,$num,$tot,$tags,$counts,$skip_empty)=@_;
  open(my $sites_out,">",$sites_file)
    or die "Cannot write $sites_file: $!\n";
  open(my $pairs_out,">",$pairs_file)
    or die "Cannot write $pairs_file: $!\n";
  print $sites_out "Insertion Site,Chromosome,Strand,Position,";
  print $sites_out "Num Random Tags,Total Read Count,Native LTR,";
  print $sites_out "CLOSEST GENE,Distance,Strand,Identifier,Name,";
  print $sites_out "CLOSEST UPSTREAM GENE,Distance,Strand,Identifier,Name,";
  print $sites_out "CLOSEST DOWNSTREAM GENE,Distance,Strand,Identifier,Name,";
  print $sites_out "UPSTREAM OF GENE #1,Distance,Strand,Identifier,Name,";
  print $sites_out "UPSTREAM OF GENE #2,Distance,Strand,Identifier,Name,";
  print $sites_out "UPSTREAM OF GENE #3,Distance,Strand,Identifier,Name\n";
  print $pairs_out "Insertion Site,Chromosome,Strand,Position,";
  print $pairs_out "Total Read Count,Random Tag,Tag Read Count\n";
  for(my $site_index=0;$site_index<$num_SITE;$site_index++){
    if($skip_empty && !($num->[$site_index]>0)){ next; }
    # The site string "chr,strand,pos" fills the Chromosome, Strand and
    # Position columns; each gene annotation fills five columns.
    print $sites_out join(",",
      $SITE_gid[$site_index],$SITE_id[$site_index],
      $num->[$site_index],$tot->[$site_index],
      $SITE_native_LTR[$site_index],
      $SITE_closest_any[$site_index],$SITE_closest_ups[$site_index],
      $SITE_closest_dow[$site_index],$SITE_is_ups_of_1[$site_index],
      $SITE_is_ups_of_2[$site_index],$SITE_is_ups_of_3[$site_index]),"\n";
    for(my $tag_index=0;$tag_index<$num->[$site_index];$tag_index++){
      print $pairs_out join(",",
        $SITE_gid[$site_index],$SITE_id[$site_index],$tot->[$site_index],
        $tags->[$site_index][$tag_index],
        $counts->[$site_index][$tag_index]),"\n";
    }
  }
  # Buffered write errors only surface when the handle is closed.
  close($sites_out) or die "Cannot write $sites_file: $!\n";
  close($pairs_out) or die "Cannot write $pairs_file: $!\n";
  return;
}

# Writing output files for the unfiltered dataset
print "  Writing output files for the unfiltered dataset...\n";
write_site_tables($all_sites_out,$all_pairs_out,
  \@SITE_num_PAIR,\@SITE_tot_PAIR,\@SITE_PAIR_tag,\@SITE_PAIR_tot,0);

# Writing output files for the filtered dataset
print "  Writing output files for the filtered dataset...\n\n";
write_site_tables($sel_sites_out,$sel_pairs_out,
  \@SITE_num_FILT,\@SITE_tot_FILT,\@SITE_FILT_tag,\@SITE_FILT_tot,1);

