#!/usr/bin/perl
use strict;
use warnings;
use diagnostics;
# Turn on autoflush for the currently selected output handle. Nothing has
# called select() at this point, so that handle is STDOUT; handles opened
# later in the script (e.g. PAIRS) keep their normal buffering.
$|=1;

######### CGI Pair Finder #################
#Purpose: To find pairs of CGIs for analysis. High-high, low-high, high-low and low-low pairs must be found separately. 

#Fiorella C. Grandi. 
#last revision: 08/07/2014 

#Input files
#All three handles use the three-argument form of open so the mode can never
#be taken from the file name, and every open is checked. The bareword handle
#names are kept because the rest of the script refers to FIRST, SECOND and
#PAIRS by name.
#FIRST ISLAND
open(FIRST, '<', "test_CGIs_low.txt") or die "Couldn't open file: $!\n";
#SECOND ISLAND
open(SECOND, '<', "test_high.txt") or die "Couldn't open file1: $!\n";
#OUTFILE
#Opened in append mode: pairs written by earlier runs stay in the file.
open(PAIRS, '>>', "test_pairs_low_high.txt") or die "Couldn't open output file: $!\n";

#Declare variables
#Everything here lives at file scope and is shared by the loops below.

#Scalars assigned while walking the first island file:
my ($island_start, $island_end);    #start of the current first island, and start + column 5
my ($variable, $location_distance); #"location<TAB>distance" strings describing a candidate partner

#Containers:
my %hash;     #second island file: column 1 => column 2
my @array;    #tab-separated fields of the current first island line
my @array2;   #candidate partners collected for the current first island
my @variable; #the accepted candidate split back into (location, distance)

#Input the second CGI island data into a hash.
#SECOND is read as tab-separated text. Only the first two columns are kept:
#column 1 (the location) becomes the hash key and column 2 (the methylation,
#per the original author's note) becomes the value. Any further columns are
#ignored.
while (my $line = <SECOND>) {
  my ($location, $methylation) = split(/\t/, $line);
  next unless defined $location;

  #get rid of white spaces
  $location =~ s/\r*\n//g; # drop the line ending, including the \r of CRLF files
  $location =~ s/^\s+//;   # strip white space from the beginning
  $location =~ s/\s+$//;   # strip white space from the end

  #A blank line has no location. Skipping it keeps an empty-string key out of
  #the hash, where it would only cause "isn't numeric" warnings once the keys
  #are used in distance arithmetic.
  next if $location eq '';

  #A line without a tab has no second column; use an empty value instead of
  #running the substitutions on undef, which warns.
  $methylation = '' unless defined $methylation;
  $methylation =~ s/\r*\n//g;
  $methylation =~ s/^\s+//; # strip white space from the beginning

  #associate the location and the methylation in a hash.
  $hash{$location} = $methylation;
}
close SECOND;

#Read the first CGI island file line by line and find pairs.
#For every first island, look for second islands (keys of %hash) that start
#after the end of the first island and less than $max_distance bp away. The
#island is written to PAIRS only when exactly one such second island exists.
my $max_distance = 10000; #change this number to change the distance (in bp)

while (my $line = <FIRST>) { #stream the file instead of loading every line first
  #Strip the line ending the same way the SECOND reader does, so a CRLF file
  #does not leave a stray \r inside the last column.
  $line =~ s/\r*\n//g;
  @array = split("\t", $line);
  @array2 = (); #start every island with an empty candidate list

  #Column 1 (start) and column 5 (length) are both required. Blank or short
  #lines would otherwise be treated as an island ending at 0 and could print
  #a bogus pair with empty fields.
  next if @array < 5 || $array[0] eq '' || $array[4] eq '';

  #define "free" zone
  $island_start = $array[0];
  $island_end = $island_start + $array[4];

  foreach my $key (keys %hash) {
    my $distance = $key - $island_end;
    next unless $distance > 0 && $distance < $max_distance;

    $location_distance = "$key\t$distance";
    push(@array2, $location_distance); #stores CGI start site of CGI that is within $max_distance bp of island

    #A second hit already disqualifies this island, so stop scanning.
    last if @array2 > 1;
  }

  #needs to be equal to 1 because this means that it only has one CGI that is
  #within $max_distance bp of it.
  if (@array2 == 1) {
    $variable = $array2[0];
    @variable = split("\t", $variable);
    #this prints: location of start first CGI in pair, length first CGI in pair, location of second CGI in pair, distance
    print PAIRS "$island_start\t$array[4]\t$variable[0]\t$variable[1]\n";
  }
}

close FIRST;
#Buffered write errors only show up when the handle is closed, so check it.
close PAIRS or die "Couldn't close output file: $!\n";
