#!/usr/bin/perl
use strict;
use warnings;
use diagnostics;
$|=1;

#########  CGI Mapper #################
#Purpose: To find the surrounding CpG methylation for pairs.

#Fiorella C. Grandi. 
#last revision: 08/07/2014 

#Declare variables
my @array; #holds the tab-separated fields of the CGI line being processed
my %hash;  #maps CpG position => methylation value


###### DATA INPUT ########
# Methylation data: one CpG per line, tab-separated columns in the order
# position, methylation, coverage. Only position and methylation are kept.
my $meth_file = "Meth_sperm_mchr19_Kobayashi.txt";
open(my $meth_fh, '<', $meth_file) or die "Couldn't open file1: $!\n";

my $skipped = 0; #number of non-blank lines that could not be used

while (my $line = <$meth_fh>) {
  next if $line !~ /\S/; #ignore blank lines

  #The third column (coverage) is deliberately not captured: nothing reads it.
  my ($position, $methylation) = split(/\t/, $line);

  #Get rid of all weird spaces: leading/trailing blanks and CR/LF line endings.
  #The loop variable aliases the two scalars, so they are trimmed in place.
  for my $field ($position, $methylation) {
    next unless defined $field;
    $field =~ s/^\s+//;
    $field =~ s/\s+$//;
  }

  #Positions are compared numerically when the islands are scanned, so a
  #header row or any non-numeric position would behave like position 0 there.
  #NOTE(review): assumes positions are non-negative integer coordinates --
  #confirm against the data file; anything else is counted and reported below.
  unless (defined $position && defined $methylation && $position =~ /\A\d+\z/) {
    $skipped++;
    next;
  }

  #associate the location and the percent methylation in a hash variable.
  $hash{$position} = $methylation;
}
close $meth_fh;

warn "Skipped $skipped unusable line(s) in $meth_file\n" if $skipped;

#CGI definitions: input/output file names.
#HIGH and LOW are the CGI lists to read. HIGHOUT and LOWOUT receive the
#surrounding-CpG rows; they are opened in append mode, so rows written by an
#earlier run stay in the output files.
#Every open is checked: a failed output open would otherwise silently discard
#all the rows printed to that handle.
open(HIGH, '<', "test_highCGIs.txt") or die "Couldn't open file: $!\n";
open(HIGHOUT, '>>', "test_highCGIs_surrounding.txt")
  or die "Couldn't open test_highCGIs_surrounding.txt for appending: $!\n";

open(LOW, '<', "test_lowCGIs.txt") or die "Couldn't open file: $!\n";
open(LOWOUT, '>>', "test_lowCGIs_surrounding.txt")
  or die "Couldn't open test_lowCGIs_surrounding.txt for appending: $!\n";


#### Surrounding-CpG report, shared by the HIGH and LOW CGI lists ####
#
# report_surrounding_cpgs($in_fh, $out_fh, $meth_of, $positions, $flank)
#
#   $in_fh     - handle to read CGI lines from (tab-separated; column 1 is the
#                island start, column 5 is the island length)
#   $out_fh    - handle the result rows are printed to
#   $meth_of   - hash ref: CpG position => methylation value
#   $positions - array ref: the keys of %$meth_of sorted in ascending numeric
#                order (the early exit below relies on that order)
#   $flank     - how far beyond each island edge to search (default 5000)
#
# For every island, prints one row per CpG lying within $flank of the island
# start ("upstream") or of the island end ("downstream"):
#   island_start <TAB> upstream|downstream <TAB> CpG position <TAB> distance <TAB> methylation
# Blank lines are ignored; lines without a numeric start and length are
# skipped with a warning instead of producing rows around coordinate 0.
sub report_surrounding_cpgs {
  my ($in_fh, $out_fh, $meth_of, $positions, $flank) = @_;
  $flank = 5000 unless defined $flank;

  #Read one line at a time instead of loading the whole list into memory.
  while (my $line = <$in_fh>) {
    $line =~ s/\r?\n\z//; #drop an LF or CRLF line ending
    next if $line !~ /\S/;

    my @fields = split("\t", $line);
    my ($island_start, $island_length) = @fields[0, 4];
    for my $field ($island_start, $island_length) {
      next unless defined $field;
      $field =~ s/^\s+//;
      $field =~ s/\s+$//;
    }

    #NOTE(review): assumes start and length are non-negative integers --
    #confirm against the CGI files.
    unless (defined $island_start && defined $island_length
            && $island_start =~ /\A\d+\z/ && $island_length =~ /\A\d+\z/) {
      warn "Skipping CGI line $.: need a numeric start (column 1) and length (column 5)\n";
      next;
    }

    #Get all CpGs within $flank of the start and end of the island
    my $island_end        = $island_start + $island_length;
    my $island_upstream   = $island_start - $flank; #lowest position searched
    my $island_downstream = $island_end + $flank;   #highest position searched

    #Single pass over the sorted positions. Because start <= end, every
    #upstream row is printed before any downstream row.
    for my $cpg (@$positions) {
      next if $cpg < $island_upstream;
      last if $cpg > $island_downstream; #sorted: nothing further can match

      if ($cpg <= $island_start) {
        #distance from the start of the CGI back to this CpG
        my $distance = $island_start - $cpg;
        print {$out_fh} "$island_start\tupstream\t$cpg\t$distance\t$meth_of->{$cpg}\n";
      }
      if ($cpg >= $island_end) {
        #distance from the end of the CGI forward to this CpG
        my $distance = $cpg - $island_end;
        print {$out_fh} "$island_start\tdownstream\t$cpg\t$distance\t$meth_of->{$cpg}\n";
      }
    }
  }
  return;
}

#Sort the CpG positions once; both lists reuse the result, and the rows come
#out in a reproducible order instead of hash order.
my @sorted_positions = sort { $a <=> $b } keys %hash;

#### FOR HIGH CGIs #######
report_surrounding_cpgs(\*HIGH, \*HIGHOUT, \%hash, \@sorted_positions, 5000);
close HIGH;
#Buffered write errors only surface at close, so check it on output handles.
close HIGHOUT or die "Couldn't close test_highCGIs_surrounding.txt: $!\n";

#### FOR LOW CGIs #######
report_surrounding_cpgs(\*LOW, \*LOWOUT, \%hash, \@sorted_positions, 5000);
close LOW;
close LOWOUT or die "Couldn't close test_lowCGIs_surrounding.txt: $!\n";
