#!/usr/bin/perl
use strict;
use warnings;
use diagnostics;
$|=1;

#Fiorella Carla Grandi
#Last revision 05/4/2015

#Purpose: To determine regions where the shores of CGIs (CpG islands) differ in methylation between two tissues

#1) Compare the two tissues position by position at each CGI shore. Record the difference.
#2) Find all CGIs that have >5 CpGs in a row with a large difference.


##### CONFIGURATION ###
# `use` is compile-time in Perl, so these take effect for the whole file
# even though they appear after the strictures at the top.
use Scalar::Util qw(looks_like_number);
use constant {
    ID_COLUMN   => 2,    # 0-based column holding the CpG_ID number
    METH_COLUMN => 4,    # 0-based column holding the methylation value
};

# Change the column constants above to the appropriate location of the
# CpG_ID number and methylation value if the input layout differs.
my $reference_file = "tissue1_surrounding.txt";    # "reference" file
my $query_file     = "tissue2_surrounding.txt";    # compared against the reference
my $output_file    = "differences.txt";            # opened in append mode

##### HELPERS ###

# Split one tab-separated line and pull out the fields this script uses.
# Returns (first_field, cpg_id, methylation) with the ID and methylation
# value trimmed of surrounding whitespace (including a trailing CR/LF).
# Returns an empty list when the line is too short to contain both columns
# or when the ID is empty, so callers can skip malformed/blank lines.
sub parse_cpg_record {
    my ($line) = @_;

    $line =~ s/\r?\n\z//;
    my @fields = split /\t/, $line, -1;    # -1 keeps trailing empty fields

    my ($id, $meth) = @fields[ ID_COLUMN, METH_COLUMN ];
    return unless defined $id && defined $meth;

    for my $field ($id, $meth) {
        $field =~ s/^\s+//;
        $field =~ s/\s+$//;
    }
    return unless length $id;

    return ($fields[0], $id, $meth);
}

# Build the hash key used to match a CpG between the two files.
# Numeric IDs are numified so that spellings that compare equal with `==`
# (e.g. "0123" and "123") still match, as they did when IDs were compared
# numerically. Non-numeric IDs are matched as exact strings, instead of all
# collapsing to 0 under a numeric comparison.
sub canonical_cpg_id {
    my ($id) = @_;
    return looks_like_number($id) ? $id + 0 : $id;
}

##### MAIN ###

#Read both CGI surrounding CpGs files

# Associate each CpG location in the reference file with its methylation.
# If an ID occurs more than once, the last occurrence wins.
my %reference_meth_for;
open my $reference_fh, '<', $reference_file
    or die "Couldn't open $reference_file: $!\n";
while (my $line = <$reference_fh>) {
    my (undef, $id, $meth) = parse_cpg_record($line);
    next unless defined $id;
    $reference_meth_for{ canonical_cpg_id($id) } = $meth;
}
close $reference_fh;

open my $query_fh, '<', $query_file
    or die "Couldn't open $query_file: $!\n";
open my $output_fh, '>>', $output_file
    or die "Couldn't open $output_file: $!\n";

# Stream the second file line by line and look each CpG up directly in the
# reference hash rather than scanning every key for every line.
while (my $line = <$query_fh>) {
    my ($first_field, $id, $meth) = parse_cpg_record($line);
    next unless defined $id;

    my $key = canonical_cpg_id($id);
    next unless exists $reference_meth_for{$key};

    # Remove the entry once matched so each reference CpG is reported at
    # most once, even if its ID repeats in the second file.
    my $reference_meth  = delete $reference_meth_for{$key};
    my $difference_meth = abs($meth - $reference_meth);

    print {$output_fh} "$id\t$first_field\t$difference_meth\n";
}

close $query_fh;
# Buffered write errors only surface at close, so check it.
close $output_fh or die "Couldn't close $output_file: $!\n";
