#!/usr/bin/perl
use strict;
use warnings;
use diagnostics;
use Statistics::Descriptive;
$|=1;

########## LOCAL AVERAGE ##########
#Purpose: To find the local average of intervals along the shore. 
#Fiorella C. Grandi. 
#last revision: 08/07/2014


# FILE-LEVEL STATE
# These lexicals are shared by the main loop and the binning subroutines below.

# Per-line parsing state.
my @array;              # tab-separated fields of the input line being parsed
my $current_CGI_ID;     # CGI ID (column 0) of the line being processed
my $direction;          # column 1: "upstream" or "downstream" relative to the CGI
my $distance;           # column 3: distance of the CpG from the CGI
my $methylation;        # column 4: methylation value of the CpG
my $key;                # signed distance used as the %methylation key (upstream is negative)

# Per-CGI accumulation state.
my $unique_CGI_ID = 0;  # ID of the most recently seen CGI; 0 until the first line is read
my %methylation;        # signed distance => methylation value
my @methylation;        # scratch list of the values that fall inside the current interval

# Bounds of the interval currently being averaged: [$interval_start, $interval_end).
my ($interval_start, $interval_end);

#SPECIFY THE FILE TO BIN HERE
# INPUT:  tab-separated table of the CpGs surrounding each CGI (read by the main loop).
# OUTPUT: per-CGI interval averages. The file is opened in append mode, so rows
#         from an earlier run stay in it and are read again by the second
#         stage -- delete the file before re-running on the same input.
# Three-argument open keeps the mode separate from the file name, and both
# opens are checked so that a missing or unwritable file stops the run at once.
open(INPUT, '<', "test_low_surrounding.txt") or die "Couldn't open file1: $!\n";
open(OUTPUT, '>>', "test_low_surrounding_localaverage.txt")
  or die "Couldn't open test_low_surrounding_localaverage.txt for appending: $!\n";


#SUBROUTINES

# average_hash: summarise the CpGs of the current CGI that lie inside the
# current interval and write one row to OUTPUT.
#
# Reads  : %methylation (signed distance => methylation value),
#          $interval_start and $interval_end.
# Writes : one tab-separated row to OUTPUT -- "start end mean sd" when at least
#          one key lies in [start, end), otherwise "start end 1000", where
#          1000 marks an interval without data.
# Effects: leaves @methylation empty and moves both interval bounds forward by
#          250, so consecutive calls walk along the region.
# Arguments are ignored.
sub average_hash {
    my @in_window = grep { $_ >= $interval_start && $_ < $interval_end } keys %methylation;
    push(@methylation, map { $methylation{$_} } @in_window);

    if (scalar(@methylation) > 0) {
        my $summary = Statistics::Descriptive::Full->new();
        $summary->add_data(@methylation);
        my $mean = $summary->mean();
        my $sd   = $summary->standard_deviation();
        print OUTPUT "$interval_start\t$interval_end\t$mean\t$sd\n";
    }
    else {
        print OUTPUT "$interval_start\t$interval_end\t1000\n";
    }

    @methylation = ();

    # you can change the step size by changing this
    $interval_start += 250;
    $interval_end   += 250;
}

# call: bin the current CGI over the whole region.
# Rewinds the interval to its first position and runs average_hash() once per
# interval; average_hash() itself advances the bounds after every call.
sub call {
    # you can change the start and end position
    ($interval_start, $interval_end) = (-5000, -4750);

    # 41 consecutive intervals
    average_hash($_) for 0 .. 40;
}

##################### PROGRAM ##################

# Stream the input table and bin every CGI once all of its CpGs have been read.
#
# Columns used (tab-separated): 0 = CGI ID, 1 = direction ("upstream" or
# "downstream"), 3 = distance from the CGI, 4 = methylation value.
# Lines that belong to one CGI are assumed to be contiguous in the file.
#
# The CpGs of a CGI are collected in %methylation, keyed by signed distance
# (upstream distances are negated); call() then writes the interval averages
# of that CGI to OUTPUT.
while (my $line = <INPUT>) {
  $line =~ s/\r?\n\z//;     # remove the line ending, including a Windows "\r\n"
  next if $line !~ /\S/;    # ignore blank lines

  @array = split(/\t/, $line);
  if (@array < 5) {
    warn "Skipping line $.: expected at least 5 tab-separated columns\n";
    next;
  }
  ($current_CGI_ID, $direction, $distance, $methylation) = @array[0, 1, 3, 4];

  # strip white space from both ends of every field that is used
  for my $field ($current_CGI_ID, $direction, $distance, $methylation) {
    $field =~ s/^\s+//;
    $field =~ s/\s+$//;
  }

  # A new CGI ID means the previous CGI is complete: bin it and start afresh.
  # IDs are compared as strings so that non-numeric IDs are told apart too.
  if ($current_CGI_ID ne $unique_CGI_ID) {
    call() if %methylation;   # nothing to bin before the first CGI
    %methylation = ();        # do not carry CpGs over into the next CGI
    $unique_CGI_ID = $current_CGI_ID;
  }

  # Every line, including the first line of a CGI, contributes one CpG.
  if ($direction eq "downstream") {
    $key = $distance;
  }
  elsif ($direction eq "upstream") {
    $key = $distance * -1;
  }
  else {
    # Without a valid direction there is no key; storing the value anyway
    # would overwrite the entry of the previously read CpG.
    warn "Skipping line $.: unrecognised direction '$direction'\n";
    next;
  }
  $methylation{$key} = $methylation;
}

# The last CGI in the file is not followed by an ID change, so bin it here.
if (%methylation) {
  call();
  %methylation = ();
}

close INPUT;
# Buffered write errors only surface at close, so check it for the output file.
close OUTPUT or die "Couldn't finish writing the local averages: $!\n";

### Second stage: average the local averages, interval by interval.
my $interval = -5000;       # start of the interval currently being summarised
my $interval2;              # not read at file scope; averages2() declares its own copy
my @array_averages;         # fields of the averages-file line being parsed
my @methylation_average;    # means collected from the averages file for the current interval

# one output row per interval
scan($_) for 0 .. 40;
# scan: print the summary of the current interval across all rows of the
# averages file, then move on to the next interval.
#
# Collects the local averages of the interval that starts at $interval via
# averages2() and prints "start<TAB>mean<TAB>sd<TAB>count" to standard output.
# When there is no data for the interval, mean and sd are printed as "NA".
# Afterwards $interval is advanced by 250 and @methylation_average is emptied.
# Arguments are ignored.
sub scan {
  averages2();   # fills @methylation_average for the interval starting at $interval

  my $stat2 = Statistics::Descriptive::Full->new();
  $stat2->add_data(@methylation_average);
  my $count = $stat2->count() || 0;

  # mean() and standard_deviation() have no defined value without data
  my $mean = $count ? $stat2->mean() : "NA";
  my $sd   = $count ? $stat2->standard_deviation() : "NA";
  print "$interval\t$mean\t$sd\t$count\n";

  $interval = $interval + 250;

  # Empty the list. Assigning 0 would leave one element (the value 0) behind,
  # which would then be counted as data in the next interval.
  @methylation_average = ();
}

# averages2: collect the local averages recorded for the current interval.
#
# Reads the averages file written by the first stage and appends to
# @methylation_average the value in column 2 of every row whose column 0
# equals $interval. Rows whose value is 1000 or more are left out; 1000 is the
# marker written for an interval without data.
# Arguments are ignored.
sub averages2 {
  my $averages_file = "test_low_surrounding_localaverage.txt";
  open(my $averages_fh, '<', $averages_file)
    or die "you probably forgot to change both files names $!\n";

  # Read line by line instead of loading the whole file into a list.
  while (my $line = <$averages_fh>) {
    chomp($line);
    @array_averages = split(/\t/, $line);
    next if @array_averages < 3;   # blank or truncated row: nothing to compare

    my ($interval2, $methylation) = @array_averages[0, 2];
    if ($interval2 == $interval && $methylation < 1000) {
      push(@methylation_average, $methylation);
    }
  }

  # The file is reopened for every interval, so release the handle each time.
  close($averages_fh);
}
