#!/usr/bin/perl
use strict;
use warnings;
$|=1;

#########  CGI Mapper #################
#Purpose: To match CpG methylation information to CpG islands (CGIs).

#Fiorella C. Grandi. 
#last revision: 07/25/2014 



#File-level data shared by the loading code and the subroutines:

#Per-CpG values, both keyed by the location of the CpG.
#  %meth     : location => percent methylation
#  %coverage : location => coverage
my (%meth, %coverage);

#NOTE(review): @array is not referenced anywhere in the visible code.
my @array;

#Per-CGI values held in parallel arrays (the same index describes the same CGI).
#  @start      : start location of each CGI
#  @end        : end location of each CGI
#  @CpG_number : number of CpGs in each CGI
my (@start, @end, @CpG_number);



###LOAD METHYLATION DATA####
#data should be in the format: location, percent methylation, coverage and should be TAB DELIMITED
#Fills %meth (location => percent methylation) and %coverage (location => coverage).
#Blank lines are ignored; lines that do not have all three columns are skipped with a warning.

my $methylation_file = "test_methylation.txt";
open(my $methylation_fh, '<', $methylation_file)
  or die "Couldn't open $methylation_file: $!\n";

while (my $line = <$methylation_fh>) {
  #a blank line (typically the last line of the file) carries no data
  next if $line !~ /\S/;

  #$position holds the position of the cpg, $percent holds the percent methylation,
  #$cpg_coverage holds the coverage information as a fraction
  my ($position, $percent, $cpg_coverage) = split(/\t/, $line);

  #remove any pesky white spaces---will throw off absolutely everything.
  #\s also matches \r and \n, so this strips Unix and Windows line endings too.
  my $complete = 1;
  for my $field ($position, $percent, $cpg_coverage) {
    if (defined $field) {
      $field =~ s/\A\s+//;   # strip white space from the beginning
      $field =~ s/\s+\z//;   # strip white space from the end
    }
    $complete = 0 unless defined $field and length $field;
  }

  #an incomplete line would otherwise store an empty location with undefined
  #values, which then takes part in every later numeric comparison
  unless ($complete) {
    warn "Skipping line $. of $methylation_file: expected 3 tab-delimited columns\n";
    next;
  }

  #associate the location and the coverage for that location in a hash variable.
  $coverage{$position} = $cpg_coverage;

  #associate the location and the percent methylation in a hash variable.
  $meth{$position} = $percent;
}
close $methylation_fh;



####LOAD CGI DEFINITIONS ###
#data should be in the format: start, end, number of CpGs in island and should be TAB DELIMITED
#Appends one entry per CGI to @start, @end and @CpG_number (same index = same CGI).
#Blank lines are ignored; lines that do not have all three columns are skipped with a warning.

my $cgi_file = "test_CGI.txt";
open(my $cgi_fh, '<', $cgi_file)
  or die "Couldn't open $cgi_file: $!\n";

while (my $line = <$cgi_fh>) {
  #a blank line (typically the last line of the file) carries no data
  next if $line !~ /\S/;

  #$cgi_start holds start, $cgi_end holds end and $cpg_total holds number of CpGs.
  my ($cgi_start, $cgi_end, $cpg_total) = split(/\t/, $line);

  #remove any pesky white spaces---will throw off absolutely everything.
  #\s also matches \r and \n, so this strips Unix and Windows line endings too.
  my $complete = 1;
  for my $field ($cgi_start, $cgi_end, $cpg_total) {
    if (defined $field) {
      $field =~ s/\A\s+//;   # strip white space from the beginning
      $field =~ s/\s+\z//;   # strip white space from the end
    }
    $complete = 0 unless defined $field and length $field;
  }

  #an incomplete line would otherwise create a phantom CGI with empty or
  #undefined boundaries that is compared against every CpG later on
  unless ($complete) {
    warn "Skipping line $. of $cgi_file: expected 3 tab-delimited columns\n";
    next;
  }

  push(@start, $cgi_start);
  push(@end, $cgi_end);
  push(@CpG_number, $cpg_total);
}
close $cgi_fh;

#### MAIN PROGRAM ####

#choose OUTPUT file name
#Results are APPENDED, so rows written by earlier runs stay in the file.
#The handle keeps the name OUPUT because elementaverage prints to it.
my $output_file = "test_CGIs_mapped.txt";
open(OUPUT, '>>', $output_file)
  or die "Couldn't open $output_file for appending: $!\n";

my $count  = scalar(@start);   #number of CGIs that were loaded
my $count2 = $count - 1;       #index of the last CGI (-1 when none were loaded)

#Keep $_ as the loop variable: the index is passed as an argument and
#elementaverage can also pick it up from $_.
foreach (0 .. $count2) {
  elementaverage($_);
}

#buffered write errors (e.g. a full disk) only show up when the handle is closed
close(OUPUT) or die "Couldn't finish writing $output_file: $!\n";

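#The resulting data is TAB DELIMITED, one line per CGI that contains at least one probed CpG, in the form: start location, average methylation over the CGI, number of CpGs in the CGI that were probed for methylation status, average coverage of the CpGs investigated, length of the CGI, number of CpGs in the predicted CGI, percentage of CpGs in the CGI that were sampled.
#NOTE: the average methylation is written on the same scale as the input. It is NOT divided by ten here, so data on the out-of-1000 scale that the Ziller2013 reanalysis uses has to be rescaled separately.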


### SUBROUTINES ####

#elementaverage(INDEX)
#Averages the methylation and the coverage of every CpG that lies inside one
#CGI and prints one tab-delimited result line to the OUPUT filehandle.
#
#INDEX is the position of the CGI in @start, @end and @CpG_number. When the
#sub is called without arguments it falls back to the current value of $_.
#
#Reads the file-level hashes %meth and %coverage. Both boundaries are
#inclusive. Nothing is printed for a CGI that contains no probed CpG.
#
#Output columns: start location, average methylation, number of probed CpGs,
#average coverage, CGI length (end - start), number of CpGs in the CGI,
#percentage of the CGI's CpGs that were probed ("NA" when the number of CpGs
#in the CGI is missing or not positive).
sub elementaverage {
  #take the index from the argument list; $_ is only a fallback, because it is
  #a global that any caller or loop may have changed
  my $index = @_ ? $_[0] : $_;

  my $cgi_start = $start[$index];
  my $cgi_end   = $end[$index];

  my $den  = 0;   #number of CpGs within the CGI definition
  my $sum  = 0;   #holds the sum of cpg methylation
  my $sum2 = 0;   #holds the sum of the coverage

  #get all cpgs within the CGI definition
  keys %meth;   #reset the each() iterator so the scan always starts at the beginning
  while (my ($position, $percent) = each %meth) {
    next if $position < $cgi_start or $position > $cgi_end;
    $sum += $percent;
    $den++;
  }

  #get coverage of all cpgs within the CGI definition
  keys %coverage;   #reset the each() iterator
  while (my ($position, $cpg_coverage) = each %coverage) {
    next if $position < $cgi_start or $position > $cgi_end;
    $sum2 += $cpg_coverage;
  }

  #a CGI without any probed CpG has no average and produces no output line
  return if $den == 0;

  ###Find Average###
  my $average_methylation = $sum / $den;
  my $average_coverage    = $sum2 / $den;
  my $CGI_length          = $cgi_end - $cgi_start;

  #a CpG count of 0 (or a missing one) would otherwise abort the whole run
  #with "Illegal division by zero"
  my $cpgs_in_CGI = $CpG_number[$index];
  $cpgs_in_CGI = '' unless defined $cpgs_in_CGI;
  my $percent_cpgs_in_CGI_covered = 'NA';
  if ($cpgs_in_CGI and $cpgs_in_CGI > 0) {
    $percent_cpgs_in_CGI_covered = ($den / $cpgs_in_CGI) * 100;
  }
  else {
    warn "CGI starting at $cgi_start has no usable CpG count; reporting NA for percent covered\n";
  }

  print OUPUT "$cgi_start\t$average_methylation\t$den\t$average_coverage\t$CGI_length\t$cpgs_in_CGI\t$percent_cpgs_in_CGI_covered\n";
  return;
}



