#!/usr/bin/perl -w

# written by John Edwards
# jedwards@dom.wustl.edu
# see README.txt for further information

use strict;

# Command-line synopsis. Shown when the argument count is wrong and appended
# to every "Can't open" message below.
my $usage = "USAGE: $0 <CpGfile> <McrBCfile> <REfile> <outFile>\n";

die $usage unless @ARGV == 4;

my $CpGfile = shift; #CpG position file
my $McrBCfile = shift; #bed formatted McrBC reads file
my $REfile = shift; #bed formatted RE reads file
my $outFile = shift;

# Window sizes (bp) around read ends used when CpG positions are classified.
my $REshift = 5; #accounts for improper end repair
my $McrBCshift = 50; # accounts for McrBC cutting

# Three-argument open: the file name is taken literally, so characters such
# as '>', '|' or leading/trailing whitespace in an argument can never change
# the open mode or start a command. (Side effect: '-' and "cmd |" are no
# longer interpreted specially.)
# The bareword handle names are kept because the rest of the script reads
# from CPG / MCRBC / RE and prints to OUT.
open(CPG, '<', $CpGfile) or die "Can't open $CpGfile: $!\n$usage\n";
open(MCRBC, '<', $McrBCfile) or die "Can't open $McrBCfile: $!\n$usage\n";
open(RE, '<', $REfile) or die "Can't open $REfile: $!\n$usage\n";
open(OUT, '>', $outFile) or die "Can't open $outFile: $!\n$usage\n";

#indexing parameters
# Reads are grouped per chromosome into bins of $genomeBinSize bp, keyed by
# their start coordinate, so that a CpG only has to be compared against the
# reads of a few bins rather than the whole chromosome.
# $genomeBuffer is the distance (bp) from a bin edge used by the CpG lookup
# to decide when a neighbouring bin is searched as well.
# The trailing numbers are presumably earlier/alternative settings -- confirm.
my $genomeBinSize = 50000; #1000000
my $genomeBuffer = 17000; #10000

#read in and index McrBC and RE bed files
# McrBC reads are stored as
#   $McrBCresults{chr}{bin}{id} = { start => <col 2>, end => <col 3> }
# where id is a running counter and bin is derived from the read start.
print STDERR "Reading in McrBCfile: $McrBCfile\n";
my (%McrBCresults);
my $McrBCid = 0;
while(<MCRBC>) {
	next unless /chr/;
	next if /^#/;
	# Only the first three BED columns are used.
	my ($chr, $start, $end) = split;

	# Require numeric start/end instead of testing the truth of $start:
	# BED starts are 0-based, so "0" is a legitimate coordinate and must not
	# be dropped. This also rejects track/browser lines that contain "chr".
	next unless defined $end;
	next unless $start =~ /^\d+$/ and $end =~ /^\d+$/;

	# Bin by start coordinate. The strict '>' (a start equal to
	# $genomeBinSize stays in bin 0) mirrors the bin computation applied to
	# CpG positions later in the script; the two must stay in agreement.
	my $genomeBin = 0;
	if ( $start > $genomeBinSize ) {
		$genomeBin = int( $start / $genomeBinSize );
	}

	$McrBCid++;
	$McrBCresults{$chr}{$genomeBin}{$McrBCid}{'start'} = $start;
	$McrBCresults{$chr}{$genomeBin}{$McrBCid}{'end'} = $end;
}
close MCRBC;

# Read and index the RE bed file. Reads are stored as
#   $REresults{chr}{bin}{id} = { start => <col 2>, end => <col 3> }
# where id is a running counter and bin is derived from the read start.
print STDERR "Reading in REfile: $REfile\n";
my (%REresults);
my $REid = 0;
while(<RE>) {
	next unless /chr/;
	next if /^#/;
	# Only the first three BED columns are used.
	my ($chr, $start, $end) = split;

	# Require numeric start/end instead of testing the truth of $start:
	# BED starts are 0-based, so "0" is a legitimate coordinate and must not
	# be dropped. This also rejects track/browser lines that contain "chr".
	next unless defined $end;
	next unless $start =~ /^\d+$/ and $end =~ /^\d+$/;

	# Bin by start coordinate. The strict '>' (a start equal to
	# $genomeBinSize stays in bin 0) mirrors the bin computation applied to
	# CpG positions later in the script; the two must stay in agreement.
	my $genomeBin = 0;
	if ( $start > $genomeBinSize ) {
		$genomeBin = int( $start / $genomeBinSize );
	}

	$REid++;
	$REresults{$chr}{$genomeBin}{$REid}{'start'} = $start;
	$REresults{$chr}{$genomeBin}{$REid}{'end'} = $end;
}
close RE;

# overlap bed files with CpG sites
#
# For every CpG (columns 1 and 2 of the CpG file: chromosome, position) the
# indexed reads of the relevant genome bin(s) are scanned and one
# tab-separated line is written to OUT:
#   chr, position, McrBC interior, McrBC end-inside, McrBC end-outside,
#   RE interior, RE end
# Each read contributes to at most one McrBC counter or one RE counter; the
# first matching test in the if/elsif chains below wins.
print STDERR "Processing CpGfile: $CpGfile\n";

my $counter = 0;
my $counterMarker = 10000; # progress message every this many CpG lines

while(my $line = <CPG>) {
	next if $line =~ /position/; # skip headers if need be
	chomp $line;

	my ($chr, $position) = split /\s+/, $line;
	# Blank or one-column lines carry no position; skip them rather than
	# emitting a meaningless output row.
	next unless defined $position;

	my $reEndValue = 0;
	my $mcrbcEndInsideValue = 0;
	my $mcrbcEndOutsideValue = 0;
	my $reInteriorValue = 0;
	my $mcrbcInteriorValue = 0;

	# Bin of this CpG, computed exactly like the read bins (strict '>').
	my $addGenomeBin = 0;
	if ( $position > $genomeBinSize ) {
		$addGenomeBin = int( $position / $genomeBinSize );
	}
	my @genomeBins = ($addGenomeBin);

	# Distance of the CpG from the low and high edge of its bin.
	my $testGenomeBinLow = $position - ($addGenomeBin * $genomeBinSize);
	my $testGenomeBinHigh = (($addGenomeBin + 1) * $genomeBinSize) - $position;

	# Reads are binned by their start, so a read that starts in the previous
	# bin can still reach this CpG, and a read that starts just inside the
	# next bin can have the CpG in the window before its start. Search a
	# neighbouring bin whenever the CpG is within $genomeBuffer of that edge.
	# The upper neighbour used to be gated on a "maximum bin" variable that
	# was never assigned, so it was never searched; bins that hold no reads
	# are simply skipped by the exists-guarded lookups below.
	if ( $testGenomeBinLow < $genomeBuffer and $addGenomeBin > 0 ) {
		push @genomeBins, $addGenomeBin - 1;
	}
	if ( $testGenomeBinHigh < $genomeBuffer ) {
		push @genomeBins, $addGenomeBin + 1;
	}

	$counter++;
	if ( $counter % $counterMarker == 0) {
		print STDERR "\tNow on line: $counter\n";
	}

	foreach my $genomeBin ( @genomeBins ) {
		# Fetch the per-bin read tables without autovivifying empty entries
		# for chromosomes/bins that have no reads.
		my $mcrbcReads = exists $McrBCresults{$chr} ? $McrBCresults{$chr}{$genomeBin} : undef;
		my $reReads = exists $REresults{$chr} ? $REresults{$chr}{$genomeBin} : undef;

		if ( $mcrbcReads ) {
			foreach my $read ( values %$mcrbcReads ) {
				my $McrBCstart = $read->{'start'};
				my $McrBCend = $read->{'end'};

				if ( $McrBCstart + $McrBCshift <= $position and $McrBCend - $McrBCshift >= $position ) {
					# at least $McrBCshift away from both read ends
					$mcrbcInteriorValue++;
				} elsif ( $McrBCstart <= $position and $McrBCstart + $McrBCshift >= $position) {
					# within $McrBCshift after the read start
					$mcrbcEndInsideValue++;
				} elsif ( $McrBCstart - $McrBCshift <= $position and $McrBCstart >= $position) {
					# within $McrBCshift before the read start
					$mcrbcEndOutsideValue++;
				} elsif ( $McrBCend <= $position and $McrBCend + $McrBCshift >= $position) {
					# within $McrBCshift after the read end
					$mcrbcEndOutsideValue++;
				} elsif ( $McrBCend - $McrBCshift <= $position and $McrBCend >= $position) {
					# within $McrBCshift before the read end
					$mcrbcEndInsideValue++;
				}
			}
		}

		if ( $reReads ) {
			foreach my $read ( values %$reReads ) {
				my $REstart = $read->{'start'};
				my $REend = $read->{'end'};

				if ( $REstart + $REshift <= $position and $REend - $REshift >= $position ) {
					# at least $REshift away from both read ends
					$reInteriorValue++;
				} elsif ( $REstart - $REshift <= $position and $REstart + $REshift >= $position) {
					# within $REshift of the read start
					$reEndValue++;
				} elsif ( $REend - $REshift <= $position and $REend + $REshift >= $position) {
					# within $REshift of the read end
					$reEndValue++;
				}
			}
		}
	}

	print OUT "$chr\t$position\t$mcrbcInteriorValue\t$mcrbcEndInsideValue\t$mcrbcEndOutsideValue\t$reInteriorValue\t$reEndValue\n";

}

close CPG;
# Buffered write errors (e.g. a full disk) only surface when the handle is
# closed, so check the result.
close OUT or die "Can't close $outFile: $!\n";

