#!/usr/bin/perl -w

# written by John Edwards
# jedwards@dom.wustl.edu
# see README.txt for further information

use strict;

my $usage = "USAGE: $0 <bedFile> <outFile>\n";
die $usage unless @ARGV == 2;

my $inFile = shift;
my $outFile = shift;

# Location of the nibFrag executable. This tool is available as part of the
# blat tools suite from Jim Kent at UCSC.
my $nibFrag_EXE = "nibFrag";
# Location of the genome .nib files (one <chr>.nib per chromosome).
my $chrDir = '/mnt/work2/genomeData/hg18/chromosomes';
die "can't find chrDir: $chrDir\n" unless -e $chrDir;

my $window = 500; # window over which to search for McrBC half sites near fragment ends
my $tagLength = 27; # corrects for tag length in mapping coordinates (not referenced below)
my $shift = 3; # shift since McrBC sites are 3 bps.
my $minDistance = 40; # min distance between 2 McrBC half sites
my $nearSiteDistance = 50; # cut site will be within this distance of one of the two McrBC half sites

my $upperSite = 10; # not referenced below
my $lowerSite = 5; # not referenced below

# Three-argument open with lexical handles: the file names are taken
# literally, so characters such as '>' or '|' in a name are never
# interpreted as an open mode.
open(my $in, '<', $inFile) or die "Can't open $inFile: $!\n$usage\n";
open(my $out, '>', $outFile) or die "Can't open $outFile: $!\n$usage\n";

my $valid2Count = 0; # records written because at least one end passed
my $totalCount = 0; # chr* records examined

print STDERR "Processing $inFile\n";
while (my $line = <$in>) {
	# Comment lines and anything that is not a chr* record are copied
	# through to the output unchanged and are not counted.
	if ($line =~ /^#/ or $line !~ /^chr/) {
		print {$out} $line;
		next;
	}

	# Only the first three whitespace-separated columns are used.
	my ($chr, $start, $end) = split ' ', $line;

	# Test the start coordinate first, then the end coordinate; the record
	# is kept as soon as either one passes.
	# NOTE(review): $cutPos - $window can be negative for records within
	# $window of the chromosome start; how nibFrag treats that is not
	# visible here -- confirm.
	my $passes = 0;
	for my $cutPos ($start, $end) {
		# Minus-strand sequence covering [cutPos - window, cutPos + shift].
		my $upstream = fetchSequence($chr, $cutPos - $window, $cutPos + $shift, 'm');
		# Plus-strand sequence covering [cutPos, cutPos + window].
		my $downstream = fetchSequence($chr, $cutPos, $cutPos + $window, '+');

		if (findMcrBCpos(\$upstream, \$downstream, $nearSiteDistance, $minDistance)) {
			$passes = 1;
			last;
		}
	}

	if ($passes) {
		print {$out} $line;
		$valid2Count++;
	}

	$totalCount++;
	if ( $totalCount % 1000 == 0 ) {
		print STDERR "$totalCount\t$valid2Count\n";
	}
}
close $in;
# Buffered write errors (e.g. a full disk) only surface at close time.
close $out or die "Can't close $outFile: $!\n";

print STDERR "TotalSequences\tPassFilter\n";
print STDERR "$totalCount\t$valid2Count\n";

# Run nibFrag for one region and return its sequence as a single upper-case
# string with the FASTA header line and all whitespace removed.
#
# Arguments: chromosome name (the file read is $chrDir/<chr>.nib), start
# coordinate, end coordinate and the strand argument given to nibFrag
# ('+' or 'm').
#
# Returns the empty string when nibFrag cannot be started or prints no
# sequence, so such regions simply contain no half sites.
sub fetchSequence {
	my ($chr, $from, $to, $strand) = @_;

	# List-form pipe open runs nibFrag directly, without a shell, so shell
	# metacharacters in a chromosome name read from the input file cannot
	# be executed.
	my @command = ($nibFrag_EXE, "$chrDir/$chr.nib", $from, $to, $strand, 'stdout');
	my $pipe;
	unless (open($pipe, '-|', @command)) {
		warn "Can't run $nibFrag_EXE: $!\n";
		return '';
	}

	my $sequence = '';
	while (my $fastaLine = <$pipe>) {
		next if $fastaLine =~ /^>/; # FASTA header
		$fastaLine =~ s/\s+//g;
		$sequence .= $fastaLine;
	}
	# The exit status is deliberately not checked: a failed nibFrag run just
	# leaves $sequence empty.
	close $pipe;

	return uc $sequence;
}

# Decide whether the two sequences flanking a fragment end carry a suitable
# pair of McrBC half sites.
#
# Arguments:
#   $seqRef1     - reference to the sequence on one side of the end
#   $seqRef2     - reference to the sequence on the other side of the end
#   $nearLimit   - number of leading bases of each sequence that count as
#                  "near" the end
#   $minDistance - minimum value of (offset in sequence 1 + offset in
#                  sequence 2) required when both sides have a near site
#
# Returns 1 when
#   * both sequences contain a half site, and
#   * at least one of them has a half site within its first $nearLimit
#     bases, and
#   * if both have such a near site, some pair of half sites (one from each
#     sequence) has offsets adding up to at least $minDistance;
# otherwise returns 0.
sub findMcrBCpos {
	my ($seqRef1, $seqRef2, $nearLimit, $minDistance) = @_;

	# A half site is required on both sides of the end.
	return 0 unless findMcrBCsite($seqRef1) and findMcrBCsite($seqRef2);

	# Each side is checked against its own leading $nearLimit bases. The
	# second check used to re-test the first substring in an elsif, so the
	# second flag could never be set.
	my $nearSeq1 = substr( $$seqRef1, 0, $nearLimit );
	my $nearSeq2 = substr( $$seqRef2, 0, $nearLimit );
	my $nearFlag1 = findMcrBCsite( \$nearSeq1 );
	my $nearFlag2 = findMcrBCsite( \$nearSeq2 );

	return 0 unless $nearFlag1 or $nearFlag2;

	# Only one side has a near site: accepted without a distance check.
	return 1 unless $nearFlag1 and $nearFlag2;

	# Both sides have a near site: require one pair that is far enough apart.
	my @positions1 = _mcrBCsitePositions($seqRef1);
	my @positions2 = _mcrBCsitePositions($seqRef2);
	foreach my $p1 (@positions1) {
		foreach my $p2 (@positions2) {
			return 1 if ($p1 + $p2) >= $minDistance;
		}
	}

	return 0;
}

# Return the offsets of every half site in the referenced sequence.
# The regexes are applied to the dereferenced string ($$seqRef); matching
# against the reference itself would only scan its "SCALAR(0x...)" text.
# Offsets keep the original convention: pos() + 1 for [AG]CG matches and
# pos() for CG[CT] matches, where pos() is the offset just past the match.
sub _mcrBCsitePositions {
	my ($seqRef) = @_;

	my @positions;
	while ( $$seqRef =~ /[AG]CG/g ) {
		push @positions, pos($$seqRef) + 1;
	}
	while ( $$seqRef =~ /CG[CT]/g ) {
		push @positions, pos($$seqRef);
	}

	return @positions;
}

# Report whether the referenced sequence contains an McrBC half site,
# i.e. a match for [AG]CG or for CG[CT]. Matching is case-sensitive.
# Takes a reference to the sequence string; returns 1 on a match, else 0.
sub findMcrBCsite {
	my ($sequence) = @_;

	return 1 if $$sequence =~ /[AG]CG/;
	return 1 if $$sequence =~ /CG[CT]/;
	return 0;
}

