#!/usr/bin/perl -w

# written by John Edwards
# jedwards@dom.wustl.edu
# see README.txt for further information

use strict;

# Filters a BED file, keeping only the features for which at least one end
# (start or end coordinate) lies next to a known methyl-sensitive restriction
# site in the reference genome.  Lines that do not start with "chr" (comments,
# track/browser lines, blanks) are copied to the output untouched.
my $usage = "USAGE: $0 bedFile outFile\n";
die $usage unless @ARGV == 2;

my $inFile = shift;
my $outFile = shift;

my $nibFrag_EXE = "nibFrag"; #specify location of the nibFrag executable here.  This tool is available as part of the blat tools suite from Jim Kent at UCSC.
my $chrDir = "/mnt/work2/genomeData/hg18/chromosomes"; #specify location of the genome .nib files here

my $fWindow = 10; #bases fetched before each feature end when searching for RE sites
my $rWindow = 10; #bases fetched after each feature end

my $tagLength = 27; #tag length to shift ends to account for tag sequences as necessary (NOTE: not referenced by the code below)

my $upperSite = 10; #enzymatic end repair is imperfect, some fragments will have sites slightly shifted from ends.
my $lowerSite = 5; # same as previous param, but shift is uneven.
# findREpos scans offsets $lowerSite .. $upperSite-1 of each fetched window.

# Three-arg open with lexical handles: the file names are taken literally, so
# a name such as ">foo" or "cmd |" can no longer change the open mode.
open(my $in, '<', $inFile) or die "Can't open $inFile: $!\n$usage\n";
open(my $out, '>', $outFile) or die "Can't open $outFile: $!\n$usage\n";

# Recognition sequences to look for; the value only needs to be true.
my %reHash = (
	'CCGC' => 1, #AciI
	'CGCG' => 1, #BstUI
	'GCGC' => 1, #HhaI
	'CCGG' => 1, #HpaII
	'ACGT' => 1, #HpyCH4
	'GCGG' => 1, #AciI RC
);

# Sequence is always requested on the '+' strand.  The original script also
# rewrote a '-' strand column to 'm' but never used the result, so the strand
# column of the BED line has never influenced the lookup; that is unchanged.
my $testStrand = '+';

my $valid2Count = 0; # features written to the output
my $totalCount = 0;  # features examined

print STDERR "Processing $inFile\n";
while (my $line = <$in>) {
	# Anything that is not a "chr..." feature line (this includes '#'
	# comment lines) is passed through verbatim and not counted.
	unless ($line =~ /^chr/) {
		print {$out} $line;
		next;
	}

	my ($chr, $start, $end) = split ' ', $line;

	# Check the start coordinate first, then the end coordinate.  The second
	# nibFrag call is skipped when the first end already qualifies; the
	# decision to keep the line is the same either way.
	my $hasSite = 0;
	foreach my $pos ($start, $end) {
		my $seq = _fetchNibSeq($nibFrag_EXE, "$chrDir/$chr.nib",
			$pos - $fWindow, $pos + $rWindow, $testStrand);
		if (findREpos(\$seq, \%reHash, $lowerSite, $upperSite)) {
			$hasSite = 1;
			last;
		}
	}

	if ($hasSite) {
		print {$out} $line;
		$valid2Count++;
	}

	$totalCount++;
	if ( $totalCount % 1000 == 0 ) {
		print STDERR "$totalCount\t$valid2Count\n";
	}
}
close $in;
# Buffered write errors (e.g. disk full) only surface at close time.
close($out) or die "Can't close $outFile: $!\n";
print STDERR "TotalSequences\tPassFilter\n";
print STDERR "$totalCount\t$valid2Count\n";

# Runs "$exe $nibFile $from $to $strand stdout" and returns the sequence it
# prints as one string, with FASTA header lines (those starting with '>') and
# all whitespace removed.  Dies if the program cannot be started; a run that
# starts but produces no sequence simply yields an empty string.
sub _fetchNibSeq {
	my ($exe, $nibFile, $from, $to, $strand) = @_;

	# List-form pipe open passes each argument directly to the program, so a
	# chromosome name read from the input file is never parsed by a shell.
	open(my $pipe, '-|', $exe, $nibFile, $from, $to, $strand, 'stdout')
		or die "Can't run $exe: $!\n";

	my $seq = '';
	while (my $fastaLine = <$pipe>) {
		# Skip the header by line, so a header containing spaces cannot
		# leak into the sequence.
		next if $fastaLine =~ /^>/;
		$fastaLine =~ s/\s+//g;
		$seq .= $fastaLine;
	}
	# A non-zero exit status is deliberately tolerated: the caller treats
	# an empty sequence as "no site found".
	close $pipe;

	return $seq;
}

# Reports whether a recognition site starts at any offset in the range
# $lowerSite .. $upperSite-1 of a sequence.
#
# Arguments:
#   $seqRef     reference to the sequence string (any case; compared upper-cased)
#   $reHashRef  reference to a hash whose keys are upper-case recognition
#               sequences; a key counts only if its value is true
#   $lowerSite  first offset (0-based) to test
#   $upperSite  offset at which to stop (exclusive)
#   $reLength   optional length of the recognition sequences; defaults to 4
#
# Returns 1 if a site is found, 0 otherwise.  An undefined, empty or too-short
# sequence returns 0 without touching offsets beyond the end of the string.
sub findREpos {
	my ($seqRef, $reHashRef, $lowerSite, $upperSite, $reLength) = @_;

	$reLength = 4 unless defined $reLength;

	my $seqLength = defined $$seqRef ? length $$seqRef : 0;
	# Last offset at which a full-length site still fits in the sequence.
	my $lastStart = $seqLength - $reLength;

	for (my $i = $lowerSite; $i < $upperSite && $i <= $lastStart; $i++) {
		# Stop at the first hit; further offsets cannot change the answer.
		return 1 if $$reHashRef{ uc substr( $$seqRef, $i, $reLength ) };
	}

	return 0;
}

