use strict;
use warnings;
use Cwd;
use Getopt::Std;

# ---------------------------------------------------------------------------
# Command-line handling.
#
# All four options are required:
#   -I  tag-density matrix for the genomic control (input) sample
#   -C  tag-density matrix for the IP (H3K4me3/H3K27me3) sample
#   -D  FDR threshold, expressed as a percentage
#   -F  minimum fold-over-input threshold
#
# The options are parsed first and then checked for presence.  Counting raw
# @ARGV elements (the previous approach) rejected valid spellings such as
# "-Ifile" and accepted any eight tokens even when a required option was
# missing, which left undefined values to be used further down.
#
# On a usage error the usage text is printed to STDOUT and the script dies.
# The output file name is derived from the -C file name plus both thresholds.
# ---------------------------------------------------------------------------
my @usage_lines = (
  "\nUSAGE: perl EnrichedGenePromoterFinding.pl -I <file> -C <file> -D <int> -F <number>\n",
  "\t-I\tinput file containing geneWise red-density matrix for genomic control input, as generated by genePromoterTagDensity.pl\n",
  "\t-C\tinput file containing geneWise red-density matrix for H3K4me3/H3K27me3, as generated by genePromoterTagDensity.pl\n",
  "\t-D\tFDR threshold as percentage between 0 to 100\n",
  "\t-F\tMinimum FoldOverInput threshold (e.g. 3) for H3K4me3/H3K27me3 enrichment consideration \n",
);

my %Options;
my $ok = getopts('I:C:D:F:', \%Options);
die "\n\nInvalid options on the command line\n\n" if (!$ok);

# Every option must be present and non-empty; stray positional arguments
# (anything getopts left behind in @ARGV) are treated as a usage error too.
my @missing_options = grep { !defined($Options{$_}) || $Options{$_} eq '' } qw(I C D F);
if (@missing_options || @ARGV)
{
  print @usage_lines;
  die "\n";
}

# Both thresholds are compared numerically later on, so reject anything that
# is not a plain decimal number (optionally signed / with an exponent).
my $number_re = qr/\A[+-]?(?:\d+(?:\.\d*)?|\.\d+)(?:[eE][+-]?\d+)?\z/;
foreach my $flag (qw(D F))
{
  die "\n\nOption -$flag expects a number, got '$Options{$flag}'\n\n"
    unless $Options{$flag} =~ $number_re;
}

my $INPUT_file = $Options{I};
my $IP_file = $Options{C};
my $FDR_filter = $Options{D};
my $fold_diff_filter = $Options{F};

# Output is written next to the IP file: <IP file>FDR<D>_min<F>FoldOverInput.txt
my $outputFileName = $IP_file.'FDR'.$FDR_filter.'_min'.$fold_diff_filter.'FoldOverInput.txt';


# Input format (applies to both the INPUT file and the IP file):
#   a matrix of RPM/RPKM signals for multiple equal-sized bins around the TSS
#   of every unique transcript. The first column must contain the unique ID.
#   The number and size of the bins around the TSS must be the same for INPUT and IP.
# Both files used to be taken as positional arguments (shift); they are now
# passed via -I and -C.
# Former hard-coded thresholds, now supplied via -D and -F: FDR_filter=1, fold_diff_filter=3.

# Reads one tab-separated tag-density matrix and returns a reference to a hash
# mapping each row's first column (the ID) to that row's summed signal.
#
#   - The first line of the file is treated as a header and discarded.
#   - Fields from index 4 onward (i.e. the fifth column to the end) are summed;
#     the sum is stored formatted to three decimals.  A row with no such
#     fields is stored as 0.
#   - Blank lines are skipped so they cannot create an entry with an empty ID,
#     and trailing CR/LF is stripped so Windows line endings do not end up
#     attached to the last numeric field.
#   - If an ID occurs more than once, the last occurrence wins.
#
# The file is opened with the three-argument form of open so that a file name
# starting with '>' or ending with '|' is never interpreted as a mode or pipe.
# Dies with the system error text if the file cannot be opened.
sub read_summed_tag_density
{
    my ($matrix_file) = @_;

    open(my $matrix_fh, '<', $matrix_file) or die("Cannot open $matrix_file: $!");

    my $header_line = <$matrix_fh>;    # header row, intentionally unused

    my %density_of;
    while (my $line = <$matrix_fh>)
    {
        $line =~ s/[\r\n]+\z//;
        next if $line !~ /\S/;

        my @line_data = split(/\t/, $line);

        my $RPM = 0;
        my $count_bins = 0;
        foreach my $bin_value (@line_data[4 .. $#line_data])
        {
            $RPM += $bin_value;
            $count_bins++;
        }

        # Sum of the bins (not the average) is the per-ID signal.
        $density_of{$line_data[0]} = $count_bins > 0 ? sprintf("%.3f", $RPM) : 0;
    }
    close($matrix_fh);

    return \%density_of;
}

# ID => summed signal, for the genomic control (input) and the IP sample.
my %input_tagDensity = %{ read_summed_tag_density($INPUT_file) };
my %IP_tagDensity = %{ read_summed_tag_density($IP_file) };
# ---------------------------------------------------------------------------
# Per-gene FDR and fold-over-input.
#
# Genes are visited from the highest IP signal to the lowest.  For each gene:
#   target_count = number of IP genes visited so far (this one included)
#   decoy_count  = number of control (input) signals >= this gene's IP signal
#   FDR (%)      = decoy_count / target_count * 100
#
# Result: $IP_FDR{gene} = [ IP signal, FDR (%), fold over input ]
#
# Because both lists are walked in descending order, the number of control
# signals >= the current IP signal can only grow from one gene to the next.
# A single forward-moving index therefore yields the same count as rescanning
# the control list from the start for every gene, in linear time.
#
# Ties in IP signal are broken by gene ID so that the FDR assigned to tied
# genes is the same on every run (hash key order alone is not stable).
# ---------------------------------------------------------------------------
my @INPUT_tags = sort { $b <=> $a } values(%input_tagDensity);
my %IP_FDR = ();
my $target_count = 0;
my $decoy_count = 0;    # also serves as the index of the next unchecked control signal

# Fold value assigned when the control has an entry for the gene but its
# signal is 0: "NA" or 0 would penalize such genes unnecessarily.
my $fold_when_input_is_zero = 10;

foreach my $gene (sort { $IP_tagDensity{$b} <=> $IP_tagDensity{$a} || $a cmp $b } keys %IP_tagDensity)
{
    $target_count++;
    my $val = $IP_tagDensity{$gene};

    # Advance past every control signal that is at least as large as $val.
    $decoy_count++
        while ($decoy_count <= $#INPUT_tags && $INPUT_tags[$decoy_count] >= $val);

    # Fold over input: ratio when the control signal is positive, the fixed
    # substitute when it is exactly 0, and 0 when the gene is absent from the
    # control (or its control signal is negative).
    my $fold_diff = 0;
    if (exists($input_tagDensity{$gene}))
    {
        my $input_val = $input_tagDensity{$gene};
        if ($input_val > 0)
        {
            $fold_diff = $val / $input_val;
        }
        elsif ($input_val == 0)
        {
            $fold_diff = $fold_when_input_is_zero;
        }
    }

    $IP_FDR{$gene} = [
        $val,
        sprintf("%.3f", ($decoy_count / $target_count) * 100),
        sprintf("%.3f", $fold_diff),
    ];
}
# ---------------------------------------------------------------------------
# Threshold selection and output.
#
# The signal threshold is the lowest IP signal among the genes whose FDR is
# within the requested limit.  It stays undefined when no gene qualifies; in
# that case nothing passes and only the header line is written.  (Starting
# from 0 instead would let every gene through the FDR gate precisely when
# none of them met it.)
#
# Each entry of %IP_FDR is [ IP signal, FDR (%), fold over input ].
# ---------------------------------------------------------------------------
my $threshold_value;
foreach my $gene (keys %IP_FDR)
{
    my ($signal, $fdr) = @{ $IP_FDR{$gene} }[0, 1];
    next if $fdr > $FDR_filter;
    if (!defined($threshold_value) || $signal < $threshold_value)
    {
        $threshold_value = $signal;
    }
}

# Three-argument open with a lexical handle: the file name is never parsed
# for a mode, and the handle is scoped to this section.
open(my $out_fh, '>', $outputFileName) or die("Cannot create OUTPUT file $outputFileName: $!");
print {$out_fh} "ID\tRPM\tFDR\tFoldOverInput\n";

if (defined($threshold_value))
{
    # Highest signal first; ties are ordered by ID so the output is reproducible.
    foreach my $gene (sort { $IP_FDR{$b}[0] <=> $IP_FDR{$a}[0] || $a cmp $b } keys %IP_FDR)
    {
        my $record = $IP_FDR{$gene};
        next if $record->[0] < $threshold_value;
        next if $record->[2] < $fold_diff_filter;
        print {$out_fh} join("\t", $gene, @{$record}), "\n";
    }
}
else
{
    warn "No gene has FDR <= $FDR_filter; only the header was written to $outputFileName\n";
}

# Buffered write errors (e.g. a full disk) only surface when the handle is closed.
close($out_fh) or die("Cannot finish writing OUTPUT file $outputFileName: $!");

