#!/usr/bin/perl
use strict;
use warnings;

# Wall-clock start, reported as elapsed seconds at the end of the run.
my $start_time = time();

## known bug - if read spans two bins, both bins get one count. change to add fraction of read in the bin to the count.

# Positional command-line arguments.
my $bin_file = shift; # BED of genome bins (output file from bedtools makewindows)
my $reads_file = shift; # BED of experiment (IP) reads, converted from the sorted BAM
my $input_file = shift; # BED of input (control) reads, converted from the sorted BAM
my $output_file = shift; # prefix for all output files
# %bin_array maps "chr:start:end" to the bin's tab-separated line plus two count columns.
my (%bin_array, %ordered);
my ($bin_size, $new_bin_start, $new_bin_end);
# Both sums must start at 0; a scalar on the right-hand side would only initialise the first one.
my ($key_exp_sum, $key_input_sum) = (0, 0);
# Number of read/bin hits that fell on a bin absent from the bin file.
my $outside_bins_provided = 0;
my $old_chr = "";

### Open the bed file containing the binned coordinates for the entire genome and save it in a hash. Add two more columns at the same time.

# Three-argument open with a lexical handle: the file name can never be read as an open mode.
open (my $bins_fh, "<", $bin_file) or die "Cannot open bin file '$bin_file': $!\n";

while (my $line = <$bins_fh>) {
	chomp($line);
	next unless $line =~ /\S/; # ignore blank lines
	my @sl = split(/\t/, $line);
	# The first bin read defines the bin width used for the whole genome.
	$bin_size = $sl[2] - $sl[1] unless defined $bin_size;
	# Bins shorter than the bin width (e.g. the last bin of a chromosome) are keyed
	# by their full-width end, so that lookups computed as start + bin width find them.
	# The stored line keeps the coordinates exactly as they appear in the file.
	my $key_end = $sl[1] + $bin_size;
	$bin_array{"$sl[0]:$sl[1]:$key_end"} = "$line\t0\t0";
}

close $bins_fh;

# A missing or zero bin width would make the bin arithmetic in the counting step meaningless.
die "No usable bins found in '$bin_file'\n" unless defined $bin_size && $bin_size > 0;
print "Binned file loaded to memory\n";

### Count the number of reads in each bin. Counts for the experiment (IP) reads are added to column 4.

# count_reads_per_bin($bed_file, $size, $counter)
#
# Reads a tab-separated BED file of reads and, for every read, calls
# $counter->($chr, $bin_start, $bin_end) once for each bin the read overlaps.
#   $bed_file - path to the BED file (columns: chr, start, end, ...)
#   $size     - bin width; bins are assumed to start at multiples of it
#   $counter  - code reference that records one hit for the given bin
# Dies if the bin width is not positive or the file cannot be opened.
# Blank lines are ignored; lines without integer start/end are skipped with a warning.
sub count_reads_per_bin {
	my ($bed_file, $size, $counter) = @_;
	die "Bin size must be a positive number\n" unless defined $size && $size > 0;
	open (my $bed_fh, "<", $bed_file) or die "Cannot open reads file '$bed_file': $!\n";

	while (my $line = <$bed_fh>) {
		$line =~ s/\r?\n\z//; # strip LF or CRLF line endings
		next unless $line =~ /\S/;
		my ($chr, $start, $end) = split(/\t/, $line);
		unless (defined $end && $start =~ /\A\d+\z/ && $end =~ /\A\d+\z/) {
			warn "Skipping malformed line $. of $bed_file\n";
			next;
		}
		# Jump straight to the bin holding the read start, so the result
		# does not depend on the order of the reads in the file.
		my $bin_start = $start - ($start % $size);
		my $bin_end = $bin_start + $size;
		$counter->($chr, $bin_start, $bin_end);
		# BED end coordinates are exclusive: a read ending exactly on a bin
		# boundary does not reach into the next bin, hence ">" and not ">=".
		while ($end > $bin_end) {
			$bin_start = $bin_end;
			$bin_end += $size;
			$counter->($chr, $bin_start, $bin_end);
		}
	}

	close $bed_fh;
}

# Experiment (IP) reads are counted into column 4.
count_reads_per_bin($reads_file, $bin_size, \&change_count);
print "Count done for experiment file\n";

### Count the number of reads in each bin. Counts for the input reads are added to column 5.

count_reads_per_bin($input_file, $bin_size, \&change_count_input);
print "Count done for input file\n";

### Get the sum of the reads of the input and ip reads and write them to a file <name>.raw

open (my $raw_fh, ">", "$output_file.raw") or die "Cannot write '$output_file.raw': $!\n";

# Dump every bin with its raw counts while totalling the IP (column 4) and input (column 5) reads.
foreach my $key (keys %bin_array) {
	my @key_sl = split(/\t/, $bin_array{$key});
	$key_exp_sum += $key_sl[3];
	$key_input_sum += $key_sl[4];
	print $raw_fh "$bin_array{$key}\n";
}

# Buffered write errors only surface at close, so check it.
close $raw_fh or die "Error while writing '$output_file.raw': $!\n";

# Without any input reads the scaling factor is undefined; stop with a clear message
# instead of Perl's generic "Illegal division by zero".
die "No input reads were counted in any bin; cannot scale the input to the experiment\n" unless $key_input_sum;

# Scaling factor that brings the total input count up (or down) to the total IP count.
my $ratio = $key_exp_sum / $key_input_sum;

### Normalise the input reads (column 5) with the ratio.

# Rescale the input count (column 5) of every bin by $ratio, rewriting the stored line in place.
# The loop variable aliases each hash value, so assigning to it updates %bin_array.
for my $bin_line (values %bin_array) {
	my @columns = split(/\t/, $bin_line);
	$columns[4] = $columns[4] * $ratio;
	$bin_line = join("\t", @columns);
}

### Get a ratio of the normalised reads and write them to a file.

# Running bin number, used only to label column 4 of the .normalised file.
my $count = 0;

open (my $norm_fh, ">", "$output_file.normalised") or die "Cannot write '$output_file.normalised': $!\n";
# The bedgraph (columns 1,2,3,5 of the .normalised file) is written directly here.
# Running an external cut on the .normalised file while it is still open would read
# a file whose buffered tail has not been flushed yet.
open (my $bedgraph_fh, ">", "$output_file.bedgraph") or die "Cannot write '$output_file.bedgraph': $!\n";

foreach my $key (keys %bin_array) {
	$count++;
	my @key_sl = split(/\t/, $bin_array{$key});
	my $ip = $key_sl[3];
	my $input = $key_sl[4];
	# Natural log of IP / normalised input. Bins with no input or no IP reads get 0.
	my $log_ratio = 0;
	if ($input > 0) {
		my $ip_over_input = $ip / $input;
		$log_ratio = $ip_over_input > 0 ? log($ip_over_input) : $ip_over_input;
	}
	$key_sl[4] = $log_ratio;
	$key_sl[3] = "Bin $count";
	my $out_line = join("\t", @key_sl);
	print $norm_fh "$out_line\t$ip\t$input\n";
	print $bedgraph_fh join("\t", @key_sl[0, 1, 2, 4]), "\n";
}

close $bedgraph_fh or die "Error while writing '$output_file.bedgraph': $!\n";
close $norm_fh or die "Error while writing '$output_file.normalised': $!\n";

### subs to perform count for input and ip.

# change_count($chr, $bin_start, $bin_end)
# Adds one to the experiment (IP) count, column 4, of the bin keyed "chr:start:end"
# in %bin_array. A bin that is not in %bin_array bumps $outside_bins_provided instead.
sub change_count {
	my ($chr, $bin_start, $bin_end) = @_;
	my $bin_key = "$chr:$bin_start:$bin_end";
	if (!exists $bin_array{$bin_key}) {
		$outside_bins_provided++;
	}
	else {
		# The bin is stored as one tab-separated line: unpack, bump column 4, repack.
		my @fields = split(/\t/, $bin_array{$bin_key});
		$fields[3] = $fields[3] + 1;
		$bin_array{$bin_key} = join("\t", @fields);
	}
}

# change_count_input($chr, $bin_start, $bin_end)
# Adds one to the input count, column 5, of the bin keyed "chr:start:end"
# in %bin_array. A bin that is not in %bin_array bumps $outside_bins_provided instead.
sub change_count_input {
	my ($chr, $bin_start, $bin_end) = @_;
	my $bin_key = "$chr:$bin_start:$bin_end";
	if (!exists $bin_array{$bin_key}) {
		$outside_bins_provided++;
	}
	else {
		# The bin is stored as one tab-separated line: unpack, bump column 5, repack.
		my @fields = split(/\t/, $bin_array{$bin_key});
		$fields[4] = $fields[4] + 1;
		$bin_array{$bin_key} = join("\t", @fields);
	}
}

print "Total number of reads not used are $outside_bins_provided, as they are outside the bins provided\n" if $outside_bins_provided > 0;
#qx(rm $output_file.raw);

my $unsorted_bedgraph = "$output_file.bedgraph";
my $sorted_bedgraph = "$output_file.sorted.bedgraph";
# Chromosome sizes for bedGraphToBigWig: an optional fifth command-line argument,
# falling back to the mm10 file this script has always used.
my $chrom_sizes = defined $ARGV[0] ? shift @ARGV : "/data/as/annotation/Mus_musculus/UCSC/mm10/GenomeStudio/Mus_musculus/UCSC-mm10/ChromInfo.txt";

# External tools are run in list form so file names never pass through a shell.
{
	# bedGraphToBigWig expects chromosomes in byte order; force the C locale for sort.
	local $ENV{LC_ALL} = "C";
	system("sort", "-k1,1", "-k2,2n", "-o", $sorted_bedgraph, $unsorted_bedgraph) == 0
		or die "sort failed for '$unsorted_bedgraph' (status $?)\n";
}
# The unsorted file is only removed once the sorted copy exists.
unlink($unsorted_bedgraph) or warn "Could not remove '$unsorted_bedgraph': $!\n";
# The bedgraph outputs remain valid if the bigWig conversion fails, so only warn.
system("bedGraphToBigWig", $sorted_bedgraph, $chrom_sizes, "$output_file.bw") == 0
	or warn "bedGraphToBigWig failed for '$sorted_bedgraph' (status $?)\n";
#Take this out later
my @ratio_files = glob("Ratios.*");
if (@ratio_files) {
	system("mv", @ratio_files, "../c_newRatios/") == 0
		or warn "Could not move Ratios.* files to ../c_newRatios/ (status $?)\n";
}

my $end_time = time();

my $time_taken = $end_time - $start_time;

print "Job took $time_taken seconds\n";
