#!/usr/bin/perl

## Compile reports of annotated depths from znt4 files following cell treatment
## nucleotide occupancy
## Utilizes ZNT4 file types (binary depths) and annotation from sacCer2.chr/seq_gene.md and sacCer2.chr/yeast_TSS-converted.txt

# Pre-flight checks: a results-file prefix plus at least one dataset folder
# must be given, and both annotation files must exist under sacCer2.chr/
# relative to the current directory.
die "Requires 2 or more arguments [ResultsFileName] [One Or More ZNT4 Subfolders]\n"
	if (!defined $ARGV[1]);
die "Missing file sacCer2.chr/seq_gene.md\n"
	if (!-e "sacCer2.chr/seq_gene.md");
die "Missing file sacCer2.chr/yeast_TSS-converted.txt\n"
	if (!-e "sacCer2.chr/yeast_TSS-converted.txt");

use POSIX;
use List::Util qw[min max];
# Command-line handling.
# The first argument is the prefix for every report file name; each remaining
# argument names a dataset folder.
my $filebase = shift;
# Index (within the dataset arguments) of the last "sacCer2.chr" entry that
# was dropped, or -1 when none was given.
my $skipset = -1;
# Per-chromosome file base names inside each dataset folder.
my @chra = ("2micron","chrI","chrII","chrIII","chrIV","chrV","chrVI","chrVII","chrVIII","chrIX","chrX","chrXI","chrXII","chrXIII","chrXIV","chrXV","chrXVI");
# Chromosome labels as matched against the chromosome column of seq_gene.md;
# index-aligned with @chra.
my @chro = ("2micron","I","II","III","IV","V","VI","VII","VIII","IX","X","XI","XII","XIII","XIV","XV","XVI");
#my @sets = ("INPUT_294_A", "INPUT_294_U", "INPUT_296_A", "INPUT_296_U", "IP_294_A", "IP_294_U", "IP_296_A", "IP_296_U");
my @sets;
for (my $x = 0; $x < @ARGV; $x++) {
	my $setname = $ARGV[$x];
	$setname =~ s/\/$//; #remove trailing / from folder names
	if ($setname eq "sacCer2.chr") {
		# The annotation folder is not a dataset (presumably it gets picked
		# up when the folders are passed through a shell glob). It is
		# dropped right here: the loop index does not exist once the loop
		# has ended, so it cannot be used to splice the entry out afterwards.
		$skipset = $x;
		next;
	}
	push @sets, $setname;
}

# Number of positions summed per chromosome when computing the total depth of
# a dataset; index-aligned with @chra (presumably the sacCer2 chromosome
# lengths -- confirm against the assembly).
my @totals = (
	6318,     # 2micron
	230208,   # chrI
	813178,   # chrII
	316617,   # chrIII
	1531919,  # chrIV
	576869,   # chrV
	270148,   # chrVI
	1090947,  # chrVII
	562643,   # chrVIII
	439885,   # chrIX
	745742,   # chrX
	666454,   # chrXI
	1078175,  # chrXII
	924429,   # chrXIII
	784333,   # chrXIV
	1091289,  # chrXV
	948062,   # chrXVI
);

# Result tables. The depth tables are keyed {feature id}{feature name}{dataset};
# %lengths is keyed {feature id}{feature name}.
my (
	%tssvals,      # summed depth over the TSS window, or "NA"
	%promvals,     # summed depth over the upstream (promoter) window
	%rawvals,      # summed depth over the feature itself
	%lengths,      # feature length in bases
	%setdepth,     # declared but not used by the code in this file
	%downstrvals,  # summed depth over the downstream window
);

# Depth files already reported as unreadable, so each is warned about once.
our %find_depths_warned;

# find_depths(FILEBASE, START, END)
#
# Returns the sum of the per-base depths over the 1-based, inclusive
# coordinate range START..END.
#
# FILEBASE is a path without extension. "FILEBASE.znt4" is used when it
# exists (one 32-bit little-endian unsigned value per base); otherwise
# "FILEBASE.znt" is used (one unsigned byte per base).
#
# Coordinates below 1 are clamped to 1 and positions past the end of the file
# contribute nothing. An empty range or an unreadable file yields 0; the
# latter also emits a single warning per file.
sub find_depths {
	my ($filein, $start, $end) = @_;

	my ($fname, $width, $template) = ("$filein.znt4", 4, "V*");
	unless (-e $fname) {
		($fname, $width, $template) = ("$filein.znt", 1, "C*");
	}

	# Flanking windows near a chromosome start can reach below position 1.
	# A negative seek fails and leaves the handle at offset 0, which would
	# sum a window of the right size in the wrong place, so clamp instead.
	$start = 1 if (!defined $start || $start < 1);
	return 0 if (!defined $end || $end < $start);

	my $zntfile;
	unless (open($zntfile, '<', $fname)) {
		warn "Cannot open $fname: $!\n" unless $find_depths_warned{$fname}++;
		return 0;
	}
	binmode $zntfile;
	unless (seek($zntfile, ($start - 1) * $width, 0)) {
		warn "Cannot seek in $fname: $!\n";
		close($zntfile);
		return 0;
	}

	# Read in bounded chunks rather than one record per call; this keeps the
	# whole-chromosome sums fast without holding millions of values at once.
	my $chunk = 65536;
	my $sum = 0;
	my $remaining = $end - $start + 1;
	while ($remaining > 0) {
		my $want = $remaining < $chunk ? $remaining : $chunk;
		my $got = read($zntfile, my $buffer, $want * $width);
		last unless $got; # 0 at end of file, undef on a read error
		# unpack ignores a trailing partial record, so it adds nothing.
		$sum += $_ foreach unpack($template, $buffer);
		$remaining -= $want;
	}
	close($zntfile);
	return $sum;
}

# Open the six report files, all named from the $filebase prefix. Each open is
# checked so an unwritable location stops the run immediately instead of
# silently producing no output after the depth computation has finished.
open(COMBINED, '>', "${filebase}_CombinedDataRaw.txt")
	or die "Cannot write ${filebase}_CombinedDataRaw.txt: $!\n";
open(OUTPROM, '>', "${filebase}_UpstreamRaw.txt")
	or die "Cannot write ${filebase}_UpstreamRaw.txt: $!\n";
open(OUTTSS, '>', "${filebase}_TSSpromoterRaw.txt")
	or die "Cannot write ${filebase}_TSSpromoterRaw.txt: $!\n";
open(OUTRAW, '>', "${filebase}_GeneRaw.txt")
	or die "Cannot write ${filebase}_GeneRaw.txt: $!\n";
open(OUTAVG, '>', "${filebase}_GeneLengthAveraged.txt")
	or die "Cannot write ${filebase}_GeneLengthAveraged.txt: $!\n";
open(OUTDS, '>', "${filebase}_DownstreamRaw.txt")
	or die "Cannot write ${filebase}_DownstreamRaw.txt: $!\n";

# Transcription start site per feature name, read from the tab-separated TSS
# table. The name is taken from column 2 and the strand from column 7; the
# coordinate comes from column 5 on the "+" strand and from column 6
# otherwise (presumably the two ends of the annotated interval -- confirm
# against the table's format).
my %tss;

open(TSSFILE, '<', "sacCer2.chr/yeast_TSS-converted.txt")
	or die "Cannot read sacCer2.chr/yeast_TSS-converted.txt: $!\n";
while (my $line = <TSSFILE>) {
	chomp $line;
	my @arr = split(/\t/, $line);
	# Blank or truncated lines carry no feature name; skip them rather than
	# storing an entry under an empty key.
	next unless (defined $arr[1]);
	if (defined $arr[6] && $arr[6] eq "+") {
		$tss{$arr[1]} = $arr[4];
	} else {
		$tss{$arr[1]} = $arr[5];
	}
}
close(TSSFILE);

# Parse the gene annotation once and group the usable rows by chromosome, so
# the file is not re-read and re-split for every chromosome/dataset pair.
# Column order follows the split below (the names match the NCBI seq_gene.md
# layout -- confirm against the actual file). Comment rows and CDS rows are
# ignored.
my %features_on;
open(FEATFILE, '<', "sacCer2.chr/seq_gene.md")
	or die "Cannot read sacCer2.chr/seq_gene.md: $!\n";
while (my $line = <FEATFILE>) {
	chomp $line;
	my ($id, $chr, $start, $end, $dir, $contig, $cstart, $cend, $cdir, $feat, $fid, $type, @rest) = split(/\t/, $line);
	next unless (defined $chr);
	next if ($id =~ /^#/);
	next if ($type eq "CDS");
	push @{$features_on{$chr}}, [$start, $end, $dir, $feat, $fid];
}
close(FEATFILE);

# For every chromosome, dataset and feature, sum the depth over four windows:
#   gene body   start .. end
#   upstream    389 bases before to 61 bases after the strand-aware gene start
#   downstream  the 201 bases following the strand-aware gene end
#   TSS         the TSS plus the 200 bases upstream of it (when a TSS is known)
for my $i (0 .. $#chra) {
	print STDERR "Working on chr $chro[$i]\n";
	my $features = $features_on{$chro[$i]} || [];
	foreach my $dataset (@sets) {
		my $zfile = "$dataset/$chra[$i]";
		print STDERR "Working on $zfile...\n"; 
		foreach my $feature (@$features) {
			my ($start, $end, $dir, $feat, $fid) = @$feature;
			my ($proms, $prome, $dss, $dse);
			# A TSS window start of 0 means "no usable TSS window".
			my ($tsss, $tsse) = (0, 0);
			my $val = find_depths($zfile, $start, $end);
			if ($dir eq "+") {
				$proms = $start - 389;
				$prome = $start + 61;
				$dss = $end + 1;
				$dse = $end + 201;
				if (defined $tss{$feat}) {
					$tsse = $tss{$feat};
					$tsss = $tss{$feat} - 200;
				}
			} elsif ($dir eq "-") {
				# On the reverse strand the gene starts at $end, so the
				# windows are mirrored.
				$proms = $end - 61; 
				$prome = $end + 389;
				$dss = $start - 201;
				$dse = $start - 1;
				if (defined $tss{$feat}) {
					$tsss = $tss{$feat};
					$tsse = $tss{$feat} + 200;
				}
			}
			# Without a +/- strand there are no flanking windows to measure;
			# report them as "NA", the marker already used for a missing TSS.
			my $prom = defined $proms ? find_depths($zfile, $proms, $prome) : "NA";
			my $dsval = defined $dss ? find_depths($zfile, $dss, $dse) : "NA";
			my $lengthval = $end - $start + 1;
			$tssvals{$fid}{$feat}{$dataset} = "NA";
			if ($tsss > 0) {$tssvals{$fid}{$feat}{$dataset} = find_depths($zfile, $tsss, $tsse);}
			$rawvals{$fid}{$feat}{$dataset} = $val;
			$promvals{$fid}{$feat}{$dataset} = $prom;
			$downstrvals{$fid}{$feat}{$dataset} = $dsval;
			$lengths{$fid}{$feat} = $lengthval;
		}
	}
}

print STDERR "Writing reports\n";

# Header rows. The combined report carries five columns per dataset; the
# other reports carry one column per dataset.
print COMBINED join("\t", "#FEAT_ID", "FEATURE", "LENGTH",
	map { ("RAWGENE_$_", "AVGGENE_$_", "UPSTR_$_", "TSS_$_", "DOWNSTR_$_") } @sets), "\n";
print OUTRAW join ("\t", "#FEAT_ID", "FEATURE", @sets), "\n";
print OUTPROM join ("\t", "#FEAT_ID", "FEATURE", @sets), "\n";
print OUTDS join ("\t", "#FEAT_ID", "FEATURE", @sets), "\n";
print OUTAVG join ("\t", "#FEAT_ID", "FEATURE", "LENGTH", @sets), "\n";
print OUTTSS join ("\t", "#FEAT_ID", "FEATURE", @sets), "\n";

# One row per (feature id, feature name) in every report. Keys are sorted so
# the row order is the same on every run; plain hash order is not stable
# between runs.
foreach my $fid (sort keys %rawvals) {
	foreach my $feat (sort keys %{$rawvals{$fid}}) {
		my $length = $lengths{$fid}{$feat};
		my (@raw, @upstream, @downstream, @tsscols, @averaged, @combined);
		foreach my $dataset (@sets) {
			my $depth = $rawvals{$fid}{$feat}{$dataset};
			my $prom = $promvals{$fid}{$feat}{$dataset};
			my $dsval = $downstrvals{$fid}{$feat}{$dataset};
			my $tssval = $tssvals{$fid}{$feat}{$dataset};
			# A zero-length feature has no meaningful average; report "NA"
			# instead of dying on a division by zero and losing every report.
			my $avg = $length ? $depth / $length : "NA";
			push @raw, $depth;
			push @upstream, $prom;
			push @downstream, $dsval;
			push @tsscols, $tssval;
			push @averaged, $avg;
			push @combined, $depth, $avg, $prom, $tssval, $dsval;
		}
		print OUTRAW join("\t", $fid, $feat, @raw), "\n";
		print OUTPROM join("\t", $fid, $feat, @upstream), "\n";
		print OUTDS join("\t", $fid, $feat, @downstream), "\n";
		print OUTAVG join("\t", $fid, $feat, $length, @averaged), "\n";
		print COMBINED join("\t", $fid, $feat, $length, @combined), "\n";
		# Features without a known TSS are written with "NA" values. To leave
		# them out of the TSS report, skip this print when @tsscols holds "NA".
		print OUTTSS join("\t", $fid, $feat, @tsscols), "\n";
	}
}

print STDERR "Calculating totals\n";

# Every report ends with a "TOTAL SET DEPTH" row. The label sits in the
# FEAT_ID column and the FEATURE column is left empty, so the per-dataset
# values line up with the header columns.
my @reports = (
	["OUTTSS", \*OUTTSS],
	["OUTRAW", \*OUTRAW],
	["OUTPROM", \*OUTPROM],
	["OUTDS", \*OUTDS],
	["OUTAVG", \*OUTAVG],
	["COMBINED", \*COMBINED],
);
foreach my $report (@reports) {
	print {$report->[1]} "TOTAL SET DEPTH\t";
}

my $sizesw = 0;
foreach my $dataset (@sets) {
	print STDERR "$dataset...\n";
	my $total = 0;
	my $size = 0;
	for my $i (0 .. $#chra) {
		my $file = "$dataset/$chra[$i]";
		# Positions are 1-based: sum 1 .. length instead of starting at 0,
		# which only worked because the resulting negative seek failed.
		$total += find_depths($file, 1, $totals[$i]);
		$size += $totals[$i];
	}
	print OUTRAW "\t$total";
	print OUTPROM "\t$total";
	print OUTDS "\t$total";
	print OUTTSS "\t$total";
	# The LENGTH column is written once, ahead of the first dataset's values.
	if ($sizesw == 0) {$sizesw = 1; print OUTAVG "\t$size"; print COMBINED "\t$size";}
	my $avg = $total / $size;
	print OUTAVG "\t$avg";
	print COMBINED "\t$total\t$avg\t$total\t$total\t$total";
}

# Terminate the totals row and close every report. Buffered write errors only
# surface at close, so each close is checked.
my $failed = 0;
foreach my $report (@reports) {
	my ($name, $handle) = @$report;
	print {$handle} "\n";
	unless (close($handle)) {
		warn "Error closing $name: $!\n";
		$failed++;
	}
}
die "Failed to finish writing $failed report file(s)\n" if ($failed);
print STDERR "DONE\n";
