#! /usr/bin/perl

use warnings;
use strict;
use Statistics::Descriptive;


###############################################
# USAGE STATEMENT
unless (scalar @ARGV >= 3) {
	die "\nIncorrect command line arguments.\n".
		"Usage : Merge_H3K4me3_breadth_files.pl <Gene list> <Breadth file 1>... <Breadth file n>.\n";
}
###############################################

###############################################
# 0. Parse command line arguments
my $list = shift @ARGV;
my @breadthsfiles = @ARGV;

###############################################


###############################################
# 1. Parse gene list
my %list_hash = ();

open (LIST,$list) or die "Couldn't open $list: $!\n";

my $headerline = <LIST>;

# 3 columns: refseqID, entrez ID, gene symbol
while (my $line = <LIST>) {
	my @data = get_line_data ($line);
	$list_hash{$data[1]} = $line;
}

close LIST;
###############################################


###############################################
# 2. Parse annotated breadth list

#my.cols <- c(10,13,16,19,22,25)

my %data_hash = ();

my $file_idx = 0;

foreach my $file (@breadthsfiles) {

	open (FILE,$file) or die "Couldn't open $file: $!\n";
	
	my $header = <FILE>;
	
	while (my $line = <FILE>) {
		
		my @linedata = get_line_data ($line);
		
		if ( defined($list_hash{$linedata[5]}) ){
			
			# parse overlap bp columns
			my @curbreadths = ( $linedata[9], $linedata[12], $linedata[15], $linedata[18], $linedata[21], $linedata[24]);
			
			my $breadth_cur = join("\t",@curbreadths);
			
			# autovivify if necessary, push current breadths vector
			push(@{ ${ $data_hash{$linedata[5]} }[$file_idx] }, $breadth_cur);

		}
		
		
	}
	
	
	
	close FILE;
	++$file_idx;

}
###############################################

###############################################
# 3. Output

foreach my $gene (sort keys %data_hash) {
	
	my $breadth = $data_hash{$gene};
	my $annot = $list_hash{$gene};

	my $remainingfields = "";
	
	for (my $i = 0; $i < $file_idx; ++$i) {
		
		# determine if the gene had a K4 domain in $i-th sample
		if (defined @{ ${ $data_hash{$gene} }[$i] } ) {
			my @breadths = @{ ${ $data_hash{$gene} }[$i] };
			
			# if there is one or more peaks, get max breadth
			if ( scalar @breadths == 1 ) {
				#get the peak info as is
				$remainingfields .= "\t".$breadths[0];

			} else {
				#get broadest peak associated to gene

				my @relbre = ();
				
				foreach my $peak (@breadths) {
					# get mean breadth of the peak and record it
					my @breadthdata = get_line_data($peak);
					my $meancurbreadth = get_mean(@breadthdata);
					push(@relbre,$meancurbreadth);
				}
				
				my $maxB = get_max (@relbre);
				my ( $index ) = grep { $relbre[$_] eq $maxB } 0..$#relbre; 
				
				$remainingfields .= "\t".$breadths[$index];

			}
			
		} else {
			# 6 0 if no recorded breadth in tissue
			$remainingfields .= "\t0\t0\t0\t0\t0\t0" ; # leading tab to account for joining
		
		}
	
	}

	chomp $list_hash{$gene};
	my $outline = $list_hash{$gene}.$remainingfields;

	print $outline."\n";
	
}


###############################################


exit;


#####################################
######      SUBROUTINES        ######
#####################################

###########################################################
# a subroutine that separates fields from a data line and
# returns them in an array

sub get_line_data {

    my $line = $_[0];
    
    chomp $line;
    
    my @linedata = split(/\t/, $line);
       
    return @linedata;
}


###########################################################
# a subroutine that returns statistical data 
sub get_max {
	#gets filename to analyze from the main program
	my @lengths = @_;

	# create objet for stats (new full statistics object)
	my $stat_FULL = Statistics::Descriptive::Full->new();
	$stat_FULL->clear();

	# add data to the object
	$stat_FULL->add_data(@lengths);
	$stat_FULL->sort_data();
	my $max = $stat_FULL->max();

	return $max;
}

###########################################################
# a subroutine that returns statistical data 

sub get_mean {
	#gets filename to analyze from the main program
	my @lengths = @_;

	# create objet for stats (new full statistics object)
	my $stat_FULL = Statistics::Descriptive::Full->new();
	$stat_FULL->clear();

	# add data to the object
	$stat_FULL->add_data(@lengths);
	$stat_FULL->sort_data();
	my $mean = $stat_FULL->mean();

	return $mean;
}

