#! /usr/bin/perl

use warnings;
use strict;
use Statistics::Descriptive;


###############################################
# USAGE STATEMENT
unless (scalar @ARGV >= 3) {
	die "\nIncorrect command line arguments.\n".
		"Usage : Merge_SE_files.pl <Gene list> <HOMER 1> <DiffBind concentration 1> ... <HOMER N> <DiffBind concentration N> .\n";
}
###############################################

###############################################
# 0. Parse command line arguments
my $list = shift @ARGV;
my @files = @ARGV;
###############################################



##############################################
# 1. Parse gene list
my %list_hash = ();

open (LIST,$list) or die "Couldn't open $list: $!\n";

my $headerline = <LIST>;

# 3 columns: refseqID, entrez ID, gene symbol
while (my $line = <LIST>) {
	my @data = get_line_data ($line);
	$list_hash{$data[0]} = $line; # use refseq as reference
	
}

close LIST;
###############################################
	
	
###############################################
# 2. Parse concentrations and annotations
my %data_hash = ();
my $outheader = '';

my @sample_nums = ();

my $sample_idx = 0;
while (my $homer = shift @files) {	
	
	# get corresponding concentration matrix
	my $concentration = shift @files;
	
	# a. parse HOMER annotations
	open (HOMER,$homer) or die "Couldn't open $homer: $!\n";
	my %homer_hash = ();

	BLA:while (my $line = <HOMER>) {
		next BLA if ($. ==1); # skip header
		
		my @linedata = get_line_data($line);
		
		my $name = join("-",@linedata[1..3]); # join chr start end to make unique identifier
		$homer_hash{$name} = $linedata[10]; # extract RefSeq ID
		#print "$name\t$homer_hash{$name}\n";
	}
	
	close HOMER;
	#my @keys = keys %homer_hash;
	#print @keys."\n";
	
	# b. parse concentration
	open (CONC,$concentration) or die "Couldn't open $concentration: $!\n";
	
	# get header info
	my $header = <CONC>; # collect header
	my @headerdata = get_line_data($header);
	
	if ($sample_idx == 0) {
		$outheader .= join("\t",@headerdata[3..$#headerdata]);
	} else {
		$outheader .= "\t".join("\t",@headerdata[3..$#headerdata]);
	}
	#print join("\t",@headerdata[3..$#headerdata]).join("\t",@headerdata);;
	
	# record the number of samples
	my @tmp = @headerdata[3..$#headerdata];
	my $cur_sample_num = scalar @tmp;
	push(@sample_nums,$cur_sample_num);
	
	
	my %summary_gene_hash = ();
	# get one entry per gene, or maximum if more than one
	while (my $line = <CONC>) {
		
		my @linedata = get_line_data ($line);

		# the Diffbind has an offset in the name compared to HOMER (the beginning of diffbind is -1 HOMER), put the +1 to be able to compare
		my $coord = join("-",($linedata[0],$linedata[1]+1,$linedata[2])); # join chr start end to make unique identifier
		#print $coord."\n";
		my $refseq = $homer_hash{$coord};
		#print $refseq."\n";

		if ( defined($summary_gene_hash{$refseq}) ){
			# if already defined, look if new entry has higher average
			# extract existing mean
			
			#print $summary_gene_hash{$refseq}."\n";
			#print "$refseq\t$summary_gene_hash{$refseq}\n";
			my $curmean = get_mean(@{$summary_gene_hash{$refseq}});
			my $newmean = get_mean(@linedata[3..$#linedata]);
			#print "$refseq\t$curmean\t$newmean\n";

			if ($newmean > $curmean) { # if higher, replace
				my @newArray = @linedata[3..$#linedata];
				$summary_gene_hash{$refseq} = \@newArray;
			}

		} else {
			# if not already defined, enter new data
			my @newArray = @linedata[3..$#linedata];
			$summary_gene_hash{$refseq} = \@newArray;
		}
		
	}
	close CONC;
	
	#my @keys = keys %summary_gene_hash;
	#print @keys."\n";

	
	# c. extract and store concentration
	# now there is a unique entry for each gene
	foreach my $refseq (sort keys %summary_gene_hash) {
		#print $refseq."::\n";

		if ( defined($list_hash{$refseq}) ){
			#print "IN\n";
			# parse overlap bp columns			
			my $concentrations = join("\t",@{$summary_gene_hash{$refseq}});
			
			# autovivify if necessary, record current height vector
			${ $data_hash{$refseq} }[$sample_idx] = $concentrations;
		
		}
	}

	++$sample_idx;
}
###############################################

#print "$sample_idx\n";
###############################################
# 3. Output result
print "RefSeq\tEntrez\tSymbol\t".$outheader."\n";

#my @keys = keys %data_hash;
#print @keys."\n";

foreach my $gene (sort keys %data_hash) {
	
	my $height = $data_hash{$gene};
	my $annot = $list_hash{$gene};

	my $remainingfields = "";
	#print "$sample_idx\n";

	for (my $i = 0; $i < $sample_idx; ++$i) {
		#print "$i\n";

		# determine if the gene had a K4 domain in $i-th sample
		if (defined ${ $data_hash{$gene} }[$i]) {
			my $heights = ${ $data_hash{$gene} }[$i] ;
			#get the peak info as is
			$remainingfields .= "\t".$heights;

		} else {
			# record 0 if no recorded data in tissue
			$remainingfields .= "\t0" x $sample_nums[$i] ; # leading tab to account for joining
			
		}
	
	}

	chomp $list_hash{$gene};
	my $outline = $list_hash{$gene}.$remainingfields;

	print $outline."\n";
	
}
###############################################


exit;


#####################################
######       SUBROUTINES       ######
#####################################

###########################################################
# a subroutine that separates fields from a data line and
# returns them in an array

sub get_line_data {

		my $line = $_[0];
		
		chomp $line;
		
		my @linedata = split(/\t/, $line);
			 
		return @linedata;
}


###########################################################
# a subroutine that returns statistical data 
sub get_max {
	#gets filename to analyze from the main program
	my @lengths = @_;

	# create objet for stats (new full statistics object)
	my $stat_FULL = Statistics::Descriptive::Full->new();
	$stat_FULL->clear();

	# add data to the object
	$stat_FULL->add_data(@lengths);
	$stat_FULL->sort_data();
	my $max = $stat_FULL->max();

	return $max;
}

###########################################################
# a subroutine that returns statistical data 

sub get_mean {
	#gets filename to analyze from the main program
	my @lengths = @_;

	# create objet for stats (new full statistics object)
	my $stat_FULL = Statistics::Descriptive::Full->new();
	$stat_FULL->clear();

	# add data to the object
	$stat_FULL->add_data(@lengths);
	$stat_FULL->sort_data();
	my $mean = $stat_FULL->mean();

	return $mean;
}

