#!/usr/bin/perl
use strict;
use warnings;
use Getopt::Std;

# Command-line interface.  -q, -r and -s are all required:
#   -q <query>        sample name; "<query>.rep.bed" is the BED file that is read
#   -r <readnum>      total number of reads for the sample (must be > 0)
#   -s <seq_db_file>  sequence file; genes named in its ">" header lines are
#                     added to the report
#   -h                print the usage text and exit
my $usage_text = <<"END_USAGE";
Usage: $0 -q <query> -r <readnum> -s <seq_db_file>
  -q  sample name; the input file is <query>.rep.bed
  -r  total number of reads for the sample (positive number)
  -s  sequence database file with ">" header lines
  -h  show this help
END_USAGE

my %option;
# getopts() returns false when it met an option it does not know
my $options_ok = getopts( 'q:r:s:h', \%option );

my ($query, $readnum, $seq_db_file);

if ( $option{h} ) {
    print $usage_text;
    exit 0;
}

# Name the missing options instead of interpolating undefined values into the
# error message (which only produced "uninitialized value" warnings).
my @missing_options = grep { !defined $option{$_} || $option{$_} eq '' } qw(q r s);
if ( !$options_ok || @missing_options ) {
    my $detail = @missing_options
        ? ' (missing: ' . join( ', ', map { "-$_" } @missing_options ) . ')'
        : '';
    die "Proper parameters not passed$detail\n$usage_text";
}

# The read total is used as a divisor further down; reject anything that is not
# a positive number here rather than failing later with a division by zero.
if ( $option{r} !~ /\A(?:\d+\.?\d*|\.\d+)(?:[eE][-+]?\d+)?\z/ || $option{r} <= 0 ) {
    die "-r must be a positive number, got '$option{r}'\n$usage_text";
}

$query       = $option{q};
$readnum     = $option{r};
$seq_db_file = $option{s};

# The BED file to read is named after the query: "<query>.rep.bed".
my $query_filename = join( '', $query, '.rep.bed' );

# Only this one file is processed.  The list could hold several hard-coded
# *.rep.bed names instead (e.g. KO_Asb1_Testes_AO33.rep.bed), or names taken
# from the command line; each file would also need its own entry in $rpm_hash.
my @filearr = ($query_filename);

# Express the read total in millions; raw counts are divided by this value to
# obtain reads per million.
$readnum /= 1000000;

# sample name => read total in millions, e.g. { "SRR609664.nostrna" => 64.126190 }
my $rpm_hash = { $query => $readnum };

printf STDERR "%s => %s\n", $query, $readnum;




# $gene_count_hash->{$filename} holds the per-reference read totals of one file.
my $gene_count_hash = {};
# $unified_hash collects every reference name seen in any input file.
my $unified_hash = {};

foreach my $filename (@filearr) {
    # produceGeneCount() updates the hash it is given and hands it back; assign
    # the result to the file-level variable instead of shadowing it with a new
    # lexical of the same name.
    my $gene_hash;
    ($gene_hash, $unified_hash) = produceGeneCount($filename, $unified_hash);
    $gene_count_hash->{$filename} = $gene_hash;
}

#build the unified gene list
#my $keylist = keys %$unified_hash;
#print "Unified count: $keylist\n";


# Collapse the unified key list to one entry per gene name.  Keys are expected
# to look like "NAME=LENGTH"; when several keys share a NAME, the key with the
# greatest LENGTH is kept.
#   $name_hash->{NAME} = full key of the kept entry
#   $len_hash->{NAME}  = LENGTH of the kept entry
my $len_hash = {};
my $name_hash = {};
foreach my $key (keys %$unified_hash) {
    my @key_fields = split(/=/, $key);
    if (@key_fields > 2) {
        die "invalid characters like =\n";
    }
    my ($gene_name, $new_gene_length) = @key_fields;

    # Without a length the comparison below warns and the RPKM computation
    # later divides by zero, so stop here with a message that names the key.
    if ( !defined $new_gene_length ) {
        die "malformed reference name '$key': expected NAME=LENGTH\n";
    }

    my $is_new_or_longer = !exists $name_hash->{$gene_name}
        || $len_hash->{$gene_name} < $new_gene_length;
    if ($is_new_or_longer) {
        $name_hash->{$gene_name} = $key;
        $len_hash->{$gene_name}  = $new_gene_length;
    }
}

#-----------------------------------------------------------------------------
# This portion is needed for entries that are not present in the rep.bed file:
# scan the sequence database and add every gene that is still missing.
# Only header lines (starting with ">") are inspected; the text between the
# ">" and the first ":" is expected to be "NAME=LENGTH".
open(my $SEQ, "<", $seq_db_file)
    or die "unable to open file $seq_db_file: $!";

while ( my $line = <$SEQ> ) {
    chomp $line;
    next unless $line =~ /^>/;

    # drop the leading ">" and keep only the part before the first ":"
    my ($name_and_length) = split(/:/, substr($line, 1));
    next unless defined $name_and_length;          # bare ">" header

    my ($gene_name, $new_gene_length) = split(/=/, $name_and_length);
    next unless defined $gene_name;                # nothing usable before ":"

    # genes already known from the rep.bed input keep their entry
    next if exists $name_hash->{$gene_name} && exists $len_hash->{$gene_name};

    if ( !defined $new_gene_length ) {
        warn "no length in sequence header '$line'; length left empty\n";
        $new_gene_length = '';
    }

    $name_hash->{$gene_name} = $gene_name;
    $len_hash->{$gene_name}  = $new_gene_length;
}
close $SEQ;
#-----------------------------------------------------------------------------



#--------------- produce the header row of the output ------------------------
# Two fixed columns, then three columns (raw count, RPM, RPKM) for every input
# file whose name matches the *.rep.bed pattern.
my @header_columns = ('Gene', 'Length');
for my $bed_file (@filearr) {
    next unless $bed_file =~ /(.*)\.rep.bed/;
    my $sample = $1;
    push @header_columns, $sample, $sample . '_RPM', $sample . '_RPKM';
}
print join("\t", @header_columns), "\n";
###############################

# Emit one tab-separated row per gene: name, length, then raw count, RPM and
# RPKM for every input file.
#
# Row order: names that are plain numbers come first in numeric order, every
# other name follows in string order.  (A bare numeric <=> on gene-name strings
# warns for each comparison and treats all non-numeric names as equal.)
my $plain_number_re = qr/\A[-+]?(?:\d+\.?\d*|\.\d+)(?:[eE][-+]?\d+)?\z/;
my @sorted_gene_names = sort {
    my $a_is_number = $a =~ $plain_number_re;
    my $b_is_number = $b =~ $plain_number_re;
      $a_is_number && $b_is_number ? ( $a <=> $b || $a cmp $b )
    : $a_is_number                 ? -1
    : $b_is_number                 ?  1
    :                                $a cmp $b;
} keys %$name_hash;

# The read total (in millions) of each file does not change per gene, so look
# it up once.
my %reads_in_millions_for;
foreach my $filename (@filearr) {
    if ( $filename =~ /(.*)\.rep.bed/ ) {
        $reads_in_millions_for{$filename} = $rpm_hash->{$1};
    }
}

foreach my $gene (@sorted_gene_names) {
    my $len      = $len_hash->{$gene};
    my $uqstring = $name_hash->{$gene};

    my $len_text = defined $len ? $len : '';
    print "$gene\t$len_text";

    # gene length in kilobases; 0 marks a missing or non-positive length
    my $kilobases = ( $len_text ne '' && $len_text > 0 ) ? $len_text / 1000 : 0;

    foreach my $filename (@filearr) {
        my $ind_hash = $gene_count_hash->{$filename};

        if ( !exists $ind_hash->{$uqstring} ) {
            print "\t0\t0\t0";
            next;
        }

        my $per_million = $reads_in_millions_for{$filename};
        if ( !$per_million ) {
            die "no usable read total for $filename; cannot compute RPM\n";
        }

        my $raw_read = $ind_hash->{$uqstring};
        my $rpm_read = $raw_read / $per_million;

        # RPKM needs a positive length; report NA instead of dying with
        # "Illegal division by zero" in the middle of the output.
        my $rpkm_text;
        if ($kilobases) {
            $rpkm_text = sprintf( "%.2f", $rpm_read / $kilobases );
        }
        else {
            warn "gene '$gene' has no usable length; RPKM reported as NA\n";
            $rpkm_text = 'NA';
        }

        # round to 2 decimal places
        my $rpm_text = sprintf( "%.2f", $rpm_read );

        print "\t$raw_read\t$rpm_text\t$rpkm_text";
    }
    print "\n";
}

#----------------------------------------------
# TODO (note kept from the original author): then build another hash with gene
# name as key and gene isoform as value.

# End of the main program; the subroutine definition follows.
exit;

# produceGeneCount($filename, $unified_hash)
#
# Sum per-reference read counts from one tab-separated BED-like file.
#
# Each data line carries the reference name in column 1 and a "READ:COUNT"
# label in column 4; COUNT is added to that reference's total.  Lines starting
# with "track" and lines with an empty column 1 (e.g. blank lines) are skipped.
# A line whose column 4 has no ":COUNT" part contributes 0.  Column 6 (strand)
# is not consulted, so both strands are counted.
#
# Parameters:
#   $filename      path of the file to read
#   $unified_hash  hash reference that is updated in place: the entry of every
#                  reference name is incremented once per data line
#
# Returns the list ($gene_count, $unified_hash): a new hash reference mapping
# reference name to summed count, and the unified hash reference passed in.
#
# Dies when the file cannot be opened.
sub produceGeneCount {
    my ($filename, $unified_hash) = @_;
    my $gene_count = {};

    open(my $INFILE, "<", $filename)
        or die "unable to open file $filename: $!";

    while ( my $line = <$INFILE> ) {
        next if ($line=~/^track/);
        chomp $line;

        my @arr = split(/\t/, $line);
        my $refgene = $arr[0];
        # a blank line would otherwise register a bogus '' reference
        next unless defined $refgene && $refgene ne '';

        # column 4 is "READ:COUNT"; only the count is needed
        my $read_target = defined $arr[3] ? $arr[3] : '';
        my (undef, $count) = split(/:/, $read_target);
        $count = 0 unless defined $count && $count ne '';

        $gene_count->{$refgene} += $count;
        $unified_hash->{$refgene}++;
    }
    close $INFILE;

    return ($gene_count, $unified_hash);
}
