#Take in the gDNA, minusPI and plusPI SAM alignment files of fastUTR reads for one sample
# and create a table of summary statistics for each insert
#
#Parameters
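#-base=name Prefix to use to find alignment files for this sample (default: shortCD8).
#           The files read are sorted_<name>_gDNA_alignment.txt,
#           sorted_<name>_minusPI_alignment.txt and sorted_<name>_plusPI_alignment.txt
#-min=N     Only report barcodes with more than N gDNA reads (default: 10)
#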
#-barcoded Print out gDNA and gDNA-normalized RNA reads for each barcode
#-collapsed Print out one data point for each sequence that is the sum of all the barcodes
#
#Default mode is to print out summary statistics (i.e. number of barcodes and distribution information) for each sequence
#
#SAM format
#
#QNAME0,FLAG1,RNAME2,POS3,MAPQ4,CIGAR5,RNEXT6,PNEXT7,TLEN8,SEQ9,QUAL10

use List::Util 'max';
use List::Util 'sum';
use Statistics::Basic qw(:all);
use Data::Dumper qw(Dumper);
use Statistics::Distributions;



# Parse command-line switches of the form -name or -name=value into %PARAMS.
# A bare switch (or one with an empty value) is stored as 1; arguments that do
# not start with '-' are ignored.
foreach my $parameter ( @ARGV )
{
	next unless substr($parameter,0,1) eq '-';

	# Limit the split to two fields so that a value which itself contains '='
	# is kept whole instead of being cut off at the second '='.
	my ($pname, $pval) = split( '=', substr($parameter,1), 2 );
	$pval = 1 if !defined $pval || $pval eq '';
	$PARAMS{$pname} = $pval;
}

# Sample prefix used to build the three alignment file names (-base=name).
my $basename = 'shortCD8';
$basename = $PARAMS{base} if $PARAMS{base};

# A barcode is reported only when it has strictly more than this many gDNA
# reads (-min=N). Definedness is tested rather than truth so that an explicit
# -min=0 is honoured instead of silently falling back to the default.
my $minimum_gdna_reads = 10;
$minimum_gdna_reads = $PARAMS{min} if defined $PARAMS{min};

# Alignment files for the gDNA library and the two RNA libraries.
my $gdna  = 'sorted_' . $basename . '_gDNA_alignment.txt';
my $minus = 'sorted_' . $basename . '_minusPI_alignment.txt';
my $plus  = 'sorted_' . $basename . '_plusPI_alignment.txt';

# Library-wide totals of counted reads; the RNA libraries are later scaled by
# the gDNA total divided by their own total.
my $global_gdna_reads = 0;
my $global_minus_reads = 0;
my $global_plus_reads = 0;

# Read counts, keyed as
#   $fastutr{reference name}{barcode}{gdna_reads | minus_reads | plus_reads}.
# It must start out empty: initialising a hash from {} would insert a bogus
# key made from the stringified hash reference.
my %fastutr = ();

# Count gDNA reads per (reference name, barcode) pair and in total.
# Only records whose FLAG field is exactly 16 are counted (in the SAM format
# that is the reverse-strand bit with no other bit set). The barcode is the
# part of the read name before the first ':'.
# SAM columns: QNAME FLAG RNAME POS MAPQ CIGAR RNEXT PNEXT TLEN SEQ QUAL;
# only the first three are needed here.
# The open is checked: without the gDNA counts every RNA read is rejected
# later and the script would only fail with an unhelpful division by zero.
open(my $gdna_fh, '<', $gdna) or die "Cannot open gDNA alignment file '$gdna': $!\n";
while (my $line = <$gdna_fh>)
{
	chomp $line;
	my ($QNAME, $FLAG, $RNAME) = split(' ', $line);

	next unless $FLAG eq '16';
	my ($barcode, undef) = split(':',$QNAME);

	$fastutr{$RNAME}{$barcode}{gdna_reads}++;
	$global_gdna_reads++;
}
close $gdna_fh;

# Count minusPI reads per (reference name, barcode) pair and in total.
# Only records whose FLAG field is exactly 16 are considered, and a read is
# counted only if its barcode already has more than one gDNA read for the same
# reference. The barcode is the part of the read name before the first ':'.
# The open is checked so a missing file is reported by name instead of
# surfacing later as a division by zero.
open(my $minus_fh, '<', $minus) or die "Cannot open minusPI alignment file '$minus': $!\n";
while (my $line = <$minus_fh>)
{
	chomp $line;
	my ($QNAME, $FLAG, $RNAME) = split(' ', $line);

	next unless $FLAG eq '16';
	my ($barcode, undef) = split(':',$QNAME);

	# Probe with exists() first: a plain nested lookup would autovivify an
	# empty entry in %fastutr for every barcode that was never seen in gDNA.
	next unless exists $fastutr{$RNAME} && exists $fastutr{$RNAME}{$barcode};
	next unless ($fastutr{$RNAME}{$barcode}{gdna_reads} || 0) > 1;

	$fastutr{$RNAME}{$barcode}{minus_reads}++;
	$global_minus_reads++;
}
close $minus_fh;

# Count plusPI reads per (reference name, barcode) pair and in total.
# Only records whose FLAG field is exactly 16 are considered, and a read is
# counted only if its barcode already has more than one gDNA read for the same
# reference. The barcode is the part of the read name before the first ':'.
# The open is checked so a missing file is reported by name instead of
# surfacing later as a division by zero.
open(my $plus_fh, '<', $plus) or die "Cannot open plusPI alignment file '$plus': $!\n";
while (my $line = <$plus_fh>)
{
	chomp $line;
	my ($QNAME, $FLAG, $RNAME) = split(' ', $line);

	next unless $FLAG eq '16';
	my ($barcode, undef) = split(':',$QNAME);

	# Probe with exists() first: a plain nested lookup would autovivify an
	# empty entry in %fastutr for every barcode that was never seen in gDNA.
	next unless exists $fastutr{$RNAME} && exists $fastutr{$RNAME}{$barcode};
	next unless ($fastutr{$RNAME}{$barcode}{gdna_reads} || 0) > 1;

	$fastutr{$RNAME}{$barcode}{plus_reads}++;
	$global_plus_reads++;
}
close $plus_fh;

# Scale factors applied to the RNA read counts: total counted gDNA reads
# divided by the total counted reads of the respective RNA library.
# Stop with a readable message when a library contributed no counted reads;
# the division would otherwise abort with "Illegal division by zero".
die "No minusPI reads were counted from '$minus'; cannot normalize\n" unless $global_minus_reads;
die "No plusPI reads were counted from '$plus'; cannot normalize\n" unless $global_plus_reads;

my $gdna_minus_ratio = $global_gdna_reads/$global_minus_reads;
my $gdna_plus_ratio = $global_gdna_reads/$global_plus_reads;

#print Dumper \%fastutr;

# Per-reference accumulators, filled only for barcodes that pass the filter below:
#   greads / mreads / preads  raw gDNA, minusPI and plusPI counts, one entry per barcode
#   m / p                     normalized minusPI/gDNA and plusPI/gDNA ratios, one entry per barcode
#   gdna_reads, norm_minus_reads, norm_plus_reads
#                             sums over the kept barcodes
#   n                         number of kept barcodes
my %peaks;

if ($PARAMS{barcoded})
{
	print "Peakname\tBarcode\tgDNA_reads\tnorm_minus_reads\tnorm_plus_reads\tminus_ratio\tplus_ratio\n";
}

# References are visited in numeric order of the last '_'-separated field of
# their name; barcodes within a reference in plain string order.
my $by_trailing_number = sub { (split('_', $a))[-1] <=> (split('_', $b))[-1] };

for my $rname (sort $by_trailing_number keys %fastutr)
{
	for my $barcode (sort keys %{ $fastutr{$rname} })
	{
		my $counts = $fastutr{$rname}{$barcode};
		my ($gr, $mr, $pr) = @{$counts}{qw(gdna_reads minus_reads plus_reads)};

		# Put the RNA counts on the scale of the gDNA library.
		my $norm_mr = $mr * $gdna_minus_ratio;
		my $norm_pr = $pr * $gdna_plus_ratio;

		# Keep a barcode only when it has strictly more gDNA reads than the
		# threshold and at least one read in each RNA library.
		next if !($gr > $minimum_gdna_reads) || !$mr || !$pr;

		my $minus_ratio = $norm_mr/$gr;
		my $plus_ratio = $norm_pr/$gr;

		#next if $minus_ratio > 10 || $minus_ratio < 0.1 || $plus_ratio > 10 || $plus_ratio < 0.1;

		# The entry for this reference is created only now, so references
		# without any kept barcode never appear in %peaks.
		my $peak = ($peaks{$rname} ||= {});

		push @{ $peak->{greads} }, $gr;
		push @{ $peak->{mreads} }, $mr;
		push @{ $peak->{preads} }, $pr;
		push @{ $peak->{m} }, $minus_ratio;
		push @{ $peak->{p} }, $plus_ratio;

		$peak->{gdna_reads} += $gr;
		$peak->{norm_minus_reads} += $norm_mr;
		$peak->{norm_plus_reads} += $norm_pr;
		$peak->{n}++;

		if ($PARAMS{barcoded})
		{
			print "$rname\t$barcode\t$gr\t$norm_mr\t$norm_pr\t$minus_ratio\t$plus_ratio\n";
		}
	}
}

# Header of the per-reference table: the collapsed layout when -collapsed is
# given, otherwise the summary-statistics layout unless -barcoded is given.
if ($PARAMS{collapsed})
{
	print "Peakname\tN_barcodes\ttotal_gDNA_reads\ttotal_norm_minus_reads\ttotal_norm_plus_reads\tminus_ratio\tplus_ratio\n";
}
elsif (!$PARAMS{barcoded})
{
	print "Peakname\tN_barcodes\tmean_minus_ratio\tmean_plus_ratio\tminus_std\tplus_std\tdifference\tmean\tt\tp\tsd\tse\n";
}

# No per-reference rows are written in -barcoded mode.
unless ($PARAMS{barcoded})
{
	# Same ordering as the per-barcode pass: numeric on the last
	# '_'-separated field of the reference name.
	for my $rname (sort { (split('_', $a))[-1] <=> (split('_', $b))[-1] } keys %peaks)
	{
		my $stats = $peaks{$rname};

		my $mmean = mean($stats->{m});
		my $pmean = mean($stats->{p});

		my $mstd = stddev($stats->{m});
		my $pstd = stddev($stats->{p});

		my $n = $stats->{n};

		if ($PARAMS{collapsed})
		{
			# One row per reference: ratios of the summed normalized RNA
			# reads to the summed gDNA reads.
			my $gr = $stats->{gdna_reads};
			my $norm_mr = $stats->{norm_minus_reads};
			my $norm_pr = $stats->{norm_plus_reads};
			my $minus_ratio = $norm_mr / $gr;
			my $plus_ratio = $norm_pr / $gr;

			print "$rname\t$n\t$gr\t$norm_mr\t$norm_pr\t$minus_ratio\t$plus_ratio\n";
		}
		else
		{
			# References with fewer than two barcodes get no statistics row.
			next if $n < 2;

			my ($t, $p, $sd, $se) = pairedttest($stats->{m}, $stats->{p});

			my $dif = abs( $mmean - $pmean);
			my $mean = ($mmean + $pmean) / 2;

			print "$rname\t$n\t$mmean\t$pmean\t$mstd\t$pstd\t$dif\t$mean\t$t\t$p\t$sd\t$se\n";
		}
	}
}

# Arithmetic mean of the argument list.
# Dies with a division-by-zero error when called without arguments.
sub A
{
        my $total = 0;
        foreach my $value (@_)
        {
                $total += $value;
        }
        return $total / scalar(@_);
}

# Sum of the squares of the arguments; 0 for an empty argument list.
sub SOS
{
	my $sum_of_squares = 0;
	foreach my $value (@_)
	{
		$sum_of_squares += $value ** 2;
	}
	return $sum_of_squares;
}


# Paired (dependent-samples) two-sided t-test.
#
# Arguments: two array references, x and y, whose elements are paired by index.
#
# Returns the list (t, p, sd, se):
#   t   mean of the differences y[i] - x[i] divided by se (0 when se is 0)
#   p   2 * Statistics::Distributions::tprob(n - 1, |t|)
#   sd  standard deviation of the differences, using an n - 1 denominator
#   se  sd / sqrt(n)
#
# Returns (0, 0, 0, 0) as a "no result" marker when the two samples differ in
# length or contain fewer than two pairs. Four values are always returned so
# that a caller unpacking (t, p, sd, se) never receives an undefined se, and
# fewer than two pairs no longer ends in a division by zero.
#
# All working variables are lexical, so the call leaves no package globals
# behind.
sub pairedttest 
{
	my ($x, $y) = @_;
	my $n = @{$x};

	return (0, 0, 0, 0) if $n != @{$y} || $n < 2;

	# Pairwise differences and their deviations from the mean difference.
	my @d = map { $y->[$_] - $x->[$_] } 0 .. $n - 1;
	my $md = A(@d);
	my @dd = map { $_ - $md } @d;

	my $d_sd = sqrt( SOS( @dd ) / ($n - 1) );
	my $d_se = $d_sd / sqrt( $n );

	# With identical differences se is 0; t is then reported as 0 rather
	# than dividing by zero.
	my $t = $d_se == 0 ? 0 : $md / $d_se;

	my $df = $n - 1;

	# tprob is fed |t| and doubled to make the test two-sided.
	my $p = 2 * Statistics::Distributions::tprob($df, abs($t));

	return ($t, $p, $d_sd, $d_se);
}