#!/usr/bin/perl -w

use strict;
use DBI;
use vars qw/ %opt /;

# Parse the command-line switches into the global %opt hash.
# Hands over to usage() when option parsing fails, when -h is given, or
# when any of the switches -t, -n, -r is missing or has a false value.
sub init() {
        use Getopt::Std;
        my $switches = 'ht:n:s:r:c:i:';
        usage() unless getopts($switches, \%opt);

        # -t, -n and -r must all be supplied
        my @missing = grep { !$opt{$_} } qw/ t n r /;
        usage() if $opt{h} or @missing;
}

# Print the help text to STDERR and terminate the script; does not return.
# Used when help is requested (-h) or the arguments are insufficient.
# The synopsis and the example list -t, -r and -n because init() rejects
# any invocation that lacks one of them.
sub usage() {
        print STDERR << "EOF";
        
usage: $0 [-h] -t tablename -r tablename -n tablename [-s server] [-c chr] [-i index]
        
 -h     : this message
 -t     : table name for initial reads (required)
 -r	: table name for read output (required)
 -n	: name for peak table (e.g. pairedpeaks) (required)
 -s	: server [default localhost]
 -c	: chr to start [default 1]
 -i	: starting index [default 0]

example: $0 -t readpairs -r usedreads -n pairedpeaks

EOF
        exit;
}

# Create the peak table and the read-output table, including their
# secondary indexes, unless they already exist.
#
#   $peaktablename - name of the peak table
#   $readtablename - name of the read-output table
#
# Opens its own connection (host taken from -s, default localhost) and
# closes it again before returning.  The handle is opened with RaiseError,
# so a failed connect or a failed CREATE TABLE dies with the DBI error.
# Returns nothing.
sub linktable {
	my ($peaktablename,$readtablename) = @_;
	my $database = 'l1hsgeno';
	my $server = 'localhost';
	$server = $opt{s} if $opt{s};
	my $user = 'l1';
	my $passwd = 'l1';

	my $dbh = DBI->connect(
		"dbi:mysql:database=$database;host=$server;port=8081;mysql_socket=/gpfs/fs0/u/ewingad/mysql/mysql.sock",
		$user, $passwd,
		{ RaiseError => 1, PrintError => 0 },
	);

	# The indexes are declared inside CREATE TABLE IF NOT EXISTS rather than
	# with a separate ALTER TABLE, so re-running the script against existing
	# tables does not add another duplicate index each time.
	# ENGINE replaces the old TYPE table option, which newer servers reject.
	$dbh->do("
	CREATE TABLE IF NOT EXISTS `$peaktablename` (
	`id` INT NOT NULL AUTO_INCREMENT ,
	`chr` CHAR ( 2 )  NOT NULL ,
	`minloc` INT NOT NULL ,
	`maxloc` INT NOT NULL ,
	`mintnloc` INT NOT NULL ,
	`maxtnloc` INT NOT NULL ,
	`n` INT NOT NULL ,
	`genomes` INT NOT NULL ,
	`genomestring` TEXT NOT NULL ,
	`insid` INT NOT NULL ,
	`abinitio` INT NOT NULL ,
	PRIMARY KEY ( `id` ) ,
	INDEX ( `abinitio` )
	) ENGINE = MYISAM
	");

	$dbh->do("
	CREATE TABLE IF NOT EXISTS `$readtablename` (
	`id` INT NOT NULL AUTO_INCREMENT ,
	`genome` VARCHAR( 20 ) NOT NULL ,
	`chr` VARCHAR( 2 ) NOT NULL ,
	`loc` INT NOT NULL ,
	`strand` CHAR ( 1 ) NOT NULL ,
	`tnloc` INT NOT NULL ,
	`tnstrand` CHAR ( 1 ) NOT NULL ,
	`peakid` INT NOT NULL ,
	PRIMARY KEY ( `id` ) ,
	INDEX ( `genome` ) ,
	INDEX ( `peakid` )
	) ENGINE = MYISAM
	");

	$dbh->disconnect;
	return;
}

# Summarise one peak (a group of neighbouring reads) into output rows.
#
#   $peak         - array ref of reads; each read is an array ref
#                   [id, genome, chr, loc, strand, tnloc, tnstrand]
#   $table        - source table name (unused; kept for call compatibility)
#   $peaktable    - tag placed in front of the peak summary row
#   $readouttable - tag placed in front of every per-read row
#   $dbh          - database handle (unused; kept for call compatibility)
#   $peakid       - id assigned to this peak
#
# Returns a list of tab-separated strings: one row per read
#   readouttable, id, genome, chr, loc, strand, tnloc, tnstrand, peakid
# followed by a single summary row
#   peaktable, peakid, chr, minloc, maxloc, mintnloc, maxtnloc, n,
#   genomes, genomestring, 0
# where n counts distinct loc values, genomes counts distinct genomes and
# genomestring lists them comma-separated in order of first appearance.
# An empty peak yields an empty list.
#
# NOTE(review): the summary row carries 10 fields after the tag, while the
# peak table created by linktable() has 11 columns (id .. insid, abinitio).
# Confirm which column the trailing 0 is meant for.
sub analyzepeak {
	my ($peak,$table,$peaktable,$readouttable,$dbh,$peakid) = @_;

	# nothing to summarise; avoids emitting a row with an undefined chr
	return unless $peak && @$peak;

	my ($minloc, $maxloc, $mintnloc, $maxtnloc) = (0, 0, 0, 0);
	my $n = 0;          # number of distinct loc values seen
	my $chr;
	my %seenloc;
	my %seengenome;
	my @genomes;
	my @insrows;

	for my $read (@$peak) {
		my ($pid,$genome,$pchr,$loc,$strand,$tnloc,$tnstrand) = @$read;
		$chr = $pchr;

		push @insrows, join("\t", $readouttable, $pid, $genome, $chr, $loc, $strand, $tnloc, $tnstrand, $peakid);

		# Only the first read at a given loc contributes to the summary;
		# later reads at the same loc are skipped, including their tnloc.
		if (!$seenloc{$loc}) {
			if ($n == 0) {
				$minloc = $maxloc = $loc;
				$mintnloc = $maxtnloc = $tnloc;
			} else {
				$maxloc = $loc if ($loc > $maxloc);
				$minloc = $loc if ($loc < $minloc);
				$maxtnloc = $tnloc if ($tnloc > $maxtnloc);
				$mintnloc = $tnloc if ($tnloc < $mintnloc);
			}
			$n ++;
			$seenloc{$loc} = 1;
		}

		# keep track of genomes, preserving first-seen order
		if (!$seengenome{$genome}) {
			push @genomes, $genome;
			$seengenome{$genome} = 1;
		}
	}

	my $genomestring = join(',', @genomes);
	my $genomecount = scalar @genomes;

	push @insrows, join("\t", $peaktable, $peakid, $chr, $minloc, $maxloc, $mintnloc, $maxtnloc, $n, $genomecount, $genomestring, 0);

	return @insrows;
}

# Bulk-load a tab-separated file into a table with LOAD DATA INFILE.
#
#   $fn    - file name, relative to the staging directory
#   $table - destination table
#   $dbh   - open DBI database handle
#   $dir   - optional staging directory; defaults to the original fixed
#            location /gpfs/fs0/u/ewingad/1000genomes/buildpeak
#
# Returns the value of execute().  Dies with the driver's error text when
# the statement cannot be prepared or executed, so that a failed load is
# not reported as "done" by the caller.
sub loaddata {
	my ($fn,$table,$dbh,$dir) = @_;
	$dir = '/gpfs/fs0/u/ewingad/1000genomes/buildpeak' unless defined $dir;

	my $query = "load data infile '$dir/$fn' into table $table";

	my $sth = $dbh->prepare($query);
	if (!$sth) {
		my $err = $dbh->errstr;
		$err = 'unknown error' unless defined $err;
		die "loaddata: cannot prepare [$query]: $err\n";
	}

	my $result = $sth->execute();
	if (!defined $result) {
		my $err = $sth->errstr;
		$err = 'unknown error' unless defined $err;
		die "loaddata: cannot execute [$query]: $err\n";
	}

	return $result;
}

# ---- main program ----
# For every chromosome found in the source read table (-t): fetch its reads
# ordered by loc, group neighbouring reads into peaks, write the peaks that
# pass the size threshold (and their reads) to two tab-separated files, and
# bulk-load those files into the peak table (-n) and read-output table (-r).
init();
linktable($opt{n},$opt{r});

my $database = 'l1hsgeno';
my $server = 'localhost';
$server = $opt{s} if ($opt{s});
my $user = 'l1';
my $passwd = 'l1';

# a read joins the current peak while its loc lies less than this far
# beyond the running mean loc of that peak
my $searchwidth = 1000;
my $dbh = DBI->connect("dbi:mysql:database=$database;host=$server;port=8081;mysql_socket=/gpfs/fs0/u/ewingad/mysql/mysql.sock", $user, $passwd)
	or die "cannot connect to database $database on $server: $DBI::errstr\n";
my $table = $opt{t};
my $peaktable = $opt{n};
my $readouttable = $opt{r};

# NOTE(review): the query has no ORDER BY, and how chr compares against
# $minchr depends on the column type of the source table, which is not
# visible here -- confirm that -c resumes at the intended chromosome.
my $minchr = 1;
$minchr = $opt{c} if ($opt{c});
my $chrquery = "select distinct chr from $table where chr >= \"$minchr\"";
my $sth = $dbh->prepare($chrquery);
$sth->execute();
my $chr;
$sth->bind_columns(\$chr);

# peak ids continue from -i; every peak consumes an id, including the
# ones that are later skipped for being too small
my $peakid=0;
$peakid = $opt{i} if ($opt{i});

while($sth->fetch()) {

	print "chr$chr ...\n";

	my $query = "select id,genome,loc,strand,tnloc,tnstrand from $table where nmm < 3 and seqtype='Illumina' and chr='$chr' order by loc";

	my $readsth = $dbh->prepare($query);
	$readsth->execute();
	my ($id,$genome,$loc,$strand,$tnloc,$tnstrand);
	$readsth->bind_columns(\$id,\$genome,\$loc,\$strand,\$tnloc,\$tnstrand);

	# @peaks is a list of peaks; each peak is a list of read array refs
	my $avgloc=0;
	my $n = 1;
	my @peaks;
	while ($readsth->fetch()) {
		my @read = ($id,$genome,$chr,$loc,$strand,$tnloc,$tnstrand);
		# All reads of this query share one chr, so only the very first
		# read is forced to open a peak; testing @peaks instead of a
		# "previous chr" sentinel also works for a chromosome named "0".
		if (@peaks && ($avgloc + $searchwidth > $loc)) {
			$n ++;
			$avgloc = (($avgloc * ($n-1)) + $loc)/$n;
			push @{ $peaks[-1] }, \@read;
		} else {
			# start new peak
			$n = 1;
			push @peaks, [ \@read ];
			$avgloc = $loc;
		}
	}

	# The file names are relative, while loaddata() hands the server an
	# absolute path -- presumably the script is run from that directory.
	my $peakfile = "buildpeak.chr$chr.extern";
	my $readfile = "usedreads.chr$chr.extern";
	open(my $bpfh, '>', $peakfile) or die "cannot write $peakfile: $!\n";
	open(my $urfh, '>', $readfile) or die "cannot write $readfile: $!\n";

	my $prog = 0;
	print "peaks analyzed (. = 1000): ";
	for my $p (0 .. $#peaks) {
		print "." if ($prog % 1000 == 0);
		$prog ++;
		$peakid ++;
		# last index >= 8, i.e. only peaks with at least 9 reads are kept.
		# NOTE(review): the original comment spoke of "size 8 or greater"
		# for abinitio -- confirm whether 8 or 9 reads is intended.
		if ($#{$peaks[$p]} >= 8) {
			for my $row (analyzepeak($peaks[$p],$table,$peaktable,$readouttable,$dbh,$peakid)) {
				# first field names the destination table, the rest is the record
				my ($dest, @fields) = split(/\t/,$row);
				my $line = join("\t",@fields) . "\n";
				print {$bpfh} $line if ($dest eq $peaktable);
				print {$urfh} $line if ($dest eq $readouttable);
			}
		}
	}
	print "\n";

	# Close (and thereby flush) both files BEFORE the server reads them;
	# otherwise buffered rows have not reached the disk yet.
	close($bpfh) or die "cannot close $peakfile: $!\n";
	close($urfh) or die "cannot close $readfile: $!\n";

	print "loading into $peaktable...";
	loaddata($peakfile, $peaktable, $dbh);
	print "done\nloading into $readouttable...";
	loaddata($readfile, $readouttable, $dbh);
	print "done\n";
}

$dbh->disconnect;
