#!/usr/bin/perl
use strict;
use Jon_for_unix;
use BeginPerlBioinfo; 
# Arguments:
# 1st: reference genome (fasta); 2nd: sam file of illumina data
# 3rd: STRAIN, used to name the temporary files so you can run multiple copies of the script at once
# 4th: exact name of the fasta output file, whose per-window records will have to be concatenated using another script.
# 5th (optional): chrom or chrom,start,end if you want to fix only one chromosome/region (should convert this to a bed file). If you only want to supply a test kmer, put "FALSE" here.
# 6th (optional): a 30 mer whose read count is reported, for testing
# ---- settings and command line ----
my $hit_thresh=3; # hits greater than this, will change sequence
# NOTE(review): $hit_thresh is only reported here; the cut-offs applied in run_genome_fix are literals -- confirm which value is intended.
unless($ARGV[3]=~/\w+/) { print "ref genome, sam file of illumina data, strain, exact name of output file, csv list of chrom start end (optional) \n"; die;}
print STDERR "@ARGV\n";
print STDERR "hit threshold is $hit_thresh\n";
my $f=fasta_to_hash_v2("$ARGV[0]",'Y'); my $ref_genome=$ARGV[0];
my %refgenome=%$f;
my $big_sam_file=$ARGV[1]; # NOTE(review): never read below; the reads come from a per-chromosome "<chrom>.bam" file instead
my $strain=$ARGV[2];
my $output_file_name=$ARGV[3];
my $start_end=$ARGV[4];
if ($start_end eq 'FALSE') {} elsif ($start_end=~/\w+/) { print "using $start_end instead of whole genome\n"; sleep(3);}
my $test_sequence=$ARGV[5];
my $beginning_time=time();
# scratch files are prefixed with the strain so several runs can share a directory;
# $sam_file and $fasta_file are also read by run_genome_fix
my $bed_file="$strain".'_regions.bed';
my $sam_file="$strain".'_temp_sam_file.sam';
my $fasta_file="$strain".'_region.fasta';
my $zero=0;
my $window_size=250000; #hack point
my $last_window=0;
print "$beginning_time\n";
# start with an empty output file; every window appends to it later
open_out_test($output_file_name,'OUT','Y'); close(OUT);

# ---- walk every chromosome window by window ----
CHROM: foreach my $chrom (sort {$a cmp $b} keys %refgenome) {
	next unless ($chrom=~/chr/);
	# strip any whitespace left in the sequence so that offsets are base positions
	$refgenome{$chrom}=~s/\s+//g;
	my $limit=0;	# only used by the commented-out debugging stop at the end of the loop
	my $count=0;
	$zero=0; #hack point
	$last_window=length($refgenome{$chrom});
	if (($start_end ne 'FALSE')&&($start_end=~/\w+/)) {
		my @se=split/\,/,$start_end;
		my $c=$se[0];
		if ($#se==0) {
			# chromosome only: keep $zero, $window_size and $last_window as set above
		} elsif ($#se==2) {
			# chrom,start,end: the whole region becomes one single window
			$zero=$se[1]; $window_size=$se[2]-$se[1];
			$last_window=$se[2];
		}
		# NOTE(review): unanchored pattern match, so "chr1" also selects "chr10", "chr11", ...
		next CHROM unless ($chrom=~/$c/);
		print STDERR "alert i am using $zero to ".($zero+$window_size)." region\n";
		sleep(2);
	}

LINE:	for (my $i=$zero;$i<$last_window;$i+=$window_size) {
		# 1. write the reference sequence of this window to the scratch fasta
		my $targseq=substr($refgenome{$chrom},$i,$window_size);
		print "$i\t".($i+$window_size)."\t".length($targseq)."\n";
		open_out_test($fasta_file,'FAS','Y');
		print FAS "\>$chrom $i\n$targseq\n";
		close(FAS);
		$count++;

		# 2. write the window coordinates to the scratch bed file
		my $left=$i;
		my $right=$i+$window_size;
		(my $name)=($chrom=~/^(\w+)/);
		open_out_test($bed_file,'BED','Y');
		print BED "$name $left $right\n";
		close(BED);
		print "$name\n";

		# 3. extract the reads overlapping the window
		my $bam_file=$chrom."\.bam";
#hack point
		`samtools view -@ 15 -L $bed_file $bam_file > $sam_file`;
		print STDERR "warning: samtools view failed for $bam_file (wait status $?)\n" if ($? != 0);

		# 4. keep a single line per read name
		# hack to make a new file that just has unique reads(this is a weird bug with the read viewer
		# NOTE(review): lines sharing a read name overwrite each other, so presumably only one mate of a pair survives -- confirm this is wanted
		open_in_test($sam_file,'SAM');
		my %temp;
		while (<SAM>) {
			my @line=split/\t/,$_;
			$temp{$line[0]}=$_;
		}
		close (SAM);
		open_out_test($sam_file,'NEWSAM','Y'); #
		foreach my $read_name (keys %temp) {
			print NEWSAM "$temp{$read_name}";
		}
		close(NEWSAM);
#hack point
		print STDERR " ".(time()-$beginning_time)." time elapsed make sam file\n";

		# 5. correct the window and append it to the output
		my $fix_started=time();
		my $consensus=&run_genome_fix($chrom,$i);
		my $fix_finished=time();
		print STDERR " ".($fix_finished-$fix_started)." elapsed seconds for $name $i $right\n";

		open_out_append_test($output_file_name,'OUT');
		print OUT ">$chrom $i\n$consensus\n";
		close(OUT);
		print STDERR "length after ".(length($consensus))."\n";

#		last LINE if ($count>$limit);

	}
}
print " ".(time()-$beginning_time)." time elapsed\n";


#### this subroutine does this with a single window and then moves on #####
# run_genome_fix($chromosome,$global_position)
#
# Corrects one reference window with the reads that were extracted for it.
#   $chromosome       name of the chromosome the window belongs to (only used in log output)
#   $global_position  offset of the first base of the window within the chromosome
# The window sequence is read from the file-level $fasta_file, the reads from $sam_file.
# Returns the corrected window sequence as a string. A window that is too short to hold
# a single k-mer is returned unchanged.
#
# Steps:
#   1. count every 30-mer of every read (10th tab-separated field of a line) and remember
#      the reads each 30-mer occurs in
#   2. for every 30-mer of the window, record how many reads contain it and the result of
#      get_mean over the last ten counts
#   3. rebuild the window one base per position; where the count drops to 5 or less while
#      the mean is still 10 or more, try to insert bases suggested by the reads, or to
#      delete one base, and accept the change if the resulting 30-mer is seen more than 10 times
sub run_genome_fix {
	my ($chromosome,$global_position)=@_;
	my $k=30;		# k-mer length used throughout
	my $min_support=5;	# a k-mer seen more often than this is trusted
	my $min_mean=10;	# mean needed before a drop is treated as an error
	my $min_fix_support=10;	# a repaired k-mer must be seen more often than this
	my $max_insert=20;	# number of downstream read positions examined

	my $f=fasta_to_hash_v2($fasta_file,'Y');
	my %seq=%$f;

	#### 1. k-mer the reads ####
	open_in_test($sam_file,'READS');
	print "here is my sam file $sam_file\n";
	my $ct;
	my %seen;		# read names already processed
	my %kmer_hash;		# k-mer => number of occurrences in the reads
	my %kmer_hash_reads;	# k-mer => tab-joined list of the reads containing it
	my $beginning_time=time();
	my $consensus=""; #consensus for the window, return this as a result of the subroutine
	while (<READS>) {
		$_=~s/\n$//;
		my @fields=split/\t/,$_;
		next if (exists($seen{$fields[0]}));
		$seen{$fields[0]}++;
		$ct++;
		my $read=$fields[9];
		for my $i (0..length($read)-$k) {
			my $sequence=substr($read,$i,$k);
			$kmer_hash_reads{$sequence}.="$read\t"; #critical! stores all reads that have that kmer
			$kmer_hash{$sequence}++;
		}
		# progress report, placed after the counter changes so a line is never repeated
		print STDERR "$ct reads kmered and stored\n" if ($ct=~/00000$/);
	}
	print "$ct reads\n";
	print "I have $kmer_hash{$test_sequence} copies of $test_sequence\n";
	close(READS);
	print " ".(time()-$beginning_time)." time elapsed\n";

	#### 2. read support for every k-mer of the window ####
	print STDERR "started on $chromosome $global_position\n";
	my %count_hash;		# absolute position => { chrom, sub_pos, sequence, count, mean }
	my $targ_seq; #hack use keys but there is only one 
	foreach my $key (keys %seq) {
		next unless ($seq{$key}=~/\w+/);
		$targ_seq=$seq{$key};
		print STDERR "$key\tlength before ".length($seq{$key})."\n";
		my @running_count=(1)x10;	# sliding list of the last ten counts
		for my $i (0..length($seq{$key})-$k) {
			my $sequence=substr($seq{$key},$i,$k);
			my $abs_pos=$i+$global_position;
			my $forct=0;
			$forct+=$kmer_hash{uc($sequence)};
			push (@running_count,$forct);
			shift(@running_count);
			# keep a single decimal of the mean
			my $running_mean=get_mean(\@running_count); $running_mean=~s/(\d+\.\d)\d+/$1/;
			$count_hash{$abs_pos}{'chrom'}=$key;
			$count_hash{$abs_pos}{'sub_pos'}=$i;
			$count_hash{$abs_pos}{'sequence'}=$sequence;
			$count_hash{$abs_pos}{'count'}=$forct;
			$count_hash{$abs_pos}{'mean'}=$running_mean;
			# NOTE(review): $zero is the start of the region, not of this window; $abs_pos may be what was meant
			print STDERR "$i\t".($i+$zero)."\t$sequence\n" if ($i=~/00000$/);
		}
	}

	# a window shorter than one k-mer has nothing to check: hand it back untouched
	# instead of returning an empty string and losing its bases
	unless (%count_hash) {
		return defined($targ_seq) ? $targ_seq : $consensus;
	}

	#### 3. now fix sequences ####
	my $posct=0;
	my @names=qw/chrom sub_pos sequence count mean/;
	my $flag="false";	# "true" while walking through k-mers that overlap a detected error

	POSITION: foreach my $position (sort {$a<=>$b} keys %count_hash) {
		$posct++;
		# NOTE(review): $position is already absolute, so the second column adds the window offset twice
		print STDERR "$position\t".($position+$global_position)."\t";
		foreach my $name (@names) {print STDERR "$count_hash{$position}{$name}\t";}; print STDERR "\n";
		my $chrom=$count_hash{$position}{'chrom'};
		my $current=$count_hash{$position}{'sequence'};
		my $last_base=substr($current,-1);

		# The first k-mer contributes all of its bases, every later one only its last base.
		# Seed the consensus with the leading k-1 bases here, so that each branch below
		# only has to append and a poorly covered first k-mer does not lose its prefix.
		if ($posct==1) { $consensus=substr($current,0,length($current)-1); }

		if (($count_hash{$position}{'count'}>$min_support)) {
			# well supported: accept the reference base
			$flag="false";
			$consensus.=$last_base;
		} elsif (($flag eq 'false')&&($count_hash{$position}{'count'}<=$min_support)&&($count_hash{$position}{'mean'}>=$min_mean))	{
			# this means that it's a drop that reduces 2X avg.
			# also can query whether it's a homopolymer run here as well.==>that's the hash toggle.
			$flag="true";
			#examine reads from prev kmer
			my $prior=exists($count_hash{$position-1}) ? $count_hash{$position-1}{'sequence'} : undef;
			my $altered=$current;
			my $r=get_downstream_read_seqs($position,uc($prior),$kmer_hash_reads{uc($prior)}); my @reads=@$r;
			my $g=get_downstream_consensus(\@reads,$max_insert); my @consensi=@$g;
			my $genome_downstream=substr($targ_seq,$count_hash{$position}{'sub_pos'}+$k,$k);
			my $insert_string="";
			# try an insertion: push the read-consensus bases in, one more per round,
			# just before the last base of the k-mer
			CON: foreach my $con (@consensi) {
				my @s=split/\t/,$con;
				my $nuc=lc($s[0]);
				next if ($nuc=~/ambiguous/);
				$altered=~s/^.//;
				my $last=chop($altered);
				$altered=$altered.$nuc.$last;
				my $new_readct=0;
				$new_readct+=$kmer_hash{uc($altered)};
				$insert_string.=lc($nuc);
				print STDERR "$chrom $con\n$position $current\n$position $altered $genome_downstream\t$new_readct\n";
				if ($new_readct>$min_fix_support) {
					$consensus.=$insert_string.$last;
					print STDERR  "$chrom fix $position $current $altered $nuc$last\n";
					next POSITION;
				}
			}
			# so don't get here unless you didn't fix it above.
			#truncate 1 and test
			print STDERR "$position i am past the extension bit\n";
			my $truncated=truncate_seq($position,\%count_hash);
			my $new_readct=0;
			$new_readct+=$kmer_hash{uc($truncated)};
			if ($new_readct>$min_fix_support) {
				# do nothing with consensus. as you haven't added anything yet.
				print STDERR "$chrom truncated $position to $truncated and it worked\n";
			} else {
				$consensus.=$last_base;
			}

		} elsif (($flag eq 'false')&&($count_hash{$position}{'count'}<=$min_support))	{
			my $genome_downstream=substr($targ_seq,$count_hash{$position}{'sub_pos'}+$k,$k);
			print STDERR "$chrom\t$position\tlow coverage region or not homopolymer \t$count_hash{$position}{'mean'}\t$genome_downstream\n";
			$consensus.=$last_base;
		} elsif ($flag eq 'true') {
			#in the middle of an error-containing read.
			print STDERR "$chrom $position\tI am in the middle of an error read\n";
			$consensus.=$last_base;
		} else {
			$consensus.=$last_base;
		}

	}
	return $consensus;

} #end sub genome fix

# get_downstream_read_seqs($position,$sequence,$kmers)
#
# For every read that contains $sequence, collects the part of the read that
# follows the first occurrence of $sequence.
#   $position  not used; kept so existing callers do not have to change
#   $sequence  the k-mer to look for; compared literally and case-sensitively
#   $kmers     tab-separated list of read sequences (may be undef)
# Returns a reference to the list of downstream strings, in the order of the reads.
# A read that ends with $sequence contributes an empty string.
sub get_downstream_read_seqs {
	my $position=shift; my $sequence=shift; my $kmers=shift;
	my @downstreams;
	# nothing to search for, or nothing to search in
	return(\@downstreams) unless (defined($sequence) && length($sequence) && defined($kmers));
	foreach my $read (split/\t/,$kmers) {
		next unless ($read=~/[actgACTG]/);
		# plain substring search: no regex metacharacter surprises and no use of $'
		my $hit=index($read,$sequence);
		next if ($hit<0);
		push (@downstreams,substr($read,$hit+length($sequence)));
	}
	return(\@downstreams);

}

# get_downstream_consensus(\@reads,$number)
#
# Looks at the first $number character positions of the given strings and asks
# get_max_value_hash for a verdict on each position, based on how often each of
# a, c, g, t (either case) occurs there.
# Returns a reference to the list of verdicts, one per position, or to an
# empty list when no strings were passed in.
sub get_downstream_consensus {
	my ($reads_ref,$number)=@_;
	my @verdicts;
	return(\@verdicts) unless (@$reads_ref);
	foreach my $offset (0..$number-1) {
		# tally the bases seen at this offset; strings that are too short simply do not vote
		my %tally;
		foreach my $read (@$reads_ref) {
			my $base=substr($read,$offset,1);
			next unless ($base=~/[actgACTG]/);
			$tally{$base}++;
		}
		push (@verdicts,scalar(get_max_value_hash(\%tally)));
	}
	return(\@verdicts);
}

# truncate_seq($pos,\%positions)
#
# Builds the k-mer that starts at $pos if the last base of that k-mer were deleted:
# the k-mer at $pos without its last character, followed by the last character of
# the k-mer at $pos+1.
#   $pos        key into the position hash
#   \%positions hash of position => { sequence => ... }; read through the reference,
#               never copied and never modified
# Returns the new string. When there is no entry for $pos+1 the result is just the
# shortened k-mer.
sub truncate_seq {
	my $pos=shift; my $r=shift;
	my $curr=exists($r->{$pos}) ? $r->{$pos}{'sequence'} : undef;
	my $next=exists($r->{$pos+1}) ? $r->{$pos+1}{'sequence'} : undef;
	$curr='' unless (defined($curr));
	chop($curr);
	# last character of the following k-mer, i.e. the base that slides into view
	my $chopped=(defined($next) && length($next)) ? substr($next,-1) : '';
	my $truncated=$curr.$chopped;
	return $truncated;

}


# get_max_value_hash(\%counts)
#
# Picks the best supported key from a hash of key => number of observations.
# Only keys containing one of the characters acgtACGTnN take part.
# Returns
#   "<key>\t<count>"  when the top key holds at least 90% of all counted observations
#                     and was observed more than once
#   "ambiguous2"      when the top key has less support than that
#   "ambiguous"       when the hash contains no usable key
sub get_max_value_hash {
	my $hash = shift;
	my $key;
	my $big=0;	# highest count seen so far
	my $usable=0;	# number of keys that took part
	my $total=0;	# observations summed over those keys

	foreach my $candidate (keys %$hash) {
		next unless ($candidate=~/[acgtACGTnN]/);
		$usable++;
		# the support ratio below is a fraction of observations, so add up the counts
		$total+=$hash->{$candidate};
		if ($hash->{$candidate} > $big) {
			$big = $hash->{$candidate};
			$key = $candidate;
		}
	}

	return "ambiguous" if ($usable==0);

	my $ratio=($total>0) ? $big/$total : 0;
	return "ambiguous2" if (($ratio<0.9)||($big==1));

	return "$key\t$big";

}

=pod
procedure:
use blast to find range of target nanopore assembly contains the tandem repeats
put that range in a bed file (to extract mapping reads) and a regions file (for faidx) chr:start-end
make the sam file with all those reads
make the fasta file with that range of sequence
this is to conserve memory since the counting method uses 30 mers to count and they
need to be hashed. 


=cut
