#!/usr/bin/perl

########################  strategy

# make array for each seq 4^k-mer long; populate with 0s
# go through seqs word by word, calc index, increment k-mer
# print each k-mer array
# rows are seqs, cols are k-mers
# col 2 is seq length for each seq
# if a word has some character besides ACGT, it is skipped

use strict;
use warnings;
use Getopt::Long;

######################### MAIN ######################

my ($opt_in, $opt_len, $opt_min);
# in - input fasta file
# len - k ; default [8]; range [1-20]
# min - minimum number of nucleotides in sequence for it to be included; min k*10

GetOptions('in:s' => \$opt_in,
			'len:s' => \$opt_len,
			'min:s' => \$opt_min);

#unless (-e $opt_in) { #write this
#       usage();
#}

$opt_len = 8 unless ( defined $opt_len && $opt_len > 0 && $opt_len <= 20 );
$opt_min = $opt_len*10 unless ( defined $opt_min && $opt_min > $opt_len*10 );

my $outfile = "$opt_in.$opt_len-mer.minLen$opt_min.txt";
open (OUT, ">$outfile") || die;

open (IN, $opt_in);
$/ = '>';
my ($id, $seqs) = readData();
if(scalar @{$id} != scalar @{$seqs}){ 
	print "Different numbers of ids and seqs indicates a problem with parsing\n";
	print scalar @{$id}, "-ids\t", scalar @{$seqs}, "-seqs\n";
}
$/ = '\n';
close IN;

# make array of length 4^k-mer
my $high = 4 ** $opt_len;
my @wrd;
my @nts;
for(my $w=0; $w < $high; $w++){
	push(@wrd, 0);
	push(@nts, "NA");	# an array that will have the k-mers in nucleotide language
}

my @seqMers = ();		# array (#seqs) of arrays (one 4^k-mer long array for each seq)
my @seqLens = ();		# array of sequence lengths
my @totalMers = @wrd;	# counts up the total number of each k-mer over all seqs

# populate count of words array for each seq
for(my $i=0; $i < scalar @{$id}; $i++){
	my $len = (length $seqs->[$i]) - $opt_len + 1;
	push(@seqLens, $len+$opt_len-1);
	my @words = @wrd;
	# populate
	for(my $j=0; $j < $len; $j++){
		my $k = substr($seqs->[$i], $j, $opt_len); # word
		my ($countGood) = ($k =~ tr/0123/0123/);
		if($countGood == $opt_len){	# if Ns or other non-ACGT characters, skip word
			my $value = calcIndex($k);
			$words[$value]++;
			$totalMers[$value]++;
			if($nts[$value] eq "NA"){ $nts[$value] = $k;}
		}
	}
	my $w = join (",",@words);
	push(@seqMers, $w);
}

# print out seq seqLengths k-mers
print OUT "seq\tseqLength";
foreach my $k(@nts){
	if($k ne "NA"){
		($k =~ tr/0123/ACGT/);
		print OUT "\t$k";
	}
}
print OUT "\n";

# seqID k-mer counts
for(my $i=0; $i < scalar @{$id}; $i++){
	print OUT $id->[$i], "\t$seqLens[$i]";
	my @nums = split(/,/,$seqMers[$i]);
	for(my $w=0; $w < scalar @totalMers; $w++){
		if($nts[$w] ne "NA"){
			 print OUT "\t$nums[$w]";
		}	
	}
	print OUT "\n";
}

close OUT;
exit;


###################### calcIndex

# calculates a value associated with each k-mer
# starts from left - e.g AAAA == (0,0,0,0) == 0, CAAA == (1,0,0,0) == 1

sub calcIndex {
	my $word = $_[0];
	my $power = 1;
	my $wordIndex = 0;
	for(my $i = 0; $i < length $word; $i++){
		my $letterValue = substr($word, $i, 1);
		$wordIndex += $letterValue * $power;
		$power *= 4;
	}
	return ($wordIndex);
}


######################## readData

sub readData{
	my ($seqs, $id);
	<IN>;
	while(my $data = <IN>){
		my ($info) = ($data =~ /(\S+)/);
		
		$data =~ s/.*?\n//;	# remove defline
		$data =~ s/\s+//g;	# remove whitespace
		$data =~ s/>//g;	# remove > at end
		$data = uc $data;
		if(length $data > $opt_min){
			($data =~ tr/ACGT/0123/);
			push(@{$id}, $info);
			push(@{$seqs}, $data);
		}
	}		
	return ($id, $seqs);
}


#########################   EOF   ##########################