#define MATHLIB_STANDALONE 1
#include <Rmath.h>
#include <limits.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include "Timer.h"
#include "Markov.h"
#include "SeqIO.h"
#include "CollStats.h"
#include "ExpBin.h"

#define N_BINS 20

/* Print an optional error message followed by the usage text, then
   terminate the program.

   argv0: program name shown in the usage text.
   error: message printed before the usage text, or NULL for none.

   Does not return.  The exit status is EXIT_FAILURE when an error
   message was given, so that scripts can detect a failed invocation,
   and EXIT_SUCCESS when the usage text was requested without an error. */
void die_usage(char *argv0, char *error) {
  if(error) puts(error);
  printf("\n%s: Find co-locating k-mers\n", argv0);
  printf("\tUsage: %s fasta_fn nts0 nts1 k nseq outbase\n", argv0);
  printf("\nCreates outbase.*.tsv and outbase.log\n");
  printf("with k-mer and colocation stats and run info\n\n");
  exit(error ? EXIT_FAILURE : EXIT_SUCCESS);
}

/*
int gc_content(char *s, int nt) {
  int p;
  int gc = 0;
  for(p = 0; p < nt; p++) 
    gc += ((s[p] == 'G') || (s[p] == 'C'));
  return gc;
}
*/

/* Return the G+C-content bin of the first nt characters of s.

   s:  nucleotide characters; only upper-case 'G' and 'C' are counted.
   nt: number of characters of s to examine.

   The G+C fraction is scaled to N_BINS bins.  The result is always a
   valid bin index in [0, N_BINS-1]: a sequence made up entirely of G/C
   (fraction 1.0) is placed in the last bin rather than one past it, and
   an empty or negative-length request yields bin 0 instead of dividing
   by zero. */
int gc_content(char *s, int nt) {
  if(nt <= 0) return 0;

  int p;
  int gc = 0;
  for(p = 0; p < nt; p++)
    gc += ((s[p] == 'G') || (s[p] == 'C'));

  int bin = (int) ((double)gc / nt * N_BINS);
  return (bin < N_BINS) ? bin : N_BINS - 1;
}

/* Open the file "<base>.<ext>" with the given fopen() mode.

   base: file name up to (not including) the dot.
   ext:  file name extension after the dot.
   mode: mode string passed unchanged to fopen().

   Returns the open stream; the caller is responsible for fclose().
   Never returns NULL: if the name cannot be allocated or the file
   cannot be opened, a message is printed and the program exits with
   status 1. */
FILE *out_fopen(char *base, char *ext, char *mode) {
  /* base + '.' + ext + terminating NUL */
  size_t name_size = strlen(base) + strlen(ext) + 2;
  char *name = (char*)malloc(name_size);
  if(!name) {
    printf("Out of memory building file name %s.%s\n", base, ext);
    exit(1);
  }
  snprintf(name, name_size, "%s.%s", base, ext);

  FILE *fp = fopen(name, mode);
  if(!fp) {
    printf("Error opening %s in mode '%s'\n", name, mode);
    free(name);
    exit(1);
  }
  free(name);
  return fp;
}


/* Write a two-column table (header "GC", "n") to "<outbase>.<ext>".
   One row is written for every bin index from 0 through gc_max
   inclusive, so n must hold at least gc_max + 1 counts. */
void save_gc(char *outbase, char *ext, int *n, int gc_max) {
  FILE *out = out_fopen(outbase, ext, "w");
  int bin = 0;

  fprintf(out, "GC\tn\n");
  while(bin <= gc_max) {
    fprintf(out, "%d\t%d\n", bin, n[bin]);
    bin++;
  }
  fclose(out);
}

/* Write a three-column table (header "GC.sq0", "GC.sq1", "n") to
   "<outbase>.<ext>".  n is read as a row-major matrix with max0 + 1
   rows and max1 + 1 columns; every cell is written, rows first. */
void save_gc_pair(char *outbase, char *ext, int *n, int max0, int max1) {
  FILE *out = out_fopen(outbase, ext, "w");
  const int row_len = max1 + 1;
  int row, col;

  fprintf(out, "GC.sq0\tGC.sq1\tn\n");
  for(row = 0; row <= max0; row++) {
    for(col = 0; col <= max1; col++) {
      int count = n[row * row_len + col];
      fprintf(out, "%d\t%d\t%d\n", row, col, count);
    }
  }
  fclose(out);
}

/* Write per-bin k-mer counts to "<outbase>.<ext>" with the columns
   "Kmer", "GC", "n" and "f_GC".

   n is read as a matrix indexed n[gc * n_kmers + code]; n_gc[gc] is the
   denominator for the f_GC column.  Rows are written for every bin from
   0 through gc_max inclusive and, within a bin, for every code from 0 to
   n_kmers - 1.  The k-mer text comes from code_to_kmer(code, k).

   When n_gc[gc] is 0 the f_GC value is the result of a floating-point
   division by zero, printed however "%g" renders it. */
void save_kmer_by_gc(char *outbase, char *ext, int *n, int *n_gc,
                     int k, int n_kmers, int gc_max) {
  FILE *out = out_fopen(outbase, ext, "w");
  int bin, kmer;

  fprintf(out, "Kmer\tGC\tn\tf_GC\n");
  for(bin = 0; bin <= gc_max; bin++) {
    for(kmer = 0; kmer < n_kmers; kmer++) {
      int count = n[bin * n_kmers + kmer];
      double freq = (double)count / n_gc[bin];
      fprintf(out, "%s\t%d\t%d\t%g\n", code_to_kmer(kmer, k),
              bin, count, freq);
    }
  }
  fclose(out);
}

/* Write a two-column table (header "Kmer", "n") to "<outbase>.<ext>",
   one row per code from 0 to n_kmers - 1.  The k-mer text for each row
   comes from code_to_kmer(code, k) and the count from n[code]. */
void save_kmer(char *outbase, char *ext, int *n, int k, int n_kmers) {
  FILE *out = out_fopen(outbase, ext, "w");
  int kmer = 0;

  fprintf(out, "Kmer\tn\n");
  while(kmer < n_kmers) {
    fprintf(out, "%s\t%d\n", code_to_kmer(kmer, k), n[kmer]);
    kmer++;
  }
  fclose(out);
}

/* Extract the list of distinct k-mer codes found in a sequence.

   s, len:    the sequence and the number of characters to scan; every
              window s[i .. i+k-1] with 0 <= i <= len - k is encoded
              with kmer_to_code().
   k:         k-mer length.
   n_kmers:   length of in_list; codes outside [0, n_kmers) are skipped.
   list:      output.  Receives each distinct code once, in order of first
              appearance, terminated with -1.  It must have room for
              (len - k + 1) codes plus the terminator.
   in_list:   lookup table of n_kmers flags used to detect repeats.  It
              must be all zero on entry, unless last_list is given.
   last_list: optional (may be NULL) -1-terminated list of codes whose
              in_list flags are reset to 0 before scanning, which lets
              the caller reuse in_list without clearing all of it.

   Returns list. */
int *seq2code_list(char *s, int len, int k, int n_kmers,
                   int *list, int *in_list, int *last_list) {
  int *next = list;
  if(last_list) {
    int *ip;
    for(ip = last_list; *ip > -1; ip++) in_list[*ip] = 0;
  }
  int i;
  for(i = 0; i <= len - k; i++) {
    int code = kmer_to_code(s+i, k);
    /* A code outside the table would index in_list out of bounds.
       NOTE(review): presumably kmer_to_code() can yield such a value for
       windows containing non-ACGT characters -- confirm against Markov.h. */
    if (code < 0 || code >= n_kmers) continue;
    if (!in_list[code]) {
      in_list[code] = 1;
      *next = code;
      next++;
    }
  }
  *next = -1;
  return list;
}



/* Zero-allocate `count` elements of `size` bytes each.  On failure print
   a message and exit(1), so callers never receive NULL for a non-empty
   request. */
static void *alloc_or_die(size_t count, size_t size) {
  void *p = calloc(count, size);
  if(!p && count && size) {
    printf("Out of memory allocating %zu x %zu bytes\n", count, size);
    exit(1);
  }
  return p;
}

/* Program entry point.

   Usage: prog fasta_fn nts0 nts1 k nseq outbase

   Sequences are read from fasta_fn two at a time (seq0, seq1).  From each
   seq0 the first nts0 characters are kept (or the last |nts0| when nts0 is
   negative), likewise nts1 for seq1; pairs in which either sequence is too
   short are skipped.  At most nseq/2 pairs are read.  For the stored pairs
   the program counts distinct k-mers per region, per bin, and per k-mer
   pair, writes the count tables to outbase.*.tsv, and writes a Poisson
   upper-tail P value for every k-mer pair.  Progress and run information
   go to outbase.log.

   Returns 0 on success; argument errors go through die_usage(). */
int main(int argc, char **argv) {
  if (argc < 7) die_usage(argv[0], "Not enough arguments");

  FILE *seqin = fopen(argv[1], "r");
  if(!seqin) die_usage(argv[0], "Couldn't open seqin");

  int nts0 = atoi(argv[2]);
  int absnts0 = abs(nts0);
  int nts1 = atoi(argv[3]);
  int absnts1 = abs(nts1);
  int k = atoi(argv[4]);
  if(k < 1) die_usage(argv[0], "k must be a positive integer");
  if( (k > absnts0) || (k > absnts1))
    die_usage(argv[0], "k must not exceed |nts0| or |nts1|");

  int nseq = atoi(argv[5]);
  if(nseq < 2)
    die_usage(argv[0], "nseq must be at least 2 (sequences are read in pairs)");
  char *outbase = argv[6];

  /* Number of (seq0, seq1) slots allocated; at most this many pairs are
     read from the input. */
  int max_pairs = nseq / 2;

  /* Start the log with the date, host name and working directory.
     NOTE(review): outbase is pasted unquoted into a shell command line, so
     an outbase containing shell metacharacters will be interpreted by the
     shell. */
  char *logname = (char*)alloc_or_die(5+strlen(outbase), sizeof(char));
  strcpy(logname, outbase);
  strcat(logname, ".log");
  char *cmd = (char*)alloc_or_die(15+strlen(logname), sizeof(char));
  strcpy(cmd, "date >");
  strcat(cmd, logname);
  system(cmd);
  strcpy(cmd, "hostname >> ");
  strcat(cmd, logname);
  system(cmd);
  strcpy(cmd, "pwd >> ");
  strcat(cmd, logname);
  system(cmd);
  free(logname);
  free(cmd);

  /* Named logfp rather than log to avoid shadowing log() from <math.h>. */
  FILE *logfp = out_fopen(outbase, "log", "a");
  fprintf(logfp, "\n");

  fprintf(logfp, "seqin\t= %s\nnts0\t= %d\nnts1\t= %d\nk\t= %d\nnseq\t= %d\n",
          argv[1], nts0, nts1, k, nseq);
  fprintf(logfp, "N_BINS\t= %i\n", N_BINS);
  fprintf(logfp, "outbase\t= %s\n", outbase);

  fprintf(logfp, "\nMaking k-mer list\n");
  make_all_kmers(k);
  int n_kmers = pow4_fcn(k);

  /* The pair tables below are indexed with int expressions of the form
     code0 * n_kmers + code1; refuse values of k for which that overflows. */
  if(n_kmers <= 0 || n_kmers > INT_MAX / n_kmers) {
    fprintf(logfp, "k = %d is too large for the k-mer pair table\n", k);
    fclose(logfp);
    die_usage(argv[0], "k is too large for the k-mer pair table");
  }
  int n_kmer_pairs = n_kmers * n_kmers;

  fprintf(logfp, "\nGetting memory for storing sequence strings\n");
  fflush(logfp);

  /* seqs0[i] = "CGATCTCA...", seqs1[i] = "CGACGCTA..."
     'i' is the index of the sequence pair.  Every buffer is zero-filled
     and one byte longer than the region, so the strncpy() below always
     leaves a NUL-terminated string. */
  char **seqs0 = (char**)alloc_or_die((size_t)max_pairs, sizeof(char*));
  char **seqs1 = (char**)alloc_or_die((size_t)max_pairs, sizeof(char*));
  for(int i = 0; i < max_pairs; i++) {
    seqs0[i] = (char*)alloc_or_die((size_t)absnts0 + 1, sizeof(char));
    seqs1[i] = (char*)alloc_or_die((size_t)absnts1 + 1, sizeof(char));
  }

  fprintf(logfp, "\nstoring sequence strings\n");
  fflush(logfp);

  Seq *seq0, *seq1;

  /* n_pairs counts the pairs actually stored.  The loop stops once all
     allocated slots are used, so an input with more pairs than announced
     cannot write past the end of seqs0/seqs1. */
  int n_pairs = 0;
  while(n_pairs < max_pairs && (seq0 = next_Seq(seqin))) {
    seq1 = next_Seq(seqin);
    if(!seq1) {
      /* Unpaired trailing sequence: release it and stop. */
      destroy_Seq(seq0);
      break;
    }

    if((seq0->len >= absnts0) && (seq1->len >= absnts1)) {
      /* Positive nts: take the region from the start of the sequence;
         negative nts: take the last |nts| characters. */
      char *start0 = (nts0 > 0) ? seq0->seq :
        (seq0->seq + strlen(seq0->seq) + nts0);
      char *start1 = (nts1 > 0) ? seq1->seq :
        (seq1->seq + strlen(seq1->seq) + nts1);

      strncpy(seqs0[n_pairs], start0, absnts0);
      strncpy(seqs1[n_pairs], start1, absnts1);
      n_pairs++;
    }

    /* Released on every path, including pairs skipped for being short. */
    destroy_Seq(seq0);
    destroy_Seq(seq1);
  }
  fclose(seqin);
  fprintf(logfp, "successfully stored %i sequences (you said there would be %i)\n", 2*n_pairs, nseq);
  fflush(logfp);

  if(n_pairs == 0) {
    fprintf(logfp, "No sequence pairs of the requested length were found; nothing to do\n");
    fclose(logfp);
    printf("No sequence pairs of the requested length were found\n");
    exit(1);
  }

  fprintf(logfp, "\nInitializing Binning objects\n");
  fflush(logfp);

  /* Only the n_pairs slots that were filled are handed to the binning and
     to every later loop; unused slots hold empty strings. */
  Binning* bins0 = new_Binning(seqs0, n_pairs, k, 1);
  Binning* bins1 = new_Binning(seqs1, n_pairs, k, 1);

  fprintf(logfp, "\nGetting memory for first parse of G+C-content\n");
  fflush(logfp);
/*
S0, S1 are G+C content bins
H0, H1 are hexamers (or k-mers)
We look at N(H0H1) = Sum_{S0,S1} N(H0H1|S0S1)
And so we need to calculate
EN(H0H1|S0S1) = N(S0S1) * f(H0|S0) * f(H1|S1)
where f(Hi|Si) = N(Hi in Si) / N(Si)

So as we parse the sequences we have to calculate
N(H0H1)
N(S0)
N(S1)
N(S0,S1)
N(H0,S0)
N(H1,S1)
*/

  /* N(H0H1) = n_kmer_pair[code0 * n_kmers + code1] */
  int *n_kmer_pair = (int*)alloc_or_die((size_t)n_kmer_pairs, sizeof(int));

  /* N(S0) = n_gc0[s0], N(S1) = n_gc1[s1] */
  int rowsize0 = N_BINS;
  int rowsize1 = N_BINS;
  int *n_gc0 = (int*)alloc_or_die((size_t)rowsize0, sizeof(int));
  int *n_gc1 = (int*)alloc_or_die((size_t)rowsize1, sizeof(int));

  /* N(S0S1) = n_gc_pair[s0 * rowsize1 + s1] */
  int *n_gc_pair = (int*)alloc_or_die((size_t)rowsize0 * rowsize1, sizeof(int));

  /* N(Hi,Si) = n_kmer_by_gci[si * n_kmers + code] */
  int *n_kmer_by_gc0 = (int*)alloc_or_die((size_t)n_kmers * rowsize0, sizeof(int));
  int *n_kmer_by_gc1 = (int*)alloc_or_die((size_t)n_kmers * rowsize1, sizeof(int));

  /* These two are not necessary for the calculation but nice to
     have as controls anyway */
  int *n_kmer0 = (int*)alloc_or_die((size_t)n_kmers, sizeof(int));
  int *n_kmer1 = (int*)alloc_or_die((size_t)n_kmers, sizeof(int));

  Timer t;
  int iseq = 0;

  fprintf(logfp, "Now parsing sequences\n");
  fflush(logfp);

  start_timer(&t);
  t.info_wait = 30;  /* give progress updates about every half-minute */
  int *in_list = (int*)alloc_or_die((size_t)n_kmers, sizeof(int));
  /* A region of length L has L - k + 1 windows, plus one slot for the -1
     terminator written by seq2code_list(). */
  int *list0 = (int*)alloc_or_die((size_t)(absnts0 - k + 2), sizeof(int));
  int *list1 = (int*)alloc_or_die((size_t)(absnts1 - k + 2), sizeof(int));
  list1[0] = -1;
  for(int i = 0; i < n_pairs; i++) {
    int gc0 = get_bin_for_seq(bins0, 0, i);
    int gc1 = get_bin_for_seq(bins1, 0, i);
    /* All per-bin tables have N_BINS rows; refuse to index past them. */
    if(gc0 < 0 || gc0 >= rowsize0 || gc1 < 0 || gc1 >= rowsize1) {
      fprintf(logfp, "Bin index out of range for pair %d: %d, %d (N_BINS = %d)\n",
              i, gc0, gc1, N_BINS);
      fclose(logfp);
      exit(1);
    }
    n_gc0[gc0]++;
    n_gc1[gc1]++;
    n_gc_pair[gc0 * rowsize1 + gc1]++;

    int *i0, *i1;

    /* These two function calls make lists of the unique k-mers in each of
       the regions of interest.  Each call first clears the in_list flags
       left behind by the previously built list. */
    seq2code_list(seqs0[i], absnts0, k, n_kmers,
                  list0, in_list, list1);
    seq2code_list(seqs1[i], absnts1, k, n_kmers,
                  list1, in_list, list0);
    for(i1 = list1; *i1 > -1; i1++)  {
      n_kmer_by_gc1[gc1 * n_kmers + *i1]++;
      n_kmer1[*i1]++;
    }

    for(i0 = list0; *i0 > -1; i0++)  {
      n_kmer_by_gc0[gc0 * n_kmers + *i0]++;
      n_kmer0[*i0]++;
      for(i1 = list1; *i1 > -1; i1++)
        n_kmer_pair[(*i0) * n_kmers + *i1]++;
    }

    iseq += 2;
    timer_info(logfp, &t, iseq, nseq);
  }
  free(list0);
  free(list1);
  free(in_list);

  fprintf(logfp, "Now outputing GC_sq0, GC_sq1\n");

  save_gc(outbase, "GC_sq0.tsv", n_gc0, N_BINS-1);
  save_gc(outbase, "GC_sq1.tsv", n_gc1, N_BINS-1);

  fprintf(logfp,"Now outputing GC_pair\n");
  save_gc_pair(outbase, "GC_pair.tsv", n_gc_pair, N_BINS-1, N_BINS-1);

  fprintf(logfp,"Now outputing kmer_by_GC\n");
  save_kmer_by_gc(outbase, "kmer_by_GC_sq0.tsv", n_kmer_by_gc0, n_gc0, k,
                  n_kmers, N_BINS-1);
  save_kmer_by_gc(outbase, "kmer_by_GC_sq1.tsv", n_kmer_by_gc1, n_gc1, k,
                  n_kmers, N_BINS-1);

  fprintf(logfp,"Now outputing kmer counts\n");
  save_kmer(outbase, "kmer_sq0.tsv", n_kmer0, k, n_kmers);
  save_kmer(outbase, "kmer_sq1.tsv", n_kmer1, k, n_kmers);

  fprintf(logfp,"Now doing p-values and poisson statistics for kmer pairs\n");

  FILE *fp = out_fopen(outbase, "kmer_pairs.tsv","w");
  fprintf(fp, "KmerSq0\tKmerSq1\tn\tmuSCOA\tmu\tP\n");

  FILE *fp_sig = out_fopen(outbase, "kmer_pairs.sig.tsv", "w");
  fprintf(fp_sig, "KmerSq0\tKmerSq1\tn\tmuSCOA\tmu\tP\n");

  int code0,code1;
  /* Pairs with P at or below 1 / (number of pairs) also go to the .sig file. */
  double P_cutoff = 1.0 / n_kmer_pairs;
  int i_pair = 0;
  CollStats **cs = (CollStats**)alloc_or_die((size_t)n_kmer_pairs, sizeof(CollStats*));
  start_timer(&t);
  t.info_wait = 30;  /* give progress updates about every half-minute */
  for(code0 = 0; code0 < n_kmers; code0++) {
    for(code1 = 0; code1 < n_kmers; code1++) {
      /* Expected pair count ignoring the bins.  Computed in double so the
         product of the two counts cannot overflow int, and divided by the
         number of pairs actually analysed. */
      double muNonStratified =
        (double)n_kmer0[code0] * (double)n_kmer1[code1] / (double)n_pairs;

      double mu = 0;  /* poisson parameter */

      int gc0,gc1;
      for(gc0 = 0; gc0 < N_BINS; gc0++) {
        if(n_gc0[gc0] == 0) continue;  /* empty bin contributes nothing */
        double f0 = (double)(n_kmer_by_gc0[gc0 * n_kmers + code0]) / n_gc0[gc0];
        for(gc1 = 0; gc1 < N_BINS; gc1++) {
          if(n_gc1[gc1] == 0) continue;
          double f1 = (double)(n_kmer_by_gc1[gc1 * n_kmers + code1]) / n_gc1[gc1];
          int n = n_gc_pair[gc0 * rowsize1 + gc1];
          mu += n * f0 * f1;
        }
      }

      int n_pair = n_kmer_pair[code0 * n_kmers + code1];

      // Third argument is lower.tail = FALSE and says
      // to give P(X > x) and fourth is log.p = FALSE and
      // says to return the probability, not its logarithm.
      // With x = n_pair - 1 this is P(X >= n_pair).
      double P = ppois(n_pair - 1, mu, 0, 0);

      cs[code0 * n_kmers + code1] = new_CollStats(code0, code1, n_pair, muNonStratified, mu, P);

      if(P <= P_cutoff)
        fprintf(fp_sig, "%s\t%s\t%d\t%g\t%g\t%g\n",
                code_to_kmer(code0, k),
                code_to_kmer(code1, k),
                n_pair, muNonStratified, mu, P);
      timer_info(logfp, &t, ++i_pair, n_kmer_pairs);
    }
  }

  fprintf(logfp, "Done calculating collocation statistics. Now sorting for output\n");
  fflush(logfp);

  /* The quicksortCS() call that used to sit here is disabled, so the pairs
     are written in code order (code0 major, code1 minor). */
  fprintf(logfp, "Done sorting. Writing sorted kmer pairs\n");
  fflush(logfp);
  for(i_pair = 0; i_pair < n_kmer_pairs; i_pair++) {
    write_CollStats(cs[i_pair], fp, k, 1);
    free(cs[i_pair]);
  }
  free(cs);
  fclose(fp);
  fclose(fp_sig);

  free_all_kmers();
  free(n_gc0);
  free(n_gc1);
  free(n_gc_pair);
  free(n_kmer_by_gc0);
  free(n_kmer_by_gc1);
  free(n_kmer0);
  free(n_kmer1);
  free(n_kmer_pair);

  /* The Binning objects were built from seqs0/seqs1, so release them
     before the sequence buffers.
     NOTE(review): confirm in ExpBin.h that destroy_Binning() does not
     itself free the sequence arrays it was given. */
  destroy_Binning(bins0);
  destroy_Binning(bins1);

  for(int i = 0; i < max_pairs; i++) {
    free(seqs0[i]);
    free(seqs1[i]);
  }
  free(seqs0);
  free(seqs1);

  fputs(current_date(), logfp);
  fprintf(logfp, "\n");
  fclose(logfp);

  return 0;
}
