/************************************************************************************************************************************************************************************/
/*  Date: 2010 Aug. 5                                                                                                                                                               */
/*  Author: Gung-wei CHirn                                                                                                                                                          */
/*  Function: Calculate coverage for each base pair on genome                                                                                                                       */
/*            and create cluster of reads                                                                                                                                           */
/*            used in microRNA analysis to identify the genome                                                                                                                      */
/*            spots where produces a lot of microRNA.                                                                                                                               */
/*  Input: sam file (reads map to genome)                                                                                                                                           */
/************************************************************************************************************************************************************************************/

#include <string.h>
#include <stdio.h>
#include <stdlib.h>
#include <search.h>
#include <sys/types.h>
#include <unistd.h>
#include <time.h>
#include <math.h>


#include "chin_lib.h"
#include "chin_stat.h"

/* Size, in bytes, of the line buffers main() uses while reading its input files. */
#define MXLGH	1000000
/*
 * Absolute paths to external helper programs.
 * None of these macros is referenced by the code visible in this file.
 */
#define INTERSECTBED    "/nlmusr/gchirn/linux/TOOLS/bin/intersectBed"
#define GROUP           "/nlmusr/gchirn/linux/CORE/bin/group"
#define LT_CREATE_IDX   "/nlmusr/gchirn/linux/CORE/bin/lt_create_idx"
#define LTR             "/nlmusr/gchirn/linux/CORE/bin/ltr_gwc"
#define SUM             "/nlmusr/gchirn/linux/CORE/bin/sum"
/* Print the command-line help to stdout and terminate with exit status 2. */
void usage();
/* tsearch()/tfind() comparison callback: orders NOISE_NODE records by gene name. */
int nd_cmp(const void *x,const void *y);
/* twalk()-style callback prototype; no definition of action() appears in the code visible here. */
void action(const void *nodep, const VISIT which, const int depth);
/*
 * Randomised simulation of clip pile-up on one transcript.  Stores the mean
 * and standard deviation of the simulated peak heights in *avg and *sd and
 * returns an integer cutoff derived from them and p_cutoff.
 */
int simulation(int mrna_length, double mrna_coverage, double avg_clip_length,int clip_count,int repeat,double p_cutoff, double *avg, double *sd);

/* Record stored in the tsearch() tree that main() builds from the gene-length-mrna file. */
typedef struct noise_node{  /* one node per gene */
  char  *gene;   /* gene name; the key compared by nd_cmp() */
  int   count;   /* noise level: the cutoff returned by simulation() for this gene */
}NOISE_NODE;

/* Window, Window_calculation, CLUSTER_CUTOFF and Rpm are not referenced by the code visible in this file. */
int Window,Window_calculation;
float CLUSTER_CUTOFF;
float Rpm;
/* Output stream for all results; main() sets it to stdout and replaces it with the -o file when given. */
FILE *Out;

main(argc, argv, envp)
int argc;
char **argv, **envp;
{
  char gene_length_mrna[1000],gene_bed[1000],out_gene_bed[1000];
  double clip_length,total_clip_count,total_mrna_count,mrna_length,mrna_count;

  int pid,clip_count;
  char a[MXLGH],b[MXLGH],c,**pp,*pt;
  FILE *tmpf;
  double total_clip,total_clip_coverage,total_mrna,total_mrna_coverage,avg,sd,norm_factor;

  int i,j,m,n;
  FILE *in;
  NOISE_NODE *ptr=NULL,*sqpt;
  void *root=NULL,*p=NULL;
  float total;

  clip_length=total_clip_count=total_mrna_count=norm_factor=0;in=stdin;Out=stdout;
  gene_length_mrna[0]=gene_bed[0]=out_gene_bed[0]='\0';
/*
   printf("  -i: input gene-length-mrna file. one entry per gene, tab-delimited. default: mandatory\n");
   printf("  -l: average clip length. default: mandatory\n");
   printf("  -t: total clip count. default: mandatory\n");
   printf("  -m: total mrna count. default: mandatory\n");
   printf("  -r: gene bed file. default: mandatory\n");
   printf("  -R: modified gene bed file. default: mandatory\n");
   printf("  -o: output file. default: stdout\n");
*/
  
  for(i=1;i<argc;i++){
    if(argv[i][0]!='-')continue;c=argv[i][1];
    switch(c){
      case 'i': i++;strcpy(gene_length_mrna,argv[i]);argv[i-1]=argv[i]=NULL;break;
      case 'l': i++;clip_length=atof(argv[i]);argv[i-1]=argv[i]=NULL;break;
      case 't': i++;total_clip_count=atof(argv[i]);argv[i-1]=argv[i]=NULL;break;
      case 'm': i++;total_mrna_count=atof(argv[i]);argv[i-1]=argv[i]=NULL;break;
      case 'r': i++;strcpy(gene_bed,argv[i]);argv[i-1]=argv[i]=NULL;break;
      case 'R': i++;strcpy(out_gene_bed,argv[i]);argv[i-1]=argv[i]=NULL;break;
      case 'y': i++;norm_factor=atof(argv[i]);argv[i-1]=argv[i]=NULL;break;
      case 'o': i++;
                if((Out=fopen(argv[i],"w"))==NULL){
                  fprintf(stderr, "Can't open output file %s\n",argv[i]);exit(2);
                }argv[i-1]=argv[i]=NULL;break;
      case 'h': usage();
    }
  }
  if(gene_length_mrna[0]=='\0'){printf("ERROR: gene_length_mrna file is required.\n"); usage();}
  if(clip_length<20){printf("ERROR: clip_length too short: %lf\n",clip_length); usage();}
  if(total_clip_count<20){printf("ERROR: total_clip_count is required: %lf\n",total_clip_count); usage();}
  if(total_mrna_count<20){printf("ERROR: total_mrna_count is required: %lf\n",total_mrna_count); usage();}
  if(gene_bed[0]=='\0'){printf("ERROR: gene_bed is required.\n"); usage();}
  if(out_gene_bed[0]=='\0'){printf("ERROR: out_gene_bed is required.\n"); usage();}
  if(norm_factor<=0)norm_factor=total_clip_count;
  pid=getpid();


  srand((unsigned int)time(NULL));
  fprintf(Out,">clip_length = %lf\n",clip_length);fflush(stdout);
  fprintf(Out,">total_clip_count = %lf\n",total_clip_count);fflush(stdout);
  fprintf(Out,">total_mrna_count = %lf\n",total_mrna_count);fflush(stdout);
  fprintf(Out,">Gene\tmrna_length\tclip_length\test_clip_count\tmrna_count\tavg\tsd\tcutoff\tcutoff(RPM)\n"); fflush(Out);
  in=fopen(gene_length_mrna,"r");
  while(nexttext(a,in)!=0){
    pp=strsplit(a,"\t",&i);
    strcpy(b,pp[0]+1);
    mrna_length=atof(pp[1]);
    mrna_count=log(atof(pp[1]));
    if(mrna_count<0)mrna_count=0;
    clip_count=mrna_count*total_clip_count/total_mrna_count+0.5;
    j=simulation((int)mrna_length,mrna_count,clip_length,clip_count,(int)500,(double)(0.01),&avg,&sd);
    fprintf(Out,">%s\t%d\t%lf\t%d\t%lf\t%lf\t%lf\t%d\t%lf\n",b,(int)mrna_length,clip_length,clip_count,mrna_count,avg,sd,j,j*1000000.0/total_clip_count); fflush(Out);
    ptr=(NOISE_NODE *)malloc((size_t)sizeof(NOISE_NODE));
    ptr->gene=strdup(b);
    ptr->count=j;
    p=tsearch((void *)ptr,&root,nd_cmp);
    if(p==NULL){fprintf(Out,"error: p==NULL %s\n",ptr->gene);exit(0);}
    if(p==ptr){fprintf(Out,"error: duplicate %s\n",ptr->gene);exit(0);}
  }fclose(in);
  in=fopen(gene_bed,"r");
  ptr=(NOISE_NODE *)malloc((size_t)sizeof(NOISE_NODE));
  ptr->gene=(char *)malloc((size_t)1000*sizeof(char));
  while(nexttext(a,in)!=0){
    pp=strsplit(a,"\t",&i);
    strcpy(ptr->gene,pp[3]);
    p=tfind((void *)ptr,&root,nd_cmp);
    if(p==NULL){j=1;}
    else {
      sqpt=*(NOISE_NODE **)p;
      j=sqpt->count;
    }
    m=atoi(pp[1])-100;if(m<1)m=1;
    n=atoi(pp[2])+100;
    fprintf(Out,"%s\t%d\t%d\t%s\t%d\t%s\n",pp[0],m,n,pp[3],j,pp[5]); fflush(stdout);
  }fclose(in);
/*
 chr2L   12918457        12918803        ACXB    0       +
*/
 

}

/*
 * Estimate how high a clip pile-up gets by chance on one transcript.
 *
 *   mrna_length      transcript length in bases (must be >= 1)
 *   mrna_coverage    coverage multiplier; the simulated sequence length is
 *                    mrna_coverage * mrna_length, but never less than mrna_length
 *   avg_clip_length  average fragment length used to choose the number of cut
 *                    points (must be > 0)
 *   clip_count       fragments sampled in every round
 *   repeat           number of simulation rounds (must be >= 1)
 *   p_cutoff         tail probability used for the final cutoff search
 *   avg, sd          out: mean and standard deviation of the per-round peaks
 *
 * Each round draws n_cut distinct random cut positions, then samples
 * clip_count fragments between neighbouring cuts, folds every fragment back
 * onto 1-based transcript coordinates with "% mrna_length" and counts the
 * per-base depth.  The round's peak comes from maxima() (presumably the
 * largest of the mrna_length depth values - confirm in chin_stat).
 *
 * Returns the loop counter after the first integer i for which
 * z_score_to_p_value((i - avg) / sd) is no longer below 1 - p_cutoff, i.e.
 * one more than that i.  For invalid arguments a message is written to
 * stderr, *avg and *sd are set to 0 and 1 is returned (the smallest value the
 * search yields when p_cutoff < 1).  Allocation failure terminates the
 * program with status 2.
 *
 * The order of rand() calls is unchanged from the original implementation.
 */
int simulation(int mrna_length, double mrna_coverage, double avg_clip_length,int clip_count,int repeat,double p_cutoff, double *avg, double *sd)
{
int i,j,k,n_cut,max_cut,m,n;
int *cut;
double *cutpoint;
double *peak,p,wanted_cuts;

  /* mrna_length is a modulus and avg_clip_length a divisor below. */
  if(mrna_length<1 || repeat<1 || !(avg_clip_length>0)){
    fprintf(stderr,"simulation: invalid arguments (mrna_length=%d, avg_clip_length=%lf, repeat=%d)\n",mrna_length,avg_clip_length,repeat);
    (*avg)=0;(*sd)=0;
    return(1);
  }

  mrna_coverage*=mrna_length;
  /* Written as a negated >= so that a NaN coverage is also replaced. */
  if(!(mrna_coverage>=mrna_length))mrna_coverage=mrna_length;

  /* Cut positions are drawn without replacement from 1..(mrna_coverage-1);
     asking for more cuts than there are positions would make the rejection
     loop below spin forever, so clamp the request. */
  max_cut=(int)(mrna_coverage-1);
  if(max_cut<1)max_cut=1;
  wanted_cuts=mrna_coverage/avg_clip_length+0.5;
  if(wanted_cuts>max_cut)n_cut=max_cut;else n_cut=(int)wanted_cuts;

  cutpoint=malloc((size_t)(mrna_coverage+10)*sizeof(double));
  peak=malloc(((size_t)repeat+10)*sizeof(double));
  cut=malloc(((size_t)n_cut+10)*sizeof(int));
  if(cutpoint==NULL || peak==NULL || cut==NULL){
    fprintf(stderr,"simulation: out of memory\n");
    exit(2);
  }
  for(i=0;i<=repeat;i++)peak[i]=0;

  for(k=0;k<repeat;k++){

    /* Stage 1: cutpoint[] is used as a 0/1 flag array marking the cut sites. */
    for(i=0;i<=n_cut;i++)cut[i]=0;
    for(i=0;i<=mrna_coverage;i++)cutpoint[i]=0;
    for(i=0;i<n_cut;i++){
      j = 1 + (int) ((mrna_coverage-1) * (rand() / (RAND_MAX + 1.0)));
      if(cutpoint[j]==1)i--;else cutpoint[j]=1;   /* already taken: draw again */
    }

    /* Collect the flagged positions in ascending order into cut[1..n_cut],
       then add the two sentinels: 0 in front, the sequence end behind. */
    j=1;
    for(i=0;i<=mrna_coverage;i++){
      if(cutpoint[i]==1)cut[j++]=i;
    }
    cut[0]=0;
    cut[j]=mrna_coverage;

    /* Stage 2: cutpoint[1..mrna_length] is reused as the per-base depth counter. */
    for(i=1;i<=mrna_length;i++)cutpoint[i]=0;
    for(i=0;i<clip_count;i++){
      /* NOTE(review): j ranges over 1..n_cut, so the fragment between the last
         cut and the sequence end is never drawn; kept as in the original. */
      j = 1 + (int) (n_cut * (rand() / (RAND_MAX + 1.0)));
      m=(cut[j-1]+1)%mrna_length+1;
      n=cut[j]%mrna_length+1;
      if(m>n)m=1;   /* fragment wrapped past the transcript end: count from base 1 */
      for(j=m;j<=n;j++)cutpoint[j]++;
    }
    peak[k]=maxima(cutpoint+1,(double)mrna_length);
  }

  (*avg)=mean(peak,(long)repeat);
  (*sd)=standard_deviation(peak,(long)repeat);

  /* Walk i upward until the p-value reaches 1 - p_cutoff. */
  i=0;
  p=0;
  while(p<1-p_cutoff){
    p=z_score_to_p_value((i-(*avg))/(*sd));
    i++;
  }

  free(peak);free(cut);free(cutpoint);
  return(i);
}
/*
       #include <stdlib.h>
       int rand(void);
       int rand_r(unsigned int *seedp);
       void srand(unsigned int seed);

              "If you want to generate a random integer between 1 and 10, you should always do it by using high-order bits, as in
                     j = 1 + (int) (10.0 * (rand() / (RAND_MAX + 1.0)));
              and never by anything resembling
                     j = 1 + (rand() % 10);
              (which uses lower-order bits)."
*/


int nd_cmp(const void *x,const void *y)
{
  return(strcmp( ((NOISE_NODE *)x)->gene,((NOISE_NODE *)y)->gene) );
}


/*
 * Print the command-line help to stdout and terminate the program with exit
 * status 2.  Never returns.
 */
void usage(void)
{
   printf("ngs_remove_clip_noise_by_mrna -i gene_length_mrna_file -l avg_clip_length -t total_clip_count -m total_mrna_count -r gene_bed_file -R modified_gene_bed_file [-y norm_factor] [-o output_file]\n");
   printf("  -i: input gene-length-mrna file. one entry per gene, tab-delimited. default: mandatory\n");
   printf("  -l: average clip length. default: mandatory\n");
   printf("  -t: total clip count. default: mandatory\n");
   printf("  -m: total mrna count. default: mandatory\n");
   printf("  -r: reference gene bed file. default: mandatory\n");
   printf("  -R: modified gene bed file. default: mandatory\n");
   printf("  -y: normalized factor. default: total clip count\n");
   printf("  -o: output file. default: stdout\n");
   printf("  -h: print this help and exit\n");

   printf("\nExample: ngs_remove_clip_noise_by_mrna -i gene_length_mrna.txt -l 30 -t 1000000 -m 2000000 -r genes.bed -R genes.noise.bed -o noise.out\n");
   exit(2);
}
/* Usage: ngs_remove_clip_noise_by_mrna -i gene_length_mrna -l avg_clip_length -t total_clip_count -m total_mrna_count -r reference_gene_bed -R modified_gene_bed [-y norm_factor] [-o output_file] */

