/*******************************************************************/
/*  Date: 2010 Aug. 5                                              */
/*  Author: Gung-wei CHirn                                         */
/*  Function: Create a gene centric table.                         */
/*                                                                 */
/*                                                                 */
/*                                                                 */
/*  Input:                                                         */
/*    1. gene bed file:                                            */
/*       chr4    24052   25619   JYalpha 0       +                 */
/*       chr4    53433   64403   plexB   0       -                 */
/*       chr4    68333   77667   ci      0       -                 */
/*       chr4    86744   87863   RpS3A   0       -                 */
/*       chr4    89955   129430  pan     0       +                 */
/*       chr4    137014  150378  Ank     0       -                 */
/*       chr4    152849  163754  CG32000 0       +                 */
/*       chr4    171392  174174  CG32006 0       +                 */
/*       chr4    175334  176678  CG31997 0       -                 */
/*    2. gene cds file:                                            */
/*       chr4    24052   25619   JYalpha 0       +                 */
/*       chr4    53643   64050   plexB   0       -                 */
/*       chr4    68619   76494   ci      0       -                 */
/*       chr4    86898   87806   RpS3A   0       -                 */
/*       chr4    93055   129118  pan     0       +                 */
/*       chr4    137494  150052  Ank     0       -                 */
/*       chr4    155154  163421  CG32000 0       +                 */
/*       chr4    172376  173894  CG32006 0       +                 */
/*       chr4    175404  176529  CG31997 0       -                 */
/*       chr4    179843  196605  CG33978 0       -                 */
/*    3. coverage file (-v); only its '##' cluster lines are read:  */
/*    Coverage line: line not starting with '##'                   */
/*    id:start-stop   uniq_read(+)    uniq_read(-)    uniq_read       normalized_read(+)      normalized_read(-)      normalized_read */
/*    chr2L:1-5000 0 0 0 9 6 15                                    */
/*    chr2L:5001-10000 4 2164 2168 11 2260 2271                    */
/*    chr2L:10001-15000 2 8221 8223 5 8272 8277                    */
/*    chr2L:15001-20000 0 152 152 8 175 183                        */
/*    chr2L:20001-25000 390 1 391 402 19 421                       */
/*    chr2L:25001-30000 1554 5 1559 1572 10 1582                   */
/*    chr2L:30001-35000 1477 2 1479 1486 6 1492                    */
/*    Cluster line: line starting with '##'                        */
/*    ##      id:start-stop   length  cluster_uniq_read(+)    cluster_uniq_read(-)    cluster_uniq_read       cluster_normalized_read(+)      cluster_normalized_read(-)     cluster_normalized_read */
/*    ##      chr2L:1-85000   85000   9351    10598   19949   9576    11018   20594 */
/*    ##      chr2L:90001-105000      15000   297     149     446     322     167     489 */
/*******************************************************************/

#include <string.h>
#include <stdio.h>
#include <stdlib.h>
#include <sys/types.h>
#include <unistd.h>

#include "chin_lib.h"

#define MXLGH	100000


/*
 * One annotation record (one line of the refseq BED file given with -r),
 * as filled in by main() and consumed by process_gene().
 */
typedef struct entry_node{
  long  start;            /* working span start: set to gene_start, may be lowered when overlapping entries are merged in main() */
  long  stop;             /* working span stop: set to gene_stop, may be raised by the same merge */
  char  chr[1000];        /* chromosome name (input column 1) */
  long  gene_start;       /* annotated start (input column 2); exon block starts are offsets from it */
  long  gene_stop;        /* annotated stop (input column 3) */
  char  gene_name[1000];  /* name (input column 4) */
  char  strand;           /* first character of input column 6; process_gene() treats anything but '+' as minus */
  long  cds_start;        /* coding start (input column 7) */
  long  cds_stop;         /* coding stop (input column 8) */
  int   n_bk;             /* number of blocks (input column 10) */
  char  bk_length[5000];  /* comma-separated block lengths (input column 11), kept as text */
  char  bk_starts[5000];  /* comma-separated block start offsets (input column 12), kept as text */
}ENTRY_NODE;

/* Count reads for entries [x,y) of `entry` and write one table row to Out. */
void process_gene(int x,int y,ENTRY_NODE *entry);
/* qsort comparator over ENTRY_NODE, keyed on the `start` field. */
static int entry_cmp(const void *x,const void *y);
/* Print the option summary and exit with status 2. */
void usage();
/*
 * Pid:           process id from getpid(); embedded in every temporary file name.
 * Extend_window: bases added on each side of a gene (-e, default 500).
 * Cluster_only:  1 when -l is given: reads are first restricted to the clusters.
 */
int Pid,Extend_window,Cluster_only;
/* Destination of the table (-o, default stdout). */
FILE *Out;
/* Factor applied to every normalized count when printing (1000000/N for -n N, default 1). */
double Rpm;

/* Capacity of the table of buffered annotation entries. */
#define GC_MAX_ENTRY 1000

/*
 * Merge and report the buffered entries.
 *
 * Entries whose [start,stop] spans overlap are repeatedly widened to their
 * common span until nothing changes, the table is sorted with entry_cmp(),
 * and every run of entries sharing the same start is handed to
 * process_gene() as the half-open index range [first,last).
 * Calling it with n_entry==0 does nothing (process_gene() ignores empty
 * ranges).
 */
static void gc_flush_entries(ENTRY_NODE *entry,int n_entry)
{
  int i,j,change,pre_entry;
  long pre_start;

  do{
    change=0;
    for(i=0;i<n_entry;i++){
      for(j=i+1;j<n_entry;j++){
        if(entry[i].start > entry[j].stop)continue;
        if(entry[j].start > entry[i].stop)continue;
        if(entry[i].start==entry[j].start&&entry[i].stop==entry[j].stop)continue;
        if(entry[i].start > entry[j].start)entry[i].start=entry[j].start;else entry[j].start=entry[i].start;
        if(entry[i].stop > entry[j].stop)entry[j].stop=entry[i].stop;else entry[i].stop=entry[j].stop;
        change=1;
      }
    }
  }while(change==1);
  qsort(entry,n_entry,sizeof(ENTRY_NODE),entry_cmp);

  pre_start=-1;pre_entry=0;
  for(i=0;i<n_entry;i++){
    if(entry[i].start==pre_start)continue;
    process_gene(pre_entry,i,entry);
    pre_start=entry[i].start;
    pre_entry=i;
  }
  process_gene(pre_entry,i,entry);
}

/*
 * Entry point.
 *
 * Options: -r refseq BED file, -v coverage file, -m mapping BED file
 * (all three are required), -e extension window, -n read count used for
 * the per-million scaling, -l count only reads inside clusters,
 * -o output file, -h help.
 *
 * Steps:
 *   1. extract the '##' cluster lines of the coverage file into
 *      z-<pid>-cluster.bed;
 *   2. split the mapping file into one z0-<pid>-<chr> file per chromosome;
 *   3. print the table header;
 *   4. read the refseq file sorted by name, chromosome and column 5, and
 *      report every record through process_gene().
 *
 * Returns 0; exits with status 2 on usage or I/O errors.
 */
int main(int argc, char **argv, char **envp)
{
  char a[MXLGH],b[MXLGH],mapping_file[1000],coverage_file[1000],refseq_file[1000],c,**pp;
  char pre_chr[1000],pre_gene_name[1000],pre_strand;
  int i,n_entry;
  double depth;
  FILE *refseqfid,*tmpfid;
  /* static: the table is about 12 MB, too large for automatic storage */
  static ENTRY_NODE entry[GC_MAX_ENTRY];
  static ENTRY_NODE cur;

  (void)envp;
  Pid=getpid();Extend_window=500;Cluster_only=0;
  Out=stdout;Rpm=1;
  refseq_file[0]=coverage_file[0]=mapping_file[0]='\0';

  for(i=1;i<argc;i++){
    if(argv[i][0]!='-')continue;
    c=argv[i][1];
    /* every option except -l and -h consumes the following argument */
    if(c!='\0'&&strchr("orvmen",c)!=NULL){
      if(i+1>=argc)usage();
      i++;
    }
    switch(c){
      case 'o': if((Out=fopen(argv[i],"w"))==NULL){
                  fprintf(stderr, "Can't open output file %s\n",argv[i]);exit(2);
                }break;
      case 'r': snprintf(refseq_file,sizeof refseq_file,"%s",argv[i]);break;
      case 'v': snprintf(coverage_file,sizeof coverage_file,"%s",argv[i]);break;
      case 'm': snprintf(mapping_file,sizeof mapping_file,"%s",argv[i]);break;
      case 'e': Extend_window=atoi(argv[i]);
                if(Extend_window<0)Extend_window=500;
                break;
      case 'n': depth=atof(argv[i]);
                /* a zero or negative count would give an infinite or negative factor */
                Rpm=(depth>0)?(1000000.0/depth):1;
                break;
      case 'l': Cluster_only=1;break;
      case 'h': usage();break;
      default:  break;
    }
  }
  /* the three input files are used unconditionally below */
  if(refseq_file[0]=='\0'||coverage_file[0]=='\0'||mapping_file[0]=='\0')usage();

  /* cluster lines of the coverage file -> z-<pid>-cluster.bed */
  snprintf(a,sizeof a,"grep '^##' %s | grep -v 'id:start-stop' | cut -f2,6,9 | sed 's/:/-/' | sed 's/\t/-/' | sed 's/\t/:/' | tr '-' '\t' > z-%d-cluster.bed",coverage_file,Pid);system(a);

  /* one mapping file per chromosome: z0-<pid>-<chr> */
  snprintf(a,sizeof a,"cut -f1 %s | grep -v 'track' | uniq | sort -u",mapping_file);
  if((tmpfid=popen(a,"r"))==NULL){
    fprintf(stderr, "Can't run command: %s\n",a);exit(2);
  }
  while(nexttext(a,tmpfid)!=0){
    snprintf(b,sizeof b,"grep    '^%s\t' %s | cut -f1-4,6 | grep -E ':[0-9]\t|:[0-9][0-9]\t' > z0-%d-%s",a,mapping_file,Pid,a);system(b);
  }
  pclose(tmpfid);

  fprintf(Out,"Gene\tLocus\tChromosome\tStrand\tAnnotated_Start\tAnnotated_Stop\tLocus_Start\tLocus_Stop\tLength\t5'UTR_Length\tCoding_Length\t3'UTR_Length\tUniq_5'UTR\tUniq_CDS\tUniq_3'UTR\tUniq_Locus\tTot_5'UTR\tTot_CDS\tTot_3'UTR\tTot_Locus\tNor_5'UTR\tNor_CDS\tNor_3'UTR\tNor_Locus\t");fflush(Out);
  fprintf(Out,"Uniq_5'UTR(+)\tUniq_CDS(+)\tUniq_3'UTR(+)\tUniq_Locus(+)\tTot_5'UTR(+)\tTot_CDS(+)\tTot_3'UTR(+)\tTot_Locus(+)\tNor_5'UTR(+)\tNor_CDS(+)\tNor_3'UTR(+)\tNor_Locus(+)\t");fflush(Out);
  fprintf(Out,"Uniq_5'UTR(-)\tUniq_CDS(-)\tUniq_3'UTR(-)\tUniq_Locus(-)\tTot_5'UTR(-)\tTot_CDS(-)\tTot_3'UTR(-)\tTot_Locus(-)\tNor_5'UTR(-)\tNor_CDS(-)\tNor_3'UTR(-)\tNor_Locus(-)\n");fflush(Out);

/*
Expected refseq line (12 tab-separated columns):
chrII   1866    4663    2L52.1  0       +       1866    4663    0       7       45,189,151,106,147,183,463,     0,639,871,1064,1539,1935,2334,
*/
  /* -k4,4 -k1,1 -k5,5 is the current spelling of the obsolete "+3 -4 +0 -1 +4 -5" keys */
  snprintf(a,sizeof a,"sort -k4,4 -k1,1 -k5,5 %s",refseq_file);
  if((refseqfid=popen(a,"r"))==NULL){
    fprintf(stderr, "Can't run command: %s\n",a);exit(2);
  }
  pre_chr[0]='\0';
  pre_gene_name[0]='\0';
  pre_strand=' ';
  n_entry=0;
  while(nexttext(a,refseqfid)!=0){
    /* NOTE(review): the line is assumed to carry at least 12 columns; the
       count reported by strsplit() is not checked because its exact
       meaning is not visible in this file. */
    pp=strsplit(a,"\t",&i);
    /* Copy every field out of pp before process_gene() runs: it calls
       strsplit() again, which may reuse the storage behind pp (assumption). */
    snprintf(cur.chr,sizeof cur.chr,"%s",pp[0]);
    cur.gene_start=atol(pp[1]);
    cur.gene_stop=atol(pp[2]);
    snprintf(cur.gene_name,sizeof cur.gene_name,"%s",pp[3]);
    cur.strand=*(pp[5]);
    cur.cds_start=atol(pp[6]);
    cur.cds_stop=atol(pp[7]);
    cur.n_bk=atoi(pp[9]);
    snprintf(cur.bk_length,sizeof cur.bk_length,"%s",pp[10]);
    snprintf(cur.bk_starts,sizeof cur.bk_starts,"%s",pp[11]);
    cur.start=cur.gene_start;
    cur.stop=cur.gene_stop;

    if(strcmp(pre_chr,cur.chr)==0&&strcmp(pre_gene_name,cur.gene_name)==0&&pre_strand==cur.strand
       && n_entry<GC_MAX_ENTRY   && 1==0  /*** treat isoforms individually ***/  ){
      /* grouping of isoforms is switched off by the 1==0 above */
      entry[n_entry++]=cur;
      continue;
    }

    /* a new gene starts: report what was buffered, then buffer this record */
    gc_flush_entries(entry,n_entry);
    entry[0]=cur;
    n_entry=1;

    snprintf(pre_chr,sizeof pre_chr,"%s",cur.chr);
    snprintf(pre_gene_name,sizeof pre_gene_name,"%s",cur.gene_name);
    pre_strand=cur.strand;
  }
  /* report the last buffered gene */
  gc_flush_entries(entry,n_entry);
  pclose(refseqfid);
  fclose(Out);
  return 0;
}


void process_gene(int x,int y,ENTRY_NODE *entry)
{
char a[1000],b[1000],chr[1000],gene_name[1000],strand,**pp,read_strand,read[10000];
int i,j,k,read_fq,read_locus;
long bk_str[1000],bk_stp[1000],utr5_length,utr3_length,cds_length,u,v;
long start,stop,locus_start,locus_stop;
int locus_read_tot_p,locus_read_unq_p,locus_read_tot_n,locus_read_unq_n;
double locus_read_nrm_p,locus_read_nrm_n;
int utr5_read_tot_p,utr5_read_unq_p,utr5_read_tot_n,utr5_read_unq_n;
double utr5_read_nrm_p,utr5_read_nrm_n;
int utr3_read_tot_p,utr3_read_unq_p,utr3_read_tot_n,utr3_read_unq_n;
double utr3_read_nrm_p,utr3_read_nrm_n;
int cds_read_tot_p,cds_read_unq_p,cds_read_tot_n,cds_read_unq_n;
double cds_read_nrm_p,cds_read_nrm_n;
FILE *tmpfid;

  
  if(y<=x)return;
/*
for(i=x;i<y;i++){
printf("%d ",i);
printf("%ld ",entry[i].start);
printf("%ld ",entry[i].stop);
printf("%s ",entry[i].chr);
printf("%ld ",entry[i].gene_start);
printf("%ld ",entry[i].gene_stop);
printf("%s ",entry[i].gene_name);
printf("%c ",entry[i].strand);
printf("%ld ",entry[i].cds_start);
printf("%ld ",entry[i].cds_stop);
printf("%d ",entry[i].n_bk);
printf("%s ",entry[i].bk_length);
printf("%s\n",entry[i].bk_starts);
fflush(stdout);
}
*/
  locus_start=(entry[x].start-Extend_window<0)?(0):(entry[x].start-Extend_window);
  locus_stop=entry[x].stop+Extend_window;
  strand=entry[x].strand;
  strcpy(gene_name,entry[x].gene_name);
  strcpy(chr,entry[x].chr);
/***********************************/
/* obtain locus_start & locus_stop */
/***********************************/

/* TURNOFF cluster - put it back later
*/
  sprintf(a,"z1-%d.bed",Pid);
  tmpfid=fopen(a,"w");
  fprintf(tmpfid,"%s\t%ld\t%ld\n",entry[x].chr,locus_start,locus_stop);
  fclose(tmpfid);
  sprintf(a,"/nlmusr/gchirn/linux/TOOLS/bin/intersectBed -a z-%d-cluster.bed -b z1-%d.bed -wa",Pid,Pid);
  tmpfid=popen(a,"r");
  while(nexttext(a,tmpfid)!=0){
    sscanf(a,"%s %ld %ld",chr,&start,&stop);
    if(start<locus_start)locus_start=start;
    if(stop>locus_stop)locus_stop=stop;
  }pclose(tmpfid);
/*
*/

/*********************/
/* obtain locus_read */
/*********************/
  sprintf(a,"z1-%d.bed",Pid);
  tmpfid=fopen(a,"w");
  fprintf(tmpfid,"%s\t%ld\t%ld\t%c\n",chr,locus_start,locus_stop,strand);
  fclose(tmpfid);
  if(Cluster_only==1){
    sprintf(a,"/nlmusr/gchirn/linux/TOOLS/bin/intersectBed -a z0-%d-%s -b z-%d-cluster.bed -wa | sort -u > z9-%d; mv z9-%d z0-%d-%s",Pid,chr,Pid,Pid,Pid,Pid,chr);
    system(a);
  }
  sprintf(a,"/nlmusr/gchirn/linux/TOOLS/bin/intersectBed -a z0-%d-%s -b z1-%d.bed -wa | sort -u | tee z0-%d-sub",Pid,chr,Pid,Pid);
  tmpfid=popen(a,"r");
  locus_read_tot_p=locus_read_nrm_p=locus_read_unq_p=locus_read_tot_n=locus_read_nrm_n=locus_read_unq_n=0;
  while(nexttext(a,tmpfid)!=0){
    pp=strsplit(a,"\t",&i);
    strcpy(read,pp[3]);
    read_strand=pp[4][0];
    pp=strsplit(read,":",&i);
    read_fq=atoi(pp[1]);
    read_locus=atoi(pp[2]);
    if(read_strand==strand){
      locus_read_tot_p+=read_fq;
      locus_read_nrm_p+=((double)read_fq/read_locus);
      locus_read_unq_p+=(read_locus==1)?(read_fq):(0);
    }
    else{
      locus_read_tot_n+=read_fq;
      locus_read_nrm_n+=((double)read_fq/read_locus);
      locus_read_unq_n+=(read_locus==1)?(read_fq):(0);
    }
  }pclose(tmpfid);
/*
goto only_locus;
*/
/* printf("%s:%ld-%ld %s  %d %.2lf %d  %d %.2lf %d     ",chr,locus_start,locus_stop,gene_name,locus_read_tot_p,locus_read_nrm_p,locus_read_unq_p,locus_read_tot_n,locus_read_nrm_n,locus_read_unq_n);fflush(stdout); */

/********************/
/* obtain utr5_read */
/********************/
  sprintf(a,"z1-%d.bed",Pid);tmpfid=fopen(a,"w");
  for(i=x;i<y;i++){
    pp=strsplit(entry[i].bk_starts,",",&k);
    for(j=0;j<entry[i].n_bk;j++)bk_str[j]=entry[i].gene_start+atoi(pp[j]);
    pp=strsplit(entry[i].bk_length,",",&k);
    for(j=0;j<entry[i].n_bk;j++)bk_stp[j]=bk_str[j]+atoi(pp[j]);
    for(j=0;j<entry[i].n_bk;j++){
      if(strand=='+'){
        if(j==0){bk_str[j]-=Extend_window;if(bk_str[j]<0)bk_str[j]=0;if(bk_str[j]>locus_start)bk_str[j]=locus_start;}
        if(bk_stp[j]>entry[i].cds_start)bk_stp[j]=entry[i].cds_start;
        if(bk_str[j]<bk_stp[j])fprintf(tmpfid,"%s\t%ld\t%ld\t%c\n",chr,bk_str[j],bk_stp[j],strand);
      }
      else{
        if(j==entry[i].n_bk-1){bk_stp[j]+=Extend_window;if(bk_stp[j]<locus_stop)bk_stp[j]=locus_stop;}
        if(bk_str[j]<entry[i].cds_stop)bk_str[j]=entry[i].cds_stop;
        if(bk_str[j]<bk_stp[j])fprintf(tmpfid,"%s\t%ld\t%ld\t%c\n",chr,bk_str[j],bk_stp[j],strand);
      }
    }
  }fclose(tmpfid);
  sprintf(a,"sort -u z1-%d.bed > z2-%d.bed; /nlmusr/gchirn/linux/TOOLS/bin/intersectBed -a z0-%d-sub -b z2-%d.bed -wa | sort -u",Pid,Pid,Pid,Pid);
  tmpfid=popen(a,"r");
  utr5_read_tot_p=utr5_read_nrm_p=utr5_read_unq_p=utr5_read_tot_n=utr5_read_nrm_n=utr5_read_unq_n=0;
  while(nexttext(a,tmpfid)!=0){
    pp=strsplit(a,"\t",&i);
    strcpy(read,pp[3]);
    read_strand=pp[4][0];
    pp=strsplit(read,":",&i);
    read_fq=atoi(pp[1]);
    read_locus=atoi(pp[2]);
    if(read_strand==strand){
      utr5_read_tot_p+=read_fq;
      utr5_read_nrm_p+=((double)read_fq/read_locus);
      utr5_read_unq_p+=(read_locus==1)?(read_fq):(0);
    }
    else{
      utr5_read_tot_n+=read_fq;
      utr5_read_nrm_n+=((double)read_fq/read_locus);
      utr5_read_unq_n+=(read_locus==1)?(read_fq):(0);
    }
  }pclose(tmpfid);
/* printf("%d %.2lf %d  %d %.2lf %d     ",utr5_read_tot_p,utr5_read_nrm_p,utr5_read_unq_p,utr5_read_tot_n,utr5_read_nrm_n,utr5_read_unq_n);fflush(stdout); */
  sprintf(a,"/nlmusr/gchirn/linux/TOOLS/bin/mergeBed -i z2-%d.bed",Pid);
  tmpfid=popen(a,"r");utr5_length=0;
  while(nexttext(a,tmpfid)!=0){
    sscanf(a,"%s %ld %ld",b,&u,&v);
    utr5_length+=(v-u);
  }pclose(tmpfid);

/********************/
/* obtain utr3_read */
/********************/
  sprintf(a,"z1-%d.bed",Pid);tmpfid=fopen(a,"w");
  for(i=x;i<y;i++){
    pp=strsplit(entry[i].bk_starts,",",&k);
    for(j=0;j<entry[i].n_bk;j++)bk_str[j]=entry[i].gene_start+atoi(pp[j]);
    pp=strsplit(entry[i].bk_length,",",&k);
    for(j=0;j<entry[i].n_bk;j++)bk_stp[j]=bk_str[j]+atoi(pp[j]);
    for(j=0;j<entry[i].n_bk;j++){
      if(strand=='+'){
        if(j==entry[i].n_bk-1){bk_stp[j]+=Extend_window;if(bk_stp[j]<locus_stop)bk_stp[j]=locus_stop;}
        if(bk_str[j]<entry[i].cds_stop)bk_str[j]=entry[i].cds_stop;
        if(bk_str[j]<bk_stp[j])fprintf(tmpfid,"%s\t%ld\t%ld\t%c\n",chr,bk_str[j],bk_stp[j],strand);
      }
      else{
        if(j==0){bk_str[j]-=Extend_window;if(bk_str[j]<0)bk_str[j]=0;if(bk_str[j]>locus_start)bk_str[j]=locus_start;}
        if(bk_stp[j]>entry[i].cds_start)bk_stp[j]=entry[i].cds_start;
        if(bk_str[j]<bk_stp[j])fprintf(tmpfid,"%s\t%ld\t%ld\t%c\n",chr,bk_str[j],bk_stp[j],strand);
      }
    }
  }fclose(tmpfid);
  sprintf(a,"sort -u z1-%d.bed > z2-%d.bed; /nlmusr/gchirn/linux/TOOLS/bin/intersectBed -a z0-%d-sub -b z2-%d.bed -wa | sort -u",Pid,Pid,Pid,Pid);
/*
printf("sort -u z1-%d.bed > z2-%d.bed; /nlmusr/gchirn/linux/TOOLS/bin/intersectBed -a z0-%d-sub -b z2-%d.bed -wa | sort -u\n",Pid,Pid,Pid,Pid);fflush(stdout);
*/
  tmpfid=popen(a,"r");
  utr3_read_tot_p=utr3_read_nrm_p=utr3_read_unq_p=utr3_read_tot_n=utr3_read_nrm_n=utr3_read_unq_n=0;
  while(nexttext(a,tmpfid)!=0){
    pp=strsplit(a,"\t",&i);
    strcpy(read,pp[3]);
    read_strand=pp[4][0];
    pp=strsplit(read,":",&i);
    read_fq=atoi(pp[1]);
    read_locus=atoi(pp[2]);
    if(read_strand==strand){
      utr3_read_tot_p+=read_fq;
      utr3_read_nrm_p+=((double)read_fq/read_locus);
      utr3_read_unq_p+=(read_locus==1)?(read_fq):(0);
    }
    else{
      utr3_read_tot_n+=read_fq;
      utr3_read_nrm_n+=((double)read_fq/read_locus);
      utr3_read_unq_n+=(read_locus==1)?(read_fq):(0);
    }
  }pclose(tmpfid);
/* printf("%d %.2lf %d  %d %.2lf %d     ",utr3_read_tot_p,utr3_read_nrm_p,utr3_read_unq_p,utr3_read_tot_n,utr3_read_nrm_n,utr3_read_unq_n);fflush(stdout); */
  sprintf(a,"/nlmusr/gchirn/linux/TOOLS/bin/mergeBed -i z2-%d.bed",Pid);
  tmpfid=popen(a,"r");utr3_length=0;
  while(nexttext(a,tmpfid)!=0){
    sscanf(a,"%s %ld %ld",b,&u,&v);
    utr3_length+=(v-u);
  }pclose(tmpfid);

/********************/
/* obtain cds_read */
/********************/
  sprintf(a,"z1-%d.bed",Pid);
  tmpfid=fopen(a,"w");
  for(i=x;i<y;i++){
    pp=strsplit(entry[i].bk_starts,",",&k);
    for(j=0;j<entry[i].n_bk;j++){
      bk_str[j]=entry[i].gene_start+atoi(pp[j]);
    }
    pp=strsplit(entry[i].bk_length,",",&k);
    for(j=0;j<entry[i].n_bk;j++){
      bk_stp[j]=bk_str[j]+atoi(pp[j]);
      if(bk_str[j]<entry[i].cds_start)bk_str[j]=entry[i].cds_start;
      if(bk_stp[j]>entry[i].cds_stop)bk_stp[j]=entry[i].cds_stop;
    }
    for(j=0;j<entry[i].n_bk;j++){
      if(bk_str[j]<bk_stp[j])fprintf(tmpfid,"%s\t%ld\t%ld\t%c\n",chr,bk_str[j],bk_stp[j],strand);
    }
  }fclose(tmpfid);
  sprintf(a,"sort -u z1-%d.bed > z2-%d.bed; /nlmusr/gchirn/linux/TOOLS/bin/intersectBed -a z0-%d-sub -b z2-%d.bed -wa | sort -u",Pid,Pid,Pid,Pid);
  tmpfid=popen(a,"r");
  cds_read_tot_p=cds_read_nrm_p=cds_read_unq_p=cds_read_tot_n=cds_read_nrm_n=cds_read_unq_n=0;
  while(nexttext(a,tmpfid)!=0){
    pp=strsplit(a,"\t",&i);
    strcpy(read,pp[3]);
    read_strand=pp[4][0];
    pp=strsplit(read,":",&i);
    read_fq=atoi(pp[1]);
    read_locus=atoi(pp[2]);
    if(read_strand==strand){
      cds_read_tot_p+=read_fq;
      cds_read_nrm_p+=((double)read_fq/read_locus);
      cds_read_unq_p+=(read_locus==1)?(read_fq):(0);
    }
    else{
      cds_read_tot_n+=read_fq;
      cds_read_nrm_n+=((double)read_fq/read_locus);
      cds_read_unq_n+=(read_locus==1)?(read_fq):(0);
    }
  }pclose(tmpfid);
/* printf("%d %.2lf %d  %d %.2lf %d\n",cds_read_tot_p,cds_read_nrm_p,cds_read_unq_p,cds_read_tot_n,cds_read_nrm_n,cds_read_unq_n);fflush(stdout); */

  sprintf(a,"/nlmusr/gchirn/linux/TOOLS/bin/mergeBed -i z2-%d.bed",Pid);
  tmpfid=popen(a,"r");cds_length=0;
  while(nexttext(a,tmpfid)!=0){
    sscanf(a,"%s %ld %ld",b,&u,&v);
    cds_length+=(v-u);
  }pclose(tmpfid);
/*
printf("====> %s\t%ld\n",gene_name,cds_length);fflush(stdout);
*/

only_locus:



fprintf(Out,"%s\t",gene_name);
fprintf(Out,"%s:%ld-%ld\t",chr,locus_start,locus_stop);
fprintf(Out,"%s\t",chr);
fprintf(Out,"%c\t",strand);
fprintf(Out,"%ld\t",entry[x].start);
fprintf(Out,"%ld\t",entry[x].stop);
fprintf(Out,"%ld\t",locus_start);
fprintf(Out,"%ld\t",locus_stop);
fprintf(Out,"%ld\t",entry[x].stop-entry[x].start);
fprintf(Out,"%ld\t",utr5_length);
fprintf(Out,"%ld\t",cds_length);
fprintf(Out,"%ld\t",utr3_length);
fprintf(Out,"%d\t",utr5_read_unq_p+utr5_read_unq_n);
fprintf(Out,"%d\t",cds_read_unq_p+cds_read_unq_n);
fprintf(Out,"%d\t",utr3_read_unq_p+utr3_read_unq_n);
fprintf(Out,"%d\t",locus_read_unq_p+locus_read_unq_n);
fprintf(Out,"%d\t",utr5_read_tot_p+utr5_read_tot_n);
fprintf(Out,"%d\t",cds_read_tot_p+cds_read_tot_n);
fprintf(Out,"%d\t",utr3_read_tot_p+utr3_read_tot_n);
fprintf(Out,"%d\t",locus_read_tot_p+locus_read_tot_n);
fprintf(Out,"%.3lf\t",(utr5_read_nrm_p+utr5_read_nrm_n)*Rpm);
fprintf(Out,"%.3lf\t",(cds_read_nrm_p+cds_read_nrm_n)*Rpm);
fprintf(Out,"%.3lf\t",(utr3_read_nrm_p+utr3_read_nrm_n)*Rpm);
fprintf(Out,"%.3lf\t",(locus_read_nrm_p+locus_read_nrm_n)*Rpm);fflush(Out);

fprintf(Out,"%d\t",utr5_read_unq_p);
fprintf(Out,"%d\t",cds_read_unq_p);
fprintf(Out,"%d\t",utr3_read_unq_p);
fprintf(Out,"%d\t",locus_read_unq_p);
fprintf(Out,"%d\t",utr5_read_tot_p);
fprintf(Out,"%d\t",cds_read_tot_p);
fprintf(Out,"%d\t",utr3_read_tot_p);
fprintf(Out,"%d\t",locus_read_tot_p);
fprintf(Out,"%.3lf\t",(utr5_read_nrm_p)*Rpm);
fprintf(Out,"%.3lf\t",(cds_read_nrm_p)*Rpm);
fprintf(Out,"%.3lf\t",(utr3_read_nrm_p)*Rpm);
fprintf(Out,"%.3lf\t",(locus_read_nrm_p)*Rpm);fflush(Out);

fprintf(Out,"%d\t",utr5_read_unq_n);
fprintf(Out,"%d\t",cds_read_unq_n);
fprintf(Out,"%d\t",utr3_read_unq_n);
fprintf(Out,"%d\t",locus_read_unq_n);
fprintf(Out,"%d\t",utr5_read_tot_n);
fprintf(Out,"%d\t",cds_read_tot_n);
fprintf(Out,"%d\t",utr3_read_tot_n);
fprintf(Out,"%d\t",locus_read_tot_n);
fprintf(Out,"%.3lf\t",(utr5_read_nrm_n)*Rpm);
fprintf(Out,"%.3lf\t",(cds_read_nrm_n)*Rpm);
fprintf(Out,"%.3lf\t",(utr3_read_nrm_n)*Rpm);
fprintf(Out,"%.3lf\n",(locus_read_nrm_n)*Rpm);fflush(Out);

/*
chrII   1866    4663    2L52.1  0       +       1866    4663    0       7       45,189,151,106,147,183,463,     0,639,871,1064,1539,1935,2334,

chrII   15271338        15271360        GSM297742|GTAGATCTACAAAAAAATGCGG:1:19   +       chrII   15267613        15273716        +
chrII   15271364        15271381        GSM297742|GAAATCAGTTGAGAACT:1:270       -       chrII   15267613        15273716        +
chrII   15271399        15271415        GSM297742|AGAAAAATGTGACGTC:1:113        -       chrII   15267613        15273716        +

typedef struct entry_node{
  long  start;
  long  stop;
  char  chr[1000];
  long  gene_start;
  long  gene_stop;
  char  gene_name[1000];
  char  strand;
  long  cds_start;
  long  cds_stop;
  int   n_bk;
  char  bk_length[5000];
  char  bk_starts[5000];
}ENTRY_NODE;
*/
}


static int entry_cmp(const void *x,const void *y)
{
  return(((ENTRY_NODE *)y)->start-((ENTRY_NODE *)x)->start);
}



/*
 * Print the command-line summary to stdout and exit with status 2.
 * The synopsis lists the options main() actually parses.
 */
void usage(void)
{  printf("ngs_genecentric -r refseq_file -v coverage_file -m mapping_file [-e window] [-n reads] [-l] [-o output_file]\n");
   printf("  -r: refseq file in bed format. (eg. fly_refseq.bed)\n");
   printf("  -v: coverage file. (eg. s_2678.uq.nomirna-v2-coverage)\n");
   printf("  -m: mapping file in bed format. (eg. s_2678.uq.nomirna-v2.bed)\n");
   printf("  -e: extension window for 5' & 3' UTR. default: 500(bp)\n");
   printf("  -n: number of reads N; normalized counts are multiplied by 1000000/N (reads per million). default: factor 1\n");
   printf("  -l: only count reads within the clusters. default: off (count reads in whole locus)\n");
   printf("  -o: output file. default: stdout\n");
   printf("  -h: print this help\n");
   exit(2);
}
