#include <string.h>
#include <stdio.h>
#include <stdlib.h>
#include <math.h>

#include "mconf.h"
#include "protos.h"
#include "chin_stat.h"

static int data_cmp(const void *x,const void *y);

/*
 * Arithmetic mean of the first n entries of x.
 *
 * Returns HUGE as an error sentinel when n <= 0.
 */
double mean(double *x,long n)
{
  double total = 0;
  const double *cur;
  const double *stop;

  if (n <= 0) {
    return HUGE;
  }
  stop = x + n;
  for (cur = x; cur != stop; cur++) {
    total += *cur;
  }
  return total / n;
}

/*
 * Smallest of the first n entries of m.
 *
 * Returns HUGE as an error sentinel when n <= 0.
 */
double minima(double *m, long n)
{
  double lowest;
  long idx;

  if (n <= 0) {
    return HUGE;
  }
  lowest = m[0];
  for (idx = 1; idx < n; idx++) {
    lowest = (m[idx] < lowest) ? m[idx] : lowest;
  }
  return lowest;
}

/*
 * Largest of the first n entries of m.
 *
 * Returns -HUGE as an error sentinel when n <= 0.
 */
double maxima(double *m, long n)
{
  double highest;
  long idx;

  if (n <= 0) {
    return -(HUGE);
  }
  highest = m[0];
  for (idx = 1; idx < n; idx++) {
    highest = (m[idx] > highest) ? m[idx] : highest;
  }
  return highest;
}

/*
 * Median of the first n entries of x.
 *
 * The input is left untouched: the values are copied into a scratch
 * buffer, sorted with data_cmp, and the middle element is returned (the
 * average of the two middle elements when n is even).
 *
 * Returns HUGE as an error sentinel when n <= 0 or when the scratch
 * buffer cannot be allocated.
 */
double median(double *x,long n)
{
  double *sorted;
  double result;
  long mid;

  if (n <= 0) {
    return HUGE;
  }
  /* refuse sizes whose byte count would wrap around size_t */
  if ((size_t)n > (size_t)-1 / sizeof *sorted) {
    return HUGE;
  }
  sorted = malloc((size_t)n * sizeof *sorted);
  if (sorted == NULL) {
    return HUGE;
  }
  memcpy(sorted, x, (size_t)n * sizeof *sorted);
  qsort(sorted, (size_t)n, sizeof *sorted, data_cmp);

  mid = n / 2;
  if (n % 2 == 0) {
    result = (sorted[mid] + sorted[mid - 1]) / 2;
  } else {
    result = sorted[mid];
  }
  free(sorted);
  return result;
}

/*
 * Five-number-summary selector for the first n entries of x.
 *
 *   flag == 0  minimum
 *   flag == 1  lower quartile: median of the lower half of the sorted data
 *   flag == 2  median
 *   flag == 3  upper quartile: median of the upper half of the sorted data
 *   flag == 4  maximum
 *
 * For odd n the middle element belongs to neither half.  With a single
 * observation both halves are empty, so that observation is returned for
 * flags 1 and 3.
 *
 * Returns HUGE when n <= 0, when flag is not one of 0..4, or when the
 * scratch buffer cannot be allocated.  The input array is not modified.
 */
double quartile(double *x,long n,int flag)
{
  double *sorted;
  double result;
  long half, k;

  if (n <= 0) {
    return HUGE;
  }
  switch (flag) {
  case 0:
    return minima(x, n);
  case 2:
    return median(x, n);
  case 4:
    return maxima(x, n);
  case 1:
  case 3:
    break;
  default:
    /* unknown selector: reject before doing any allocation or sorting */
    return HUGE;
  }

  /* one observation: there is no lower/upper half to take a median of */
  if (n == 1) {
    return x[0];
  }
  /* refuse sizes whose byte count would wrap around size_t */
  if ((size_t)n > (size_t)-1 / sizeof *sorted) {
    return HUGE;
  }
  sorted = malloc((size_t)n * sizeof *sorted);
  if (sorted == NULL) {
    return HUGE;
  }
  memcpy(sorted, x, (size_t)n * sizeof *sorted);
  qsort(sorted, (size_t)n, sizeof *sorted, data_cmp);

  half = n / 2;   /* size of each half                        */
  k = half / 2;   /* offset of the half's median inside it    */
  if (flag == 1) {
    /* n= 8: half=4, k=2 -> (y[1]+y[2])/2 ;  n=10: half=5, k=2 -> y[2] */
    result = (half % 2 == 0) ? (sorted[k] + sorted[k - 1]) / 2
                             : sorted[k];
  } else {
    /* n= 8: half=4, k=2 -> (y[5]+y[6])/2 ;  n=10: half=5, k=2 -> y[7] */
    result = (half % 2 == 0) ? (sorted[n - k] + sorted[n - k - 1]) / 2
                             : sorted[n - k - 1];
  }
  free(sorted);
  return result;
}


/*****************************************************************/
/* downloaded from http://ndevilla.free.fr/median/               */
/* The following code is in public domain.                       */
/* Algorithm by Torben Mogensen, implementation by N. Devillard. */
/*   return "lower median" when n is even (not average)          */
/*****************************************************************/
/*
double median(double *m, long n)
{
  long    i, less, greater, equal;
  double  min, max, guess, maxltguess, mingtguess;

  if(n<=0)return(HUGE);
  min = max = m[0] ;
  for(i=1;i<n;i++){if(m[i]<min)min=m[i];if(m[i]>max)max=m[i];}

  while(1){
    guess=(min+max)/2;
    less=greater=equal=0;
    maxltguess = min ;
    mingtguess = max ;
    for (i=0; i<n; i++) {
      if(m[i]<guess) {
        less++;
        if (m[i]>maxltguess) maxltguess = m[i] ;
      } else if (m[i]>guess) {
        greater++;
        if (m[i]<mingtguess) mingtguess = m[i] ;
      } else equal++;
    }
    if (less <= (n+1)/2 && greater <= (n+1)/2) break ;
    else if (less>greater) max = maxltguess ;
    else min = mingtguess;
  }
  if (less >= (n+1)/2) return(maxltguess);
  else if (less+equal >= (n+1)/2) return(guess);
  else return(mingtguess);
}
*/

/*
 * Sample variance (divisor n-1) of the first n entries of x.
 *
 * Returns HUGE when n < 1 and 0 for a single observation.
 */
double variance(double *x, long n)
{
  double center, dev;
  double squares = 0;
  long k;

  if (n < 1) {
    return HUGE;
  }
  if (n == 1) {
    return 0;
  }
  center = mean(x, n);
  for (k = 0; k < n; k++) {
    dev = x[k] - center;
    squares += dev * dev;
  }
  return squares / (n - 1);
}

/*
 * Sample standard deviation: the square root of variance(x, n).
 * Whatever variance() reports (including its HUGE sentinel) is passed
 * through sqrt() unchanged.
 */
double standard_deviation(double *x, long n)
{
  double var = variance(x, n);

  return sqrt(var);
}

/***************************************/
/*                                     */
/*          3 * ( SD(pos) + SD(neg) )  */
/* Z = 1 - --------------------------- */
/*                  ___   ___          */
/*                | pos - neg |        */
/***************************************/
/*
 * Z-factor of a positive and a negative control sample (formula above).
 *
 * Returns -999 when either sample is empty or when the two sample means
 * are not strictly ordered (equal means would make the denominator zero).
 */
double z_factor(long n_pos, double *pos, long n_neg, double *neg)
{
  double avg_pos, avg_neg, gap, spread;

  if (n_pos <= 0 || n_neg <= 0) {
    return -999.0;
  }
  avg_pos = mean(pos, n_pos);
  avg_neg = mean(neg, n_neg);

  /* gap = |avg_pos - avg_neg|, rejecting the degenerate equal-means case */
  if (avg_pos > avg_neg) {
    gap = avg_pos - avg_neg;
  } else if (avg_pos < avg_neg) {
    gap = avg_neg - avg_pos;
  } else {
    return -999.0;
  }

  spread = standard_deviation(pos, n_pos) + standard_deviation(neg, n_neg);
  return 1 - 3 * spread / gap;
}

/***************/
/*         _   */
/*    Xi - X   */
/* Z=--------- */
/*      SD(X)  */
/***************/
/*
 * Z-score of the value y relative to the sample x[0..n-1]:
 * (y - mean) / standard deviation.  Returns 0 when the standard
 * deviation is exactly 0.
 */
double z_score(double *x, long n, double y)
{
  double center = mean(x, n);
  double spread = standard_deviation(x, n);

  if (spread == 0) {
    return 0;
  }
  return (y - center) / spread;
}

/*
 * Writes the z-score of every x[i] (relative to the whole sample x) into
 * y[i], for i in 0..n-1.  All outputs are 0 when the standard deviation
 * is exactly 0.
 */
void z_score_set(double *x, long n, double *y)
{
  double center = mean(x, n);
  double spread = standard_deviation(x, n);
  long k;

  for (k = 0; k < n; k++) {
    if (spread == 0) {
      y[k] = 0;
    } else {
      y[k] = (x[k] - center) / spread;
    }
  }
}

/***********************************************************/
/*                    from CEPHES                          */
/* Returns the area under the Gaussian probability density */
/* function, integrated from minus infinity to x:          */
/*                                                         */
/*                            x                            */
/*                             -                           */
/*                   1        | |          2               */
/*    ndtr(x)  = ---------    |    exp( - t /2 ) dt        */
/*               sqrt(2pi)  | |                            */
/*                           -                             */
/*                          -inf.                          */
/*                                                         */
/*             =  ( 1 + erf(z) ) / 2                       */
/*             =  erfc(-z) / 2                             */
/*    where z = x / sqrt(2)                                */
/*                                                         */
/*       http://www.danielsoper.com/statcalc/              */
/***********************************************************/
/* Cumulative p-value of a z-score: the value ndtr() returns for x. */
double z_score_to_p_value(double x)  /* Cumulative */
{
  double cumulative = ndtr(x);

  return cumulative;
}

/*
 * One-tailed p-value of a z-score: the smaller of ndtr(x) and
 * 1 - ndtr(x), i.e. the tail on the side x falls on.
 */
double z_score_to_p_value_one_tailed(double x)  /* One-tailed */
{
  double lower = ndtr(x);

  return (lower <= 0.5) ? lower : 1 - lower;
}

/***********************************************/
/*  mad(X) = 1.4826 * median( |Xi-median(X)| ) */
/***********************************************/
/*
 * Scaled median absolute deviation of the first n entries of x:
 *   1.4826 * median( |x[i] - median(x)| )
 *
 * The input array is not modified; the absolute deviations are built in
 * a scratch buffer that is released before returning.
 *
 * For n <= 0 the historical result 1.4826 * HUGE is returned (median()
 * reports HUGE for empty input).  HUGE is returned when the scratch
 * buffer cannot be allocated.
 */
double mad(double *x, long n)
{
  double *dev;
  double center, result;
  long k;

  if (n <= 0) {
    /* nothing to allocate; keep the value earlier versions produced */
    return 1.4826 * HUGE;
  }
  /* refuse sizes whose byte count would wrap around size_t */
  if ((size_t)n > (size_t)-1 / sizeof *dev) {
    return HUGE;
  }
  dev = malloc((size_t)n * sizeof *dev);
  if (dev == NULL) {
    return HUGE;
  }

  center = median(x, n);
  for (k = 0; k < n; k++) {
    dev[k] = fabs(x[k] - center);
  }
  result = 1.4826 * median(dev, n);
  free(dev);
  return result;
}

/***********************/
/*     Xi - median(X)  */
/* NZ=---------------- */
/*          mad(X)     */
/***********************/
/*
 * Robust z-score of the value y relative to the sample x[0..n-1]:
 * (y - median) / mad.  Returns 0 when mad(x, n) is exactly 0.
 */
double nz_score(double *x, long n, double y)
{
  double center = median(x, n);
  double spread = mad(x, n);

  if (spread == 0) {
    return 0;
  }
  return (y - center) / spread;
}

/*
 * Writes the robust z-score of every x[i] (relative to the whole sample
 * x) into y[i], for i in 0..n-1.  All outputs are 0 when mad(x, n) is
 * exactly 0.
 */
void nz_score_set(double *x, long n, double *y)
{
  double center = median(x, n);
  double spread = mad(x, n);
  long k;

  for (k = 0; k < n; k++) {
    if (spread == 0) {
      y[k] = 0;
    } else {
      y[k] = (x[k] - center) / spread;
    }
  }
}

/********************
http://www.graphpad.com/www/book/Choose.htm
******************************/



/* VOID july 5, 2007, merged into t_test
########
Calculate the Mean values (M1, M2) and standard deviations (SD1 and SD2) of both samples.
SDg=sqrt(((n1-1)*SD1^2+(n2-1)*SD2^2)/(n1+n2-2))
t=(M1-M2)/(SDg*sqrt(1/n1+1/n2)).
The number of Degrees of Freedom = n1 + n2 - 2.
(2-sample assuming equal variances)
(homoscedastic t-test)
http://projectile.is.cs.cmu.edu/research/public/talks/t-test.htm
#########

double t_test_1(double *x,double *y,long m,long n)
{double xmean,ymean;
 double xvar,yvar;
 double sdg;
  if(m+n-2==0||m==0||n==0)return(0);
  xmean=mean(x,m);ymean=mean(y,n);
  xvar=variance(x,m);yvar=variance(y,n);
  sdg=sqrt(((m-1)*xvar+(n-1)*yvar)/(m+n-2));if(sdg==0)return(0);
  return((xmean-ymean)/(sdg*sqrt((double)1/m+(double)1/n)));
}

double t_1_p_value(double *x,double *y,long m,long n)
{double t_value;
  t_value=t_test_1(x,y,m,n);
  return(p_value_2_tailed(t_value,m+n-2));
}

double t_test_2(double *x,double *y,long m,long n)
## same as t_test_1 ##
{double xmean,ymean;
 double s2p,est;
  if(m+n-2==0||m==0||n==0)return(0);
  xmean=mean(x,m);ymean=mean(y,n);
  s2p=(variance(x,m)*(m-1)+variance(y,n)*(n-1))/(m+n-2);
  est=sqrt(s2p/m+s2p/n);if(est==0)return(0);
  return((xmean-ymean)/est);
}
*/


/********************************************************************************************************/
/* student t-test (http://projectile.is.cs.cmu.edu/research/public/talks/t-test.htm)                    */
/*                                                                                                      */
/*  type==0, homoscedastic t-test, (2-sample assuming equal variances)                                  */
/*           Two samples are referred to as *independent* if the observations in one sample are not in  */
/*           any way related to the observations in the other. This is also used in cases where one     */
/*           randomly assign subjects to two groups, give first group treatment A and the second group  */
/*           treatment B and compare the two groups. 	                                                */
/*                                _   _                                                                 */
/*                                X - Y                                                                 */
/*        t = -----------------------------------------------                                           */
/*               ---------------------------     -----------                                            */
/*              / (m-1)var(X) + (n-1)var(Y)     /  1     1                                              */
/*             / --------------------------    /  --- + ---                                             */
/*           \/           m + n - 2          \/    m     n                                              */
/*                                                                                                      */
/*            degree of freedom = m + n - 2                                                             */
/*                                                                                                      */
/*  type==1, heteroscedastic t-test, (2-sample assuming unequal variances), Welch's correction.         */
/*           The variance in the two groups are *extremely different*. e.g. the two samples are of very */
/*           different sizes.                                                                           */
/*                      _   _                                                                           */
/*                      X - Y                                                                           */
/*        t = ------------------------                                                                  */
/*                -----------------                                                                     */
/*               / var(X)   var(Y)                                                                      */
/*              /  ------ + ------                                                                      */
/*            \/     m        n                                                                         */
/*                                                                                                      */
/*            degree of freedom =round{ (var(X)/m+var(Y)/n)^2/[(var(X)/m)^2/(m-1)+(var(Y)/n)^2/(n-1)] } */
/********************************************************************************************************/
/*
 * Two-sample Student t statistic for x[0..m-1] versus y[0..n-1].
 *
 *   type == 0  pooled-variance (homoscedastic) statistic
 *   type == 1  Welch (heteroscedastic) statistic
 *
 * Returns 0 when no statistic can be formed: a sample size that is not
 * positive (for either type), m == n == 1 for type 0, a zero pooled
 * standard deviation (type 0), both variances zero (type 1), or an
 * unknown type.
 */
double t_test(double *x,double *y,long m,long n,int type)
{
  double xvar, yvar;
  double pooled_sd;

  /* both variants need at least one observation per sample */
  if (m <= 0 || n <= 0) {
    return 0;
  }

  if (type == 0) {
    if (m + n - 2 == 0) {
      return 0;   /* m == n == 1: zero degrees of freedom */
    }
    xvar = variance(x, m);
    yvar = variance(y, n);
    pooled_sd = sqrt(((m-1)*xvar+(n-1)*yvar)/(m+n-2));
    if (pooled_sd == 0) {
      return 0;
    }
    return((mean(x,m)-mean(y,n))/(pooled_sd*sqrt((double)1/m+(double)1/n)));
  }

  if (type == 1) {
    xvar = variance(x, m);
    yvar = variance(y, n);
    if (xvar == 0 && yvar == 0) {
      return 0;
    }
    return((mean(x,m)-mean(y,n))/sqrt(xvar/m+yvar/n));
  }

  return 0;
}

/************************************************************/
/* the p_value of the student t-test, 2-tailed              */
/* (The probability that X and Y are different by chance)   */
/************************************************************/
/*
 * Two-tailed p-value of the unpaired t-test selected by type (see t_test).
 *
 * Degrees of freedom: m + n - 2 for type 0; for type 1 the Welch value
 * rounded to the nearest integer.  Returns 1 when either sample has fewer
 * than two observations, when type is neither 0 nor 1, or (type 1) when
 * both variances are zero.
 */
double t_p_value(double *x,double *y,long m,long n,int type)
{
  double t, a1, a2;
  long dof;

  if (m <= 1 || n <= 1) {
    return 1;
  }
  if (type != 0 && type != 1) {
    return 1;
  }

  t = t_test(x, y, m, n, type);
  if (type == 0) {
    return p_value_2_tailed(t, m + n - 2);
  }

  /* Welch's correction */
  a1 = variance(x, m) / m;
  a2 = variance(y, n) / n;
  if (a1 == 0 && a2 == 0) {
    return 1;
  }
  dof = (long)((a1+a2)*(a1+a2)/(a1*a1/(m-1)+a2*a2/(n-1))+0.5);
  return p_value_2_tailed(t, dof);
}

/****************************************/
/*       paired student t-test          */
/*                 _                    */
/*      Xi = (Xi - X)                   */
/*      ^          _                    */
/*      Yi = (Yi - Y)                   */
/*      ^                               */
/*                 ---------------      */
/*       _   _    /     n(n-1)          */
/*  t = (X - Y)  /----------------      */
/*              / ___ n                 */
/*             /  \     (Xi-Yi)^2       */
/*           \/   /__i=1 ^  ^           */
/*                                      */
/*  degree of freedom = n - 1           */
/****************************************/
/*
 * Paired Student t statistic for the n pairs (x[i], y[i]); see the
 * formula above.
 *
 * Returns 0 when n < 2 or when the centred differences sum to exactly
 * zero, i.e. when no statistic can be formed.
 */
double paired_t_test(double *x,double *y,long n)
{
  double avgx, avgy, d;
  double squares = 0;
  long i;

  if (n < 2) {
    return 0;
  }
  avgx = mean(x, n);
  avgy = mean(y, n);
  for (i = 0; i < n; i++) {
    d = (x[i] - avgx) - (y[i] - avgy);
    squares += d * d;
  }
  if (squares == 0) {
    return 0;
  }
  /* form n(n-1) in double: the long product overflows for n > 46341
     on platforms where long is 32 bits wide */
  return (avgx - avgy) * sqrt((double)n * (double)(n - 1) / squares);
}

/************************************************************/
/* the p_value of the paired student t-test, 2-tailed       */
/* (The probability that X and Y are different by chance)   */
/************************************************************/
/*
 * Two-tailed p-value of the paired t-test on the n pairs (x[i], y[i]),
 * evaluated with n - 1 degrees of freedom.
 *
 * Returns 1 when n <= 1: there are no degrees of freedom, and the
 * t-distribution routine is never called with a non-positive count.
 */
double paired_t_p_value(double *x,double *y,long n)
{
  double t_value;

  if (n <= 1) {
    return 1;
  }
  t_value = paired_t_test(x, y, n);
  return p_value_2_tailed(t_value, n - 1);
}

/************************************************************/
/* the p_value of the Welch's unpaired t-test (VOID)        */
/*                                                          */
/*         Vari                                             */
/*   Ai = ------                                            */
/*          ni                                              */
/*                              (A1+A2)^2                   */
/* degree of freedom = floor( -------------- )              */
/*                             A1^2   A2^2                  */
/*                             ---- + ----                  */
/*                             n1-1   n2-1                   */
/* degree of freedom =round{ (var(X)/m+var(Y)/n)^2/[(var(X)/m)^2/(m-1)+(var(Y)/n)^2/(n-1)] } */
/************************************************************/
/* VOID july 5, 2007, merged into t_test (type 1).
double welch_t_p_value(double *x,double *y,long m,long n)
{double t_value,a1,a2;
 long freedom;

  if(m<=1||n<=1)return(1);
  t_value=t_test(x,y,m,n);a1=variance(x,m)/m;a2=variance(y,n)/n;
  if(a1==0&&a2==0)return(1);
  freedom=(a1+a2)*(a1+a2)/(a1*a1/(m-1)+a2*a2/(n-1));
  return(p_value_2_tailed(t_value,freedom));
}
*/

/************************************************************/
/*                        correlation                       */
/*                      __       __  __                     */
/*                      \        \   \                      */
/*                     n/_XiYi - /_Xi/_Yi                   */
/*  r = --------------------------------------------------- */
/*        _________________________________________________ */
/*       /   __         __            __         __         */
/*      /    \          \             \          \          */
/*    \/  [ n/_ Xi^2 - (/_ Xi)^2 ] [ n/_ Yi^2 - (/_ Yi)^2 ] */
/*                                                          */
/*  degree of freedom = n - 2                               */
/************************************************************/
double correlation(double *x,double *y,long n)
{long i;
 double a,b,xi,yi,sum_xy,sum_x,sum_y,sum_x2,sum_y2;
 if(n<=0)return(HUGE);
 sum_xy=sum_x=sum_y=sum_x2=sum_y2=0;
 for(i=0;i<n;i++){
   xi=(*(x+i));yi=(*(y+i));
   sum_xy+=xi*yi;
   sum_x+=xi;
   sum_y+=yi;
   sum_x2+=xi*xi;
   sum_y2+=yi*yi;
 }
 a=n*sum_x2-sum_x*sum_x;
 b=n*sum_y2-sum_y*sum_y;
 if(a==0||b==0)return(HUGE);
 return((n*sum_xy-sum_x*sum_y)/sqrt(a*b));
}

/*************************************************************/
/* the p_value of the correlation, 2-tailed                  */
/* (The probability that X and Y are correlated by chance)   */
/*************************************************************/
/*
 * Two-tailed p-value of the Pearson correlation of the n pairs
 * (x[i], y[i]), using t = r * sqrt((n-2)/(1-r^2)) with n - 2 degrees of
 * freedom.
 *
 * Returns 1 when nothing can be concluded: fewer than three pairs (no
 * degrees of freedom; two points always lie on a line) or correlation()
 * reporting its HUGE error sentinel.  Returns 0 for a perfect correlation
 * of either sign, where the t statistic would be infinite.
 */
double correlation_p_value(double *x,double *y,long n)
{
  double r, t;

  if (n < 3) {
    return 1;
  }
  r = correlation(x, y, n);
  if (r == HUGE) {
    return 1;
  }
  /* |r| == 1 would divide by zero below; >= also absorbs rounding excess */
  if (r >= 1 || r <= -1) {
    return 0;
  }
  t = r * sqrt((n - 2) / (1 - r * r));
  return p_value_2_tailed(t, n - 2);
}

/*
 * Two-tailed p-value for a t statistic with the given degrees of freedom.
 *
 * The value returned by stdtr() is clamped to [0, 1]; the result is twice
 * the tail on the side t_value falls on, and 1 when t_value is neither
 * positive nor negative.
 */
double p_value_2_tailed(double t_value,long freedom)
{
  double cdf = stdtr((int)(freedom), t_value);

  if (cdf > 1.0) {
    cdf = 1.0;
  }
  if (cdf < 0.0) {
    cdf = 0.0;
  }

  if (t_value > 0) {
    return 2.0 * (1.0 - cdf);
  }
  if (t_value < 0) {
    return 2.0 * cdf;
  }
  return 1.0;
}

/*
 * qsort() comparator ordering doubles ascending: returns 1, -1 or 0 when
 * the first value is greater than, less than, or neither greater nor
 * less than the second.
 */
static int data_cmp(const void *x,const void *y)
{
  const double lhs = *(const double *)x;
  const double rhs = *(const double *)y;

  if (lhs > rhs) {
    return 1;
  }
  return (lhs < rhs) ? -1 : 0;
}


/*********************************************************************/
/* hypergeometric distribution                                       */
/* n balls, m are red, draw k balls, probability of exactly x red is */
/* p=choose(m, x)*choose(n-m, k-x)/choose(n, k)                      */
/*********************************************************************/
double hypergeometric_prob(int n, int m, int k, int x)
{
int i;
double s=0;

  for(i=m-x+1; i<=m; i++) s+=log(i);
  for(i=n-m-k+x+1; i<=n-m; i++) s+=log(i);
  for(i=k-x+1; i<=k; i++) s+=log(i);

  for(i=2; i<=x; i++) s-=log(i);
  for(i=n-k+1; i<=n; i++) s-=log(i);

  return(exp(s));
}

/*
 * Upper-tail hypergeometric probability: with n balls of which m are red,
 * the probability that a draw of k balls contains at least x red ones.
 *
 * Sums hypergeometric_prob() from x up to min(m, k).  The starting point
 * is raised to the smallest possible outcome, max(0, k - (n - m)), so that
 * impossible outcomes are never evaluated; they contribute nothing to the
 * tail.
 */
double hypergeometric_cumulative_prob(int n, int m, int k, int x){
  int first, last, i;
  double total = 0;

  last = m < k ? m : k;

  /* fewest red balls a draw of k can contain once the non-red run out */
  first = k - (n - m);
  if (first < 0) {
    first = 0;
  }
  if (x > first) {
    first = x;
  }

  for (i = first; i <= last; i++) {
    total += hypergeometric_prob(n, m, k, i);
  }
  return total;
}



/*************************************************************************************************/
/*                                Chi-square test                                                */
/* http://graphpad.com/quickcalcs/chisquared1.cfm                                                */
/* This calculator compares observed and expected frequencies with the chi-square test.          */
/*                                                                                               */
/* Assume that an average of 10% of patients die during or immediately following a certain       */
/* risky operation. But last month 16 of 75 patients died. You want to know whether the          */
/* increase reflects a real change or whether it is just a coincidence. Statistical              */
/* calculations cannot answer that question definitively, but they can answer a related          */
/* one: If the probability of dying remained at 10%, what is the probability of observing        */
/* 16 or more deaths out of 75 patients? If the probability of dying remained at 10%, we         */
/* would expect 10% x 75 or 7.5 deaths in an average sample of 75 patients. But in a             */
/* particular sample of 75 patients, we might see more or less than the expected number.         */
/*                                                                                               */
/* Enter the data into GraphPad's calculator like this:                                          */
/*    Category  Observed #   Expected                                                            */
/*    Alive             59       67.5                                                            */
/*    Dead              16        7.5                                                            */
/*                                                                                               */
/* Check the option that you entered the expected values as actual numbers expected.             */
/* Or check the option that you are entering percentages and enter 90 and 10, rather             */
/* than 67.5 and 7.5.                                                                            */
/*                                                                                               */
/* Here are the results:                                                                         */
/*                                                                                               */
/* P value and statistical significance:                                                         */
/* Chi squared equals 10.704 with 1 degrees of freedom.                                          */
/* The two-tailed P value equals 0.0011                                                          */
/* By conventional criteria, this difference is considered to be very statistically              */
/* significant.                                                                                  */
/*                                                                                               */
/* The P value answers this question: If the theory that generated the expected values were      */
/* correct, what is the probability of observing such a large discrepancy (or larger) between    */
/* observed and expected values? A small P value is evidence that the data are not sampled       */
/* from the distribution you expected.                                                           */
/*                                                                                               */
/* How the calculations work.                                                                    */
/* The null hypothesis is that the observed data are sampled from a populations with the         */
/* expected frequencies. We need to combine together the discrepancies between the observed      */
/* and expected, and then calculate a P value answering this question: If the null hypothesis    */
/* were true, what is the chance of randomly selecting subjects with this large a discrepancy    */
/* between observed and expected counts?                                                         */
/*                                                                                               */
/* We can combine the observed and expected counts into a variable, chi-square.                  */
/* To calculate chi-square:                                                                      */
/*                                                                                               */
/*   1. For each category compute the difference between observed and expected counts.           */
/*   2. Square that difference and divide by the expected count.                                 */
/*   3. Add the values for all categories. In other words, compute the sum of (O-E)2/E.          */
/*   4. Use a table (or computer program) to calculate the P value. You need to know that        */
/*      the number of degrees of freedom equals the number of categories minus 1.                */
/*                                                                                               */
/* When there are only two categories, some statisticians recommend using the Yates' correction. */
/* Reduces the value of chi-square so increases the P value. With large sample sizes, this       */
/* correction makes little difference. With small samples, it makes more difference.             */
/* Statisticians disagree about when to use the Yates' correction, and this calculator does      */
/* not apply it.                                                                                 */
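/*     Returns: the chi-square statistic; the p value is stored in *p.                           */
/*              On invalid input (n < 2, expected[i] <= 0, observed[i] < 0, or                   */
/*              sum(observed) <= 0) it returns 0 and sets *p to 1.                               */
/*     Note:    expected[] is rescaled in place so that sum(expected) == sum(observed).          */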
/*************************************************************************************************/

/*
 * Chi-square goodness-of-fit statistic for n categories.
 *
 *   observed  observed counts, n entries
 *   expected  expected counts, n entries; rescaled IN PLACE so that their
 *             total equals the observed total
 *   p         out: 1 - chdtr(n - 1, statistic)
 *
 * Returns the statistic sum((O-E)^2/E).  On invalid input (n < 2, any
 * expected[i] <= 0, any observed[i] < 0, or an observed total <= 0) it
 * returns 0 with *p set to 1 and leaves expected[] untouched.
 */
double chi_square(long n, double *observed, double *expected, double *p)
{
  double observed_total = 0;
  double expected_total = 0;
  double scale, diff;
  double statistic = 0;
  long k;

  *p = 1;
  if (n < 2) {
    return 0.0;
  }

  /* validate the input and total both columns */
  for (k = 0; k < n; k++) {
    if (expected[k] <= 0) {
      return 0.0;
    }
    if (observed[k] < 0) {
      return 0.0;
    }
    observed_total += observed[k];
    expected_total += expected[k];
  }
  if (observed_total <= 0) {
    return 0.0;
  }

  /* adjust expected values such that sum(expected) = sum(observed) */
  scale = observed_total / expected_total;
  for (k = 0; k < n; k++) {
    expected[k] *= scale;
  }

  for (k = 0; k < n; k++) {
    diff = observed[k] - expected[k];
    statistic += diff * diff / expected[k];
  }

  /* degrees of freedom = number of categories - 1 */
  *p = 1 - chdtr((double)(n - 1), statistic);
  return statistic;
}

/*************************************************************/
/* linear regression                                         */
/*                                                           */
/*      __                                                   */
/*      >_ (xy)                                              */
/*                                                           */
/*                                                           */
/*                                                           */
/*                                                           */
/*                                                           */
/*************************************************************/
/*
double linear_regression(double *x, double *y, int n, int *m, int *b)
{
  if(n<=0){
    (*m)=(*b)=0;
    return(0);
  }
  s_xy=s_x=s_y=s_x2=s_y2=0;
  for(i=0;i<n;i++){
    s_xy+=x[i]*y[i];
    s_x +=x[i];
    s_y +=y[i];
    s_x2+=x[i]*x[i];
    s_y2+=y[i]*y[i];
  }
  if(n*s_x2-s_x*s_x==0){
    (*m)=(*b)=0;
    return(0);
  }
  (*m)=(n*s_xy-s_x*s_y)/(n*s_x2-s_x*s_x);
  (*b)=(s_y-m*s_x)/n;
  return((n*s_xy-s_x*s_y)/sqrt((n*s_x2-s_x*s_x)*(n*s_y2-s_y*s_y)));
}
*/




/*
http://onlinestatbook.com/chapter2/boxplots.html

Box Plots

Prerequisites
Percentiles

We have already discussed techniques for visually representing data (see histograms and frequency polygons). In this section we present another important graph, called a box plot. Box plots are useful for identifying outliers and for comparing distributions. We will explain box plots with the help of data from an in-class experiment. Students in Introductory Statistics were presented with a page containing 30 colored rectangles. Their task was to name the colors as quickly as possible; their times were recorded. We'll compare the scores for the 16 men and 31 women who participated in the experiment by making separate box plots for each gender. (Such a display is said to involve parallel box plots.)

There are several steps in constructing a box plot. The first relies on the 25th, 50th, and 75th percentiles in the distribution of scores. Figure 1 shows how these three statistics are used. For each gender we draw a box extending from the 25th percentile to the 75th percentile. The 50th percentile is drawn inside the box. Therefore,

the bottom of each box is the 25th percentile,

the top is the 75th percentile,

and the line in the middle is the 50th percentile.

The data for the women in our sample are shown in Table 1.
Table 1. Times (in seconds) for women to name the colors.
14 15 16 16 17 17 17 17 17 18 18 18 18 18 18 19
19 19 20 20 20 20 20 20 21 21 22 23 24 24 29

For these data, the 25th percentile is 17, the 50th percentile is 19, and the 75th percentile is 20. For the men (whose data are not shown), the 25th percentile is 19, the 50th percentile is 22.5, and the 75th percentile is 25.5.
Figure 1. The first step in creating box plots.

Before proceeding, the terminology in Table 2 is helpful.
Table 2. Terminology.
Name               | Formula                                                     | Value for Women's Data
Upper Hinge        | 75th Percentile                                             | 20
Lower Hinge        | 25th Percentile                                             | 17
H-Spread           | Upper Hinge - Lower Hinge                                   | 3
Step               | 1.5 x H-Spread                                              | 4.5
Upper Inner Fence  | Upper Hinge + 1 Step                                        | 24.5
Lower Inner Fence  | Lower Hinge - 1 Step                                        | 12.5
Upper Outer Fence  | Upper Hinge + 2 Steps                                       | 29
Lower Outer Fence  | Lower Hinge - 2 Steps                                       | 8
Upper Adjacent     | Largest value below Upper Inner Fence                       | 24
Lower Adjacent     | Smallest value above Lower Inner Fence                      | 14
Outside Value      | A value beyond an Inner Fence but not beyond an Outer Fence | 29 (this value is on the fence, but not beyond)
Far Out Value      | A value beyond an Outer Fence                               | None in these data

Continuing with the box plots, we put "whiskers" above and below each box to give additional information about the spread of data. Whiskers are vertical lines that end in a horizontal stroke. Whiskers are drawn from the upper and lower hinges to the upper and lower adjacent values (24 and 14 for the women's data).
Figure 2. The box plots with the whiskers drawn.

 

Although we don't draw whiskers all the way to outside or far out values, we still wish to represent them in our box plots. This is achieved by adding additional marks beyond the whiskers. Specifically, outside values are indicated by small "o"s, and far out values are indicated by asterisks. In our data, there are no far out values, and just one outside value. This outside value of 29 is for the women and is shown in Figure 3.
Figure 3. The box plots with the outside value shown.

There is one more mark to include in box plots (although sometimes it is omitted). We indicate the mean score for a group by inserting a plus sign. Figure 4 shows the result of adding means to our box plots.
Figure 4. The completed box plots.

Figure 4 provides a revealing summary of the data. Since half the scores in a distribution are between the hinges (recall that the hinges are the 25th and 75th percentiles), we see that half the women's times are between 17 and 20 whereas half the men's times are between 19 and 25. We also see that women generally named the colors faster than the men did, although one woman was slower than almost all of the men. Figure 5 shows the box plots for the women's data with detailed labels.

 
Figure 5. The box plots for the women's data.

Here are some other examples of box plots.
Time to move the mouse over a target
Draft lottery

Variations on box plots

Statistical analysis programs may offer options on how box plots are created. For example, the box plot in Figure 6 is constructed from our data but differs from the previous box plot in several ways.

   1. It does not mark outliers.
   2. The means are indicated by green lines rather than plus signs.
   3. The mean of all scores is indicated by a gray line.
   4. Individual scores are represented by dots. Since the scores have been rounded to the nearest second, any given dot might represent more than one score.
   5. The box for the women is wider than the box for the men because the widths of the boxes are proportional to the number of subjects of each gender (31 women and 16 men).

Figure 6. Box plots showing the individual scores and the means.

Each dot in Figure 6 represents a group of subjects with the same score (rounded to the nearest second). An alternative graphing technique is to jitter the points. This means spreading out different dots at the same horizontal position, one dot for each subject. The exact horizontal position of a point is determined randomly (under the constraint that different dots don't overlap). Spreading out the dots allows you to see multiple occurrences of a given score. Figure 7 shows what jittering looks like.
Figure 7. Box plots with the individual scores jittered.

Different styles of box plots are best for different situations, and there are no firm rules for which to use. When exploring your data you should try several ways of visualizing them. Which graph you include in your report should depend on how well different graphs reveal the aspects of the data you consider most important. 

*/
