# Poisson regression using D. virilis and D. melanogaster data   v. 16nov2013
# part of the ms "Efficient identification of Y chromosome sequences in the human and Drosophila genomes"  AB Carvalho and AG Clark
# Genome Research VOLUME 23 ISSUE 11  NOVEMBER 2013   PAGES: 1894-1907

#implements the Poisson regression test for the equality of the gain and loss rates
#data file structure: 
#branchlength, events, gainloss, branch_name



 dataset <- "AF_vir_mel"	#Procedure 1: Assumption-free (D. virilis + D. melanogaster)
#dataset <- "HGL_vir"		#Procedure 2: Homogeneous gain loss (D. virilis data)	
#dataset <- "HGL_mel"		#Procedure 2: Homogeneous gain loss (D. melanogaster data)	
#dataset <- "HGL_vir_mel"	#Procedure 2: Homogeneous gain loss (D. virilis + D. melanogaster)

if (dataset == "AF_vir_mel" )   { bias <- 1.000 ;d<-read.table("AF_vir_mel_data.txt", header=T, sep=",")} 
if (dataset == "HGL_vir" )      { bias <- 1.185 ;d<-read.table("HGL_vir_data.txt", header=T, sep=",") } 
if (dataset == "HGL_mel" ) 	{ bias <- 1.436 ;d<-read.table("HGL_mel_data.txt", header=T, sep=",") } 
if (dataset == "HGL_vir_mel" )  { bias <- 1.321 ;d<-read.table("HGL_vir_mel_data.txt", header=T, sep=",")} 

names(d)
attach(d)

factor(gainloss)

gainloss_test2<-glm(events ~ offset(log(branchlength)) + gainloss, family=poisson)
summary(gainloss_test2)

#the following line test for the fit of the data to a Poisson distribuition
1 - pchisq(deviance(gainloss_test2), df.residual(gainloss_test2))


coef_regr <- coefficients(summary(gainloss_test2))
beta1 <-  coef_regr[2,1]           #this is the Poisson regression coefficient for "gainloss"
Std_Err_beta1 <- coef_regr[2,2]    #this is Std. Error of the Poisson regression coefficient for "gainloss"
z <- beta1 / Std_Err_beta1         #this is the standard test for Ho = 0
adjusted_z <- (abs(beta1) - log(bias)) / Std_Err_beta1   #this is the test for Ho = log(bias)

noquote (c("P value for gain/loss ratio <> ", bias , 2*(1-pnorm(adjusted_z))))

gain_loss_ratio <- exp(abs(beta1))
noquote (c("gain/loss ratio = ", gain_loss_ratio ))
adjusted_beta1 <- (abs(beta1) - log(bias))
adjusted_gain_loss_ratio <- exp(adjusted_beta1)
noquote (c("unbiased gain/loss ratio = ", adjusted_gain_loss_ratio ))
noquote (c("95% confidence interval for the unbiased gain/loss ratio:"))
exp( adjusted_beta1 - 1.96 * Std_Err_beta1 )
exp( adjusted_beta1 + 1.96 * Std_Err_beta1 )