#v. 10dec12 1AM #vir_indelsim_free.r 20jun2012 v. 7 Jul 4 AM with normal priors ( gain 0 - 0.5 loss 0 - 0.01 ) #3 acceptance criteria: 0 losses of ancestral genes in red branches, 4 gains in red branches, and 3 losses in blue branches [virilis data] #based on indelsim_free_2012.R #Simulates gain and loss of genes on Y chromosome of 10 fly species, with melanogaster or virilis-biased ascertainment. Gives approx. Bayes posterior #densities of gain rate (genes / Myr ) and loss rate PER GENE (genes / gene / Myr). #assumed tree below #((((((sim:2,sec:2):3.4,mel:5.4):7.3,(yak:10.4,ere:10.4):2.3):31.5,ana:44.2):18.0,wil:62.2):0.7,(gri:42.9,(moj:32.5,vir:32.5):10.4):20); #2012 shortcuts and changes in variable names : gn=gene_number bb=blue_branch or blueb eq=equilibrium. I also removed the "_all" and "_keep" in most variable names; bkeep and dkeep replaced by gain_keep and loss_keep #net_gain_rate -> gen_gain_Myr ; gain_loss_ratio -> gen_gl_ratio ) #options(echo=FALSE) rm(list=ls()) #clears previous workspace data, if loaded saved_sim <- 1000 chrom_size<- 1000 max_gain <- 0.5 #Nominal value virilis 0.0636 genes / Myr melanogaster: 0.1113 genes / Myr max_loss <- 0.01 #Nominal value virilis 0.001645 genes lost / gene / Myr melanogaster: 0.001026 genes lost / gene / Myr #focal_species <- "mel" #this is the focal species, whose Y gene content has been well studied ("mel" or "vir") focal_species <- "vir" #this is the focal species, whose Y gene content has been well studied ("mel" or "vir") if (identical(focal_species , "vir" )){ #virilis-centric data obs_red_branch_gains <- 4 #Nominal value virilis: 4 melanogaster: 7 obs_blue_branch_losses <- 3 #Nominal value virilis: 3 melanogaster: 2 obs_bb_loss_ancestral_genes <- 3 #Note that obs_blue_branch_losses = obs_bb_loss_ancestral_genes + obs_bb_loss_non_ancestral_genes obs_bb_loss_non_ancestral_genes <- 0 ancestral_Y_genes <- 6 #Nominal value virilis: 6 melanogaster: 5 [this is the number of Y genes at the root (node A) ] red_branches <- c("A_H","H_I","I_vir") blue_branches <- c("E_mel","D_E","C_D","B_C","A_B","G_sim","G_sec","E_G","F_yak","F_ere","D_F","C_ana","B_wil","I_moj","H_gri") #virilis } if (identical(focal_species , "mel" )){ #melanogaster-centric data obs_red_branch_gains <- 7 obs_blue_branch_losses <- 2 obs_bb_loss_ancestral_genes <- 2 #Note that obs_blue_branch_losses = obs_bb_loss_ancestral_genes + obs_bb_loss_non_ancestral_genes obs_bb_loss_non_ancestral_genes <- 0 ancestral_Y_genes <- 5 red_branches <- c("E_mel","D_E","C_D","B_C","A_B") blue_branches <- c("G_sim","G_sec","E_G","F_yak","F_ere","D_F","C_ana","B_wil","I_vir","I_moj","H_I","A_H","H_gri") } set.seed(1963) # branch names all_branches <- c("E_mel","D_E","C_D","B_C","A_B","G_sim","G_sec","E_G","F_yak","F_ere","D_F","C_ana","B_wil","I_vir","I_moj","H_I","A_H","H_gri") # branch lengths branch_length<-c(5.4,7.3,31.5,18.0,0.7,2.0,2.0,3.4,10.4,10.4,2.3,44.2,62.2,32.5,32.5,10.4,20.0,42.9) # n=18 branches names(branch_length)<-all_branches attempted_sim <- 0 gain_keep<-rep(NA,saved_sim); loss_keep<-rep(NA,saved_sim); new_genes_bb<-rep(NA,saved_sim); new_genes_bb_loss<-rep(NA,saved_sim); net_new_genes_bb<-rep(NA,saved_sim) branch_gain<-matrix(nrow = saved_sim, ncol = 18, byrow = FALSE) colnames(branch_gain) <- all_branches branch_loss<-matrix(nrow = saved_sim, ncol = 18, byrow = FALSE) colnames(branch_loss) <- all_branches branch_loss_newgenes<-matrix(nrow = saved_sim, ncol = 18, byrow = FALSE) colnames(branch_loss_newgenes) <- all_branches node_gn<-matrix(nrow = saved_sim, ncol = 19, byrow = FALSE) colnames(node_gn)<-c("A","B","C","D","E","F","G","H","I","mel","sim","sec","yak","ere","ana","wil","vir","moj","gri") gene_gain <- rep(NA,saved_sim) gene_loss <- rep(NA,saved_sim) gen_gl_ratio <- rep(NA,saved_sim) ancestral_genes_focal_species <- rep(NA,saved_sim) bb_loss_ancestral_genes <- rep(NA,saved_sim) rb_loss_ancestral_genes <- rep(NA,saved_sim) eq_gn <- rep(NA,saved_sim) gen_gain_Myr <- rep(NA,saved_sim) index<-0 while(index < saved_sim) { attempted_sim <- attempted_sim + 1 #draw gene gain rate and loss rate from uniform distributions and simulate gains and losses. If tree has the right number of gene gains on the red branch, gene losses (among the known genes) in the blue branches, and no loss of ancestral genes in the red branches, then keep these values of gain and loss rate #lower range of gain and loss set to 0. Upper range obtained from pilot runs gain <- max_gain*runif(1,0,1) # uniform distribuition between 0 and max_gain loss <- max_loss*runif(1,0,1) # uniform distribuition between 0 and max_loss #pgg_matrix is position of gene gain, a random uniform number between 0 and 1 specifying the relative position in the branch where a gene was gained pgg_matrix <- matrix(runif(chrom_size*18, min=0, max=1), nrow = 18, ncol = chrom_size) gain_matrix <- matrix(rbinom(chrom_size*18, 1, gain*branch_length/chrom_size),nrow = 18, ncol = chrom_size) #chrom_size genes, 18 branches rownames(gain_matrix)<-all_branches branch_length_matrix <- matrix(c(rep(branch_length,chrom_size)), ncol=chrom_size, nrow=18,byrow=FALSE) effective_branch_length_matrix <- branch_length_matrix * pgg_matrix loss_matrix_newgenes <- matrix(rbinom(chrom_size*18, 1, (1 - exp(-loss*effective_branch_length_matrix)) ) , nrow = 18, ncol = chrom_size ) not_loss_matrix_newgenes <- ! loss_matrix_newgenes net_gain_matrix <- gain_matrix & not_loss_matrix_newgenes #this contains the newly arised genes of each branch that were not subsequently lost in the same branch rownames(net_gain_matrix)<-all_branches #Loss_matrix do not contain actual losses; each element contains the outcome of the loss process (loss / no loss) irrespective whether a gene exists or not at this paticular position of the chromosome. #To obtain the actual losses use a logical AND between each element of loss_matrix , and the gene content of this particular branch. loss_matrix <- matrix(rbinom(chrom_size*18, 1, (1 - exp(-loss*branch_length)) ), nrow = 18, ncol = chrom_size) not_loss_matrix <- ! loss_matrix # became 1 1 1 1 1 1 1 0 1 1 1 1 rownames(not_loss_matrix)<-all_branches if ( ( sum(is.na(gain_matrix)) + sum(is.na(loss_matrix_newgenes)) + sum(is.na(loss_matrix)) ) > 0 ) { cat("warning: NA present in matrix \n" , "gain=" , gain , "loss=" , loss , "\n") ; break} #below is the simulation of gains and losses in all branches (red and blue), in a a chromosome with "chrom_size" potential loci, across 19 nodes. "0" is gene absence, "1" is presence #First we compute the loss of "old" genes with "&" (i.e., genes already present in the start node of the branch), and then add the new genes (with "|"), already including the losses among them. Start from the oldest nodes. gene_content<-matrix(rep(0,19*chrom_size), nrow = 19, ncol = chrom_size) rownames(gene_content) <- c("A","B","C","D","E","F","G","H","I","mel","sim","sec","yak","ere","ana","wil","vir","moj","gri") # node A is the root of Sophophora and Drosophila subgenera gene_content["A",] <-rep(c(1,0), c(ancestral_Y_genes,chrom_size - ancestral_Y_genes)) #in virilis, six "1" followed by "0" until the end. Node A has 6 ancestral genes: kl-2, kl-3, Ppr-Y, ORY, PRY, Jya # Sophophora subgenus gene_content["B",] <- ( gene_content["A",] & not_loss_matrix["A_B",] ) | net_gain_matrix["A_B",] gene_content["C",] <- ( gene_content["B",] & not_loss_matrix["B_C",] ) | net_gain_matrix["B_C",] gene_content["D",] <- ( gene_content["C",] & not_loss_matrix["C_D",] ) | net_gain_matrix["C_D",] gene_content["E",] <- ( gene_content["D",] & not_loss_matrix["D_E",] ) | net_gain_matrix["D_E",] gene_content["mel",]<- ( gene_content["E",] & not_loss_matrix["E_mel",]) | net_gain_matrix["E_mel",] gene_content["G",] <- ( gene_content["E",] & not_loss_matrix["E_G",] ) | net_gain_matrix["E_G",] gene_content["sim",]<- ( gene_content["G",] & not_loss_matrix["G_sim",]) | net_gain_matrix["G_sim",] gene_content["sec",]<- ( gene_content["G",] & not_loss_matrix["G_sec",]) | net_gain_matrix["G_sec",] gene_content["F",] <- ( gene_content["D",] & not_loss_matrix["D_F",] ) | net_gain_matrix["D_F",] gene_content["yak",]<- ( gene_content["F",] & not_loss_matrix["F_yak",]) | net_gain_matrix["F_yak",] gene_content["ere",]<- ( gene_content["F",] & not_loss_matrix["F_ere",]) | net_gain_matrix["F_ere",] gene_content["ana",]<- ( gene_content["C",] & not_loss_matrix["C_ana",]) | net_gain_matrix["C_ana",] gene_content["wil",]<- ( gene_content["B",] & not_loss_matrix["B_wil",]) | net_gain_matrix["B_wil",] # Drosophila subgenus gene_content["H",] <- ( gene_content["A",] & not_loss_matrix["A_H",] ) | net_gain_matrix["A_H",] gene_content["I",] <- ( gene_content["H",] & not_loss_matrix["H_I",] ) | net_gain_matrix["H_I",] gene_content["vir",]<- ( gene_content["I",] & not_loss_matrix["I_vir",] )| net_gain_matrix["I_vir",] gene_content["moj",]<- ( gene_content["I",] & not_loss_matrix["I_moj",]) | net_gain_matrix["I_moj",] gene_content["gri",]<- ( gene_content["H",] & not_loss_matrix["H_gri",]) | net_gain_matrix["H_gri",] red_branch_gains <- sum(gene_content[focal_species,]) - sum(gene_content["A",]) change_matrix <- matrix(rep(0,18*chrom_size),nrow = 18, ncol = chrom_size) #chrom_size genes, 18 branches rownames(change_matrix)<-all_branches #elements will be 0 in case of no change, +1 in case of gain, and -1 in case of loss change_matrix["A_B",] <- gene_content["B",] - gene_content["A",] change_matrix["B_C",] <- gene_content["C",] - gene_content["B",] change_matrix["C_D",] <- gene_content["D",] - gene_content["C",] change_matrix["D_E",] <- gene_content["E",] - gene_content["D",] change_matrix["E_mel",] <- gene_content["mel",]- gene_content["E",] change_matrix["A_H",] <- gene_content["H",] - gene_content["A",] change_matrix["H_I",] <- gene_content["I",] - gene_content["H",] change_matrix["I_vir",] <- gene_content["vir",]- gene_content["I",] change_matrix["I_moj",] <- gene_content["moj",]- gene_content["I",] change_matrix["H_gri",] <- gene_content["gri",]- gene_content["H",] change_matrix["E_G",] <- gene_content["G",] - gene_content["E",] change_matrix["G_sim",] <- gene_content["sim",]- gene_content["G",] change_matrix["G_sec",] <- gene_content["sec",]- gene_content["G",] change_matrix["D_F",] <- gene_content["F",] - gene_content["D",] change_matrix["F_yak",] <- gene_content["yak",]- gene_content["F",] change_matrix["F_ere",] <- gene_content["ere",]- gene_content["F",] change_matrix["C_ana",] <- gene_content["ana",]- gene_content["C",] change_matrix["B_wil",] <- gene_content["wil",]- gene_content["B",] change_gain_matrix <- (change_matrix == 1) change_loss_matrix <- (change_matrix == -1) change_gain_matrix <- change_gain_matrix | ( gain_matrix & loss_matrix_newgenes ) #This includes the genes that were gained and lost in the same branch. change_loss_matrix <- change_loss_matrix | ( gain_matrix & loss_matrix_newgenes ) #This includes the genes that were gained and lost in the same branch. change_gain_matrix <- change_gain_matrix * 1 #forces conversion from logical to numeric (avoids problems with the "identical" function used in the acceptance) change_loss_matrix <- change_loss_matrix * 1 #forces conversion from logical to numeric (avoids problems with the "identical" function used in the acceptance) #classifying the losses (change_loss_matrix ) in losses of known genes (i.e., those observed in the focal species) and losses of new genes focal_species_matrix <- matrix(c(rep(gene_content[focal_species,],18)), ncol=chrom_size, nrow=18,byrow=TRUE) change_loss_new_genes_matrix <- change_loss_matrix *(! focal_species_matrix) change_loss_known_genes_matrix <- change_loss_matrix * focal_species_matrix bb_loss_known_genes_value <- sum( change_loss_known_genes_matrix[blue_branches,] ) # for additional acceptance criteria # number of ancestral genes that survived in the focal species ancestral_genes_focal_species_value <- sum(gene_content[focal_species,1:ancestral_Y_genes]) # losses of ancestral genes in the blue branches bb_loss_ancestral_genes_value <- sum( change_loss_matrix[blue_branches,1:ancestral_Y_genes] ) # losses of ancestral genes in the red branches rb_loss_ancestral_genes_value <- sum( change_loss_matrix[red_branches,1:ancestral_Y_genes] ) #cat("red branch gains:" , red_branch_gains, " blue branch knwon genes losses:" , bb_loss_known_genes_value, " rb_loss_ancestral_genes:" , rb_loss_ancestral_genes_value , " ancestral_genes_focal_species: ",ancestral_genes_focal_species_value,"\n") if ( identical(red_branch_gains,obs_red_branch_gains) && identical(bb_loss_known_genes_value,obs_blue_branch_losses) && identical(rb_loss_ancestral_genes_value,0) ) { #2008; 2jul12 if ( identical(red_branch_gains,obs_red_branch_gains) && identical(bb_loss_known_genes_value,obs_blue_branch_losses) ) { #cat("attempted simulations: ", attempted_sim, " red branch gains:" , red_branch_gains, " blue branch knwon genes losses:" , bb_loss_known_genes_value, " rb_loss_ancestral_genes:" , rb_loss_ancestral_genes_value , " ancestral_genes_focal_species: ",ancestral_genes_focal_species_value,"\n") index <- index + 1 gain_keep[index] <- gain loss_keep[index] <- loss eq_gn[index] <- gain / loss node_gn[index,] <- rowSums(gene_content) #count of the genes (sum across the chromosome) for each node branch_gain[index,] <- rowSums(change_gain_matrix) branch_loss[index,] <- rowSums(change_loss_matrix) branch_loss_newgenes[index,] <- rowSums(change_loss_new_genes_matrix) new_genes_bb[index] <- sum(rowSums( change_gain_matrix[blue_branches,] )) new_genes_bb_loss[index] <- sum(rowSums( change_loss_new_genes_matrix[blue_branches,] )) net_new_genes_bb[index] <- new_genes_bb[index] - new_genes_bb_loss[index] gene_gain[index] <- sum(change_gain_matrix) gene_loss[index] <- sum(change_loss_matrix) gen_gain_Myr[index] <- (gene_gain[index] - gene_loss[index]) / 338.1 # this gives the statistic: is the gene content increasing or decreasing? total branch length of the phylogeny = 275.2 Myr blue branch + 62.9 Myr red branch = 338.1 Myr gen_gl_ratio[index] <- gene_gain[index] / gene_loss[index] # this is the basic statitics tested by the Poisson regression ancestral_genes_focal_species[index] <- ancestral_genes_focal_species_value # additional acceptance criteria considered in 2012 bb_loss_ancestral_genes[index] <- bb_loss_ancestral_genes_value # additional acceptance criteria considered in 2012 rb_loss_ancestral_genes[index] <- rb_loss_ancestral_genes_value # additional acceptance criteria considered in 2012 } } result <- data.frame(gain_keep,loss_keep,eq_gn,new_genes_bb,new_genes_bb_loss,net_new_genes_bb,gene_gain,gene_loss,gen_gain_Myr,gen_gl_ratio,ancestral_genes_focal_species,bb_loss_ancestral_genes,rb_loss_ancestral_genes) save.image() ############ ANCILLARY RESULTS ################## cat ("FOCAL SPECIES: ", focal_species , "\n\n") cat ("Gains of new genes in the BLUE branches \n") ; colMeans(branch_gain[,blue_branches]) cat ("Total gains: ", mean(new_genes_bb) , "\n\n") cat ("Losses of new genes in the BLUE branches \n") ; colMeans(branch_loss_newgenes[,blue_branches]) cat ("Total losses: ", mean(new_genes_bb_loss) , "\n\n") cat ("Gains of new genes in the RED branches \n") ; colMeans(branch_gain[,red_branches]) cat ("Sum of gains: ", sum(colMeans(branch_gain[,red_branches])) , "\n\n") cat ("Losses of new genes in the RED branches \n"); colMeans(branch_loss_newgenes[,red_branches]) cat ("Sum of losses: ", sum(colMeans(branch_loss_newgenes[,red_branches])) , "\n\n") cat ("average number of genes in the nodes" , "\n") colMeans(node_gn) cat ("attempted_sim: ", attempted_sim , "saved simulations" , index , "\n") cat ("percent success ratio: ", 100*index / attempted_sim , "\n" ) cat ("gene_gain", mean(gene_gain), "gene_loss", mean(gene_loss) , "\n" ) cat ("average ratio of gains to losses:", mean(gen_gl_ratio) , "\n") ############ MAIN RESULTS ################## cat ("FOCAL SPECIES: ", focal_species , "\n\n") cat("average gain rate (gene / Myr) = " , mean(gain_keep) , "\n") cat("average loss rate per gene (gene / gene / Myr) = " , mean(loss_keep), "\n") cat("number of simulations that losses outnumber gains: ", sum(gen_gl_ratio < 1)," out of ",index , "\n") #Fig 3 #hist (gen_gain_Myr , br=20,col="light grey",cex.lab=1.4,cex.axis=1.15, xlab="Net gain rate (genes / Myr)", ylab="Density" ,main="Fig 3") cat("Posterior density of net rate of Y-linked gene gain in the Drosophila phylogeny. \n") cat("The average net gain rate is " , mean(gen_gain_Myr) , " genes / Myr. range: " , min(gen_gain_Myr) , " to " , max(gen_gain_Myr) , "\n\n") #Fig Suppl 11A #hist (gen_gl_ratio , br=20,col="light grey",cex.lab=1.4,cex.axis=1.15, xlab="Ratio of gain rate to loss rate", ylab="Density" ,main="Fig Suppl 11A") cat("Posterior distribution of the ratio of the rate of gene gain (genes/Myr) to the rate of gene loss (genes/Myr).\n") cat("The average value is " , mean(gen_gl_ratio) , " range: " , min(gen_gl_ratio) , " to " , max(gen_gl_ratio) , " 95% credibility interval: " , quantile(gen_gl_ratio , 0.025) , " - " , quantile(gen_gl_ratio , 0.975) , "\n\n") #Fig Suppl 11B #plot(gain_keep,loss_keep,xlab="Gene gain rate per MY", ylab="Gene loss rate per gene per MY",pch=19,col="blue",main="Fig Suppl 11B") cat("Joint posterior distribution of gain rate and loss rate per gene.\n") cat("The average values are " , mean(gain_keep), " genes / Myr and " , mean(loss_keep) , " genes / gene / Myr, respectively. \n\n" ) #Fig Suppl 11C #hist (eq_gn, br=20,col="light grey",cex.lab=1.4,cex.axis=1.15, xlab="Equilibrium gene number", ylab="Density" ,main="Fig Suppl 11C") cat("Posterior distribution of the predicted equilibrium gene number. \n") cat("The average value is " , mean(eq_gn) , " genes. range: " , min(eq_gn) , " to " , max(eq_gn) , "\n") focal_species_gn <- ancestral_Y_genes + obs_red_branch_gains cat( sum(eq_gn < focal_species_gn) , " out " , saved_sim , " simulations had predicted equilibrium gene number below " , focal_species_gn , " (the present gene number of the focal species Y ) \n" ) cat( sum(eq_gn < ancestral_Y_genes) , " out " , saved_sim , " simulations had predicted equilibrium gene number below " , ancestral_Y_genes , " (the number of ancestral Y genes ) \n" ) #to export to SYSTAT: write.table(result[,c("gain_keep","loss_keep","eq_gn","gene_gain","gene_loss","gen_gain_Myr","gen_gl_ratio","ancestral_genes_focal_species","bb_loss_ancestral_genes")],file="systat_file.txt",sep="\t",quote=FALSE,col.names=c("index gain","loss","eq_gn","gene_gain","gene_loss","gen_gain_Myr","gen_GL_ratio","ancg_focal","bb_loss_ancg") )