# Download https://rohlfslab.weebly.com/software.html
"Updated R package for EVE (Expression Variance and Evolution)"
# https://genomebiology.biomedcentral.com/articles/10.1186/s13059-021-02323-0
#remotes::install_gitlab("sandve-lab/evemodel")

#The test compares a model with two optimal expression levels, one for the arid branches and another for the mesic branches, against the null-model which has the same optimal expression level across the entire tree 


## miscellaneaous
library(tidyr)
library(dplyr)
library(tibble)

## tree management
library(ape)
library(treeio)

## eve model
library(evemodel)


## DESeq2
library(DESeq2)

##################
## read annotated tree

file_in <- "local_online_species_tree_rep_8_20200605.raxml.annotated_habitat.nhx"

nhx_tree <- read.nhx(file_in)
arbre_ape <- as.phylo(nhx_tree)
arbre_ape$tip.label[arbre_ape$tip.label=="Peromyscus_maniculatus_bairdii"]="Peromyscus_maniculatus"
arbre_ape$tip.label[arbre_ape$tip.label=="Mastomys_kollmanspergeri"]="Mastomys_kollmannspergeri"
arbre_ape$tip.label[arbre_ape$tip.label=="Heterocephalus_glaber_male"]="Heterocephalus_glaber"
arbre_ape$tip.label[arbre_ape$tip.label=="Arvicanthis_niloticus_sen"]="Arvicanthis_niloticus"

nhx_tree@phylo$tip.label[nhx_tree@phylo$tip.label=="Peromyscus_maniculatus_bairdii"]="Peromyscus_maniculatus"
nhx_tree@phylo$tip.label[nhx_tree@phylo$tip.label=="Mastomys_kollmanspergeri"]="Mastomys_kollmannspergeri"
nhx_tree@phylo$tip.label[nhx_tree@phylo$tip.label=="Heterocephalus_glaber_male"]="Heterocephalus_glaber"
nhx_tree@phylo$tip.label[nhx_tree@phylo$tip.label=="Arvicanthis_niloticus_sen"]="Arvicanthis_niloticus"

## Read expression data and metadata

## count_table
Alltable_onl = read.csv('datalocalandonline_cds_counts_2025_final.tsv', h=T, sep = "\t", row.names=1) #15796   125
colnames(Alltable_onl) = gsub("loc_cds_SHPC43_Mastomys_kollmanspergeri", "loc_cds_SHPC43_Mastomys_kollmannspergeri", colnames(Alltable_onl))
colnames(Alltable_onl) = gsub("loc_cds_SHPC74_Mastomys_kollmanspergeri", "loc_cds_SHPC74_Mastomys_kollmannspergeri", colnames(Alltable_onl))
colnames(Alltable_onl) = gsub("loc_cds_SHPC75_Mastomys_kollmanspergeri", "loc_cds_SHPC75_Mastomys_kollmannspergeri", colnames(Alltable_onl))
colnames(Alltable_onl) = gsub("loc_cds_SHPC76_Mastomys_kollmanspergeri", "loc_cds_SHPC76_Mastomys_kollmannspergeri", colnames(Alltable_onl))
colnames(Alltable_onl) = gsub("SHPC46_Arvicanthis_niloticus_lmo", "SHPC46_Mastomys_natalensis", colnames(Alltable_onl)) #change Ani_lmo to Mna even if not using in expression)
colnames(Alltable_onl) = gsub("onl5_cds_SRR17216323_Peromyscus_maniculatus", "onl_cds_SRR17216323_Peromyscus_maniculatus", colnames(Alltable_onl))
colnames(Alltable_onl) = gsub("onl5_cds_SRR17216333_Peromyscus_maniculatus", "onl_cds_SRR17216333_Peromyscus_maniculatus", colnames(Alltable_onl))
colnames(Alltable_onl) = gsub("onl5_cds_SRR17216352_Peromyscus_maniculatus", "onl_cds_SRR17216352_Peromyscus_maniculatus", colnames(Alltable_onl))


## coldata : columns with correct names of species
coldata <- read.csv("coldata_July2025.tsv", sep = "\t")
coldata_tot <- coldata[which(coldata$total_set == "yes"),]
coldata_mur <- coldata[which(coldata$muridae_set == "yes"),]
coldata_rec <- coldata[which(coldata$recent_set == "yes"),]
coldata_anc <- coldata[which(coldata$ancient_set == "yes"),]


###################
#' @description Patch to fix the issue of eve estimation functions
#'     when there is one sample is a species
#'
#' @details
#' In evemodel, there are initialisation functions initParamsOneTheta
#' and initParamsTwoTheta that set up intraspecific variance, which
#' does not work in case of ine sample.
#' 
#' So we added line:
#' beta <- replace_na(beta, 1)
#' 
initParamsOneTheta <- function(gene.data, colSpecies)
{
  colSpeciesIndices <- split(seq_along(colSpecies), f = colSpecies)
  species.mean <- sapply(colSpeciesIndices, function(i){ rowMeans(gene.data[ ,i, drop=F]) })
  species.var <- sapply(colSpeciesIndices, function(i){ apply(gene.data[ ,i, drop=F],1,var)})
  
  theta <- rowMeans(species.mean,na.rm = T)
  sigma2 <- apply(species.mean,1,var,na.rm = T)
  alpha <- .5
  beta <- rowMeans(species.var,na.rm = T) / sigma2
  
  beta <- replace_na(beta, 1)
  return(cbind(theta,sigma2,alpha,beta))
}

assignInNamespace("initParamsOneTheta",initParamsOneTheta,"evemodel") # to ensure this function is used


initParamsTwoTheta <- function(gene.data, colSpecies, shiftSpecies)
  {
    colSpeciesIndices <- split(seq_along(colSpecies), f = colSpecies)
    species.mean <- sapply(colSpeciesIndices, function(i){ rowMeans(gene.data[,i, drop=F]) })
    species.var <- sapply(colSpeciesIndices, function(i){ apply(gene.data[,i, drop=F],1,var) })
    
    
    nonShiftSpecies <- setdiff(colnames(species.mean),shiftSpecies)
    
    theta1 <- rowMeans(species.mean[ ,nonShiftSpecies, drop=F],na.rm=T)
    theta2 <- rowMeans(species.mean[ ,shiftSpecies, drop=F],na.rm=T)
    sigma2 <- apply(species.mean,1,var,na.rm=T)
    alpha <- .5
    beta <- rowMeans(species.var,na.rm=T) / sigma2
    
    beta <- replace_na(beta, 1)
    
    return(cbind(theta1,theta2,sigma2,alpha,beta))
  }

assignInNamespace("initParamsTwoTheta",initParamsTwoTheta,"evemodel")

 
 
#' @description Function to perform eve analysis, given expression
#'     data, tree, mesic/arid branches, and normalization method.
#'
#' @param alltable table of the gene X individuals raw expression data
#' @param coldata columns of alltable that will be used (ie matching
#'     species)
#' @param tree phylogenetic tree of species
#' @param normMethod normalisation method of the data
#' @param nrow umber of rows used for analysis (mostly for development
#'     purpose). nrow=0 means all rows are considered #
#' @return a list with many input features, that will be helpful for
#'     pipeline management:
#' 
#'    reseval: tibble with outputs of evemodeling as columns: LRT,
#'      theta1, theta2, sigma2, alpha, beta, shift.direction, pval,
#'      llTwoTheta, llOneTheta, gene, isSig, FDR, isFDR (see
#'      evemodel.twoThetaTest output).
#'    expr: data frame gene X individuals of the used expression levels
#'    shifts: vector of branch specific booleans for presence of shift ,
#'    speciesLabels: species used in the data
#'    tree: the tree
#' 
compute_eve=function(alltable, coldata, tree=nhx_tree, normMethod="Basemean", nrow=0)
{
    if (nrow!=0){
        data = alltable[1:nrow,colnames(alltable) %in% coldata$ID]
    } else {
        data = alltable[,colnames(alltable) %in% coldata$ID]
    }

    data <- round(data[complete.cases(data),])
    data[is.na(data)]=0
    n=apply(data,1,function(x){sum(x==0)})
    data=data[n==0,]

    ## Prepare coldata
    coldata <- coldata[ which(coldata$ID %in% colnames(data)), ]
    coldata <- coldata[ order(match(coldata$ID, colnames(data))), ]#Reorder table such coldata and matrix has same order
    coldata$Batch_number <- as.factor(coldata$Batch_number)
    coldata$cond_season = factor(coldata$cond_season, level = c("arid","mesic"))

    combExprMat = as.matrix(data)
    colnames(combExprMat) = coldata$species
    speciesLabels= coldata$species
    row.names(coldata)=coldata$ID
    coldata=coldata[names(data),]

    ## normalize the data
    ddsInput <- DESeqDataSetFromMatrix(countData = as.matrix(round(data)),
                                       colData = coldata,
                                       design = ~ 1)
    dds <- DESeq(ddsInput)

    if (normMethod=="ntd")
    {
        dataexpr <- data.frame(assay(normTransform(dds)))
        names(dataexpr) = coldata$species
    } else if (normMethod=="Basemean")
    {
        dataexpr <- data.frame(counts(dds, norm=T))
        names(dataexpr) = coldata$species
    }

    species_toremove = arbre_ape$tip.label[!arbre_ape$tip.label %in% speciesLabels]
    arbre_io_tot=drop.tip(tree,species_toremove)
    dtree=as_tibble(arbre_io_tot)
    edge_node  <- dtree$node[dtree$Trait=="yes"]
    arbre_ape_tot <- as.phylo(arbre_io_tot)

    # stores boolean values corresponding to whether or not the branch
    # is included in the shift. This variable will be used later when
    # conducting the twoThetaTest.
    thetaShiftBool_full <- arbre_ape_tot$edge[,2]%in%edge_node


    truedatatest <- twoThetaTest(tree = arbre_ape_tot, gene.data = dataexpr, 
                                 isTheta2edge = thetaShiftBool_full, colSpecies = speciesLabels, cores=2)

    res_truedata=tibble(LRT = truedatatest$LRT) %>% 
        bind_cols(as_tibble(truedatatest$twoThetaRes$par)) %>% 
        mutate( shift.direction = ifelse(theta2>theta1, "up","down")) %>% 
        mutate( pval = pchisq(LRT,df = 1,lower.tail = F))  %>% 
        mutate( llTwoTheta = truedatatest$twoThetaRes$ll)  %>% 
        mutate( llOneTheta = truedatatest$oneThetaRes$ll ) %>% 
        mutate( gene = row.names(dataexpr)) %>%
        mutate( isSig = pval < 0.05) %>% 
        mutate( FDR = p.adjust(pval)) %>%
        mutate( isFDR = FDR < 0.1)

    return(list(reseval = res_truedata, expr = dataexpr, shifts = thetaShiftBool_full, speciesLabels = speciesLabels, tree = arbre_ape_tot))
}


#' @description Simulation with evemodel.simTwoTheta function, of
#'     expression using output of compute_eve, ie estimated
#'     parameters, tree, shift branches, ...
#'
#' @param res_bm output of compute_eve function
#' 
#' @return a dataframe  genes X individuals with simulated expression.

simulExp=function(res_bm)
{
    reseval = res_bm$reseval

    simThetaShift <- t(
        sapply(1:nrow(reseval), function(i)
        {
            
            lparam=reseval[i,]
            return(
                simTwoTheta(1, res_bm$tree, res_bm$speciesLabels, res_bm$shifts, 
                            theta1=lparam$theta1, theta2=lparam$theta2, sigma2 = lparam$sigma2,
                            alpha = lparam$alpha, beta = lparam$beta)
            )
        }
        )
    )

    colnames(simThetaShift)=res_bm$speciesLabels
    
    return(simThetaShift)
}


#' @description Pipeline of simulation and test with evemodel of the
#'     simulated expression levels. From previously estimation of data
#'     with eve (see compute_eve function), for each parameter
#'     different from theta1 and theta2, it is set to the median of
#'     its estimated values among all genes. Gene specific theta1 is
#'     used, and theta2 = theta1 * ratio or theta2 = theta1 / ratio,
#'     depending if the estimated shift was up or down.
#' 
#' @param ratio ratio of theta1 vs theta2
#' @param res_bm result of compute_eve
#' @param coldata columns of alltable that will be used (ie matching
#'     species)
#' @param name suffix of the figures filenames
#'
#' @details Plot histogram of ratio of simulations between arid & mesic
#'     mean expression ('observed_ratio_mesic_arid=...pdf') and
#'     p-values of twoThetaTest of these simulations
#'     ('pval_ratio...pdf')/
#'
#' @return similar output as compute_eve function, but with simulated
#'     values
#' 

testRatioEve = function(ratio, res_bm, coldata, name="")
{
    if (name!=""){
        name=paste("_",name,sep="")
    }

    ressimul = res_bm
    ressimul$reseval$alpha = median(res_bm$reseval$alpha)
    ressimul$reseval$beta  = median(res_bm$reseval$beta)
    ressimul$reseval$sigma2  = median(res_bm$reseval$sigma2)
    
    ressimul$reseval$theta2 = (ifelse(res_bm$reseval$shift.direction=="up",ratio,1./ratio))*res_bm$reseval$theta1
    
    sim=simulExp(ressimul)

    sarid=sim[,coldata$cond_season=="arid"]
    smesic=sim[,coldata$cond_season=="mesic"]

    mmesic=apply(smesic,1,mean)
    marid=apply(sarid,1,mean)

    pdf(paste("observed_ratio_mesic_arid=",ratio,name,".pdf",sep=""))
    hist(marid/mmesic,nclass=100,xlab="ratio of arid/mesic expression ratio")  
    dev.off()

    ## estimation
    truedatatest <- twoThetaTest(tree = ressimul$tree,
                                 gene.data = sim, 
                                 isTheta2edge = ressimul$shift,
                                 colSpecies = ressimul$speciesLabels, cores=2)

    
    res_truedata=tibble(LRT = truedatatest$LRT) %>% 
        bind_cols(as_tibble(truedatatest$twoThetaRes$par)) %>% 
        mutate( shift.direction = ifelse(theta2>theta1, "up","down")) %>% 
        mutate( pval = pchisq(LRT,df = 1,lower.tail = F))  %>% 
        mutate( llTwoTheta = truedatatest$twoThetaRes$ll)  %>% 
        mutate( llOneTheta = truedatatest$oneThetaRes$ll ) %>% 
        mutate( gene = row.names(res_bm$reseval$gene)) %>%
        mutate( isSig = pval < 0.05) %>% 
        mutate( FDR = p.adjust(pval)) %>%
        mutate( isFDR = FDR < 0.1)

    col=c("up"="blue","down"="red")
    pdf(paste("pval_ratio=",ratio,name,".pdf",sep=""))
    plot(res_truedata$pval,
         res_truedata$theta1,
         col=col[res_truedata$shift.direction],pch=16,
         log="x",xlab="p-values twoThetatest", ylab="theta1")
    dev.off()

    return(list(reseval = res_truedata, expr = sim, shifts = ressimul$shifts, speciesLabels = ressimul$speciesLabels, tree = ressimul$tree))
}


########################################
#### Applications of the functions

#################### murinae
#### Eve analysis

res_mur_bm=compute_eve(Alltable_onl, coldata_mur, tree=nhx_tree, normMethod="ntd")

# remove NAs
resna = which(apply(res_mur_bm$reseval,1,function(l){any(is.na(l))}))

if (length(resna)>0){
    res_mur_bm$reseval=res_mur_bm$reseval[-resna,]
    res_mur_bm$expr=res_mur_bm$expr[-resna,]
}


###########################
## Test on recent data

res_rec_bm=compute_eve(Alltable_onl, coldata_rec, tree=nhx_tree, normMethod="ntd")

# remove NAs
resna = which(apply(res_rec_bm$reseval,1,function(l){any(is.na(l))}))

if (length(resna)>0){
    res_rec_bm$reseval=res_rec_bm$reseval[-resna,]
    res_rec_bm$expr=res_rec_bm$expr[-resna,]
}


## Test on ancient data

res_anc_bm=compute_eve(Alltable_onl, coldata_anc, tree=nhx_tree, normMethod="ntd")

# remove NAs
resna = which(apply(res_anc_bm$reseval,1,function(l){any(is.na(l))}))

if (length(resna)>0){
    res_anc_bm$reseval=res_anc_bm$reseval[-resna,]
    res_anc_bm$expr=res_anc_bm$expr[-resna,]
}


## Test on total data

res_tot_bm=compute_eve(Alltable_onl, coldata_tot, tree=nhx_tree, normMethod="ntd")

# remove NAs
resna = which(apply(res_tot_bm$reseval,1,function(l){any(is.na(l))}))

if (length(resna)>0){
    res_tot_bm$reseval=res_tot_bm$reseval[-resna,]
    res_tot_bm$expr=res_tot_bm$expr[-resna,]
}



## DeSeq2 analysis
Demur = read.csv('Table_DE_results_cond_season_murinae_deseq.tsv', h=T, sep = "\t", row.names=1)
Demur[,"log2FoldChange"]=as.numeric(Demur[,"log2FoldChange"])
Demur=Demur[row.names(res_mur_bm$expr),]
Demur$sig = as.factor(Demur$sig)

Detot = read.csv('Table_DE_results_cond_season_total_deseq.tsv', h=T, sep = "\t", row.names=1)
Detot[,"log2FoldChange"]=as.numeric(Detot[,"log2FoldChange"])
Detot=Detot[row.names(res_tot_bm$expr),]
Detot$sig = as.factor(Detot$sig)

Deanc = read.csv('Table_DE_results_cond_season_ancient_deseq.tsv', h=T, sep = "\t", row.names=1)
Deanc[,"log2FoldChange"]=as.numeric(Deanc[,"log2FoldChange"])
Deanc=Deanc[row.names(res_anc_bm$expr),]
Deanc$sig = as.factor(Deanc$sig)

Derec = read.csv('Table_DE_results_cond_season_recent_deseq.tsv', h=T, sep = "\t", row.names=1)
Derec[,"log2FoldChange"]=as.numeric(Derec[,"log2FoldChange"])
Derec=Derec[row.names(res_rec_bm$expr),]
Derec$sig = as.factor(Derec$sig)

#### Comparisons of DeSeq2 & eve analysis

table(res_tot_bm$reseval$isFDR, Detot$sig=="FDR<0.1")
table(res_mur_bm$reseval$isFDR, Demur$sig=="FDR<0.1")
table(res_anc_bm$reseval$isFDR, Deanc$sig=="FDR<0.1")
table(res_rec_bm$reseval$isFDR, Derec$sig=="FDR<0.1")


table(res_tot_bm$reseval$pval<0.05, Detot$sig=="FDR<0.1")
table(res_mur_bm$reseval$pval<0.05, Demur$sig=="FDR<0.1")
table(res_anc_bm$reseval$pval<0.05, Deanc$sig=="FDR<0.1")
table(res_rec_bm$reseval$pval<0.005, Derec$sig=="FDR<0.1")



### Volcano plots of log2FoldChange vs pval

pch=c("FDR<0.1"=19,"Not Sig"=1)

pdf("volcano_FG_pval_rec.pdf")
plot(Derec$log2FoldChange, res_rec_bm$reseval$pval, log="y",col=c("red","black")[Derec$sig], pch=pch[Derec$sig],ylim=rev(range(res_rec_bm$reseval$pval)),xlab="DESeq2 log2_fold_change",ylab="EVE p-value")
abline(h=0.05,v=0)
dev.off()

pdf("volcano_FG_pval_mur.pdf")
plot(Demur$log2FoldChange, res_mur_bm$reseval$pval, log="y",col=c("red","black")[Demur$sig], pch=pch[Demur$sig],ylim=rev(range(res_mur_bm$reseval$pval)),xlab="DESeq2 log2_fold_change",ylab="EVE p-value")
abline(h=0.05,v=0)
dev.off()

pdf("volcano_FG_pval_anc.pdf")
plot(Deanc$log2FoldChange, res_anc_bm$reseval$pval, log="y",col=c("red","black")[Deanc$sig], pch=pch[Deanc$sig],ylim=rev(range(res_anc_bm$reseval$pval)),xlab="DESeq2 log2_fold_change",ylab="EVE p-value")
abline(h=0.05,v=0)
dev.off()

pdf("volcano_FG_pval_tot.pdf")
plot(Detot$log2FoldChange, res_tot_bm$reseval$pval, log="y",col=c("red","black")[Detot$sig], pch=pch[Detot$sig],ylim=rev(range(res_tot_bm$reseval$pval)),xlab="DESeq2 log2_fold_change",ylab="EVE p-value")
abline(h=0.05,v=0)
dev.off()


##### output tables

allmur = cbind(res_mur_bm$reseval[,-11],Demur)
colnames(allmur) = c(paste("EVE_",colnames(res_mur_bm$reseval[,-11]),sep=""))#,paste("DE_",colnames(Demur),sep=""))
write.csv(allmur,"EVE_mur.csv",quote=F)

allanc = cbind(res_anc_bm$reseval[,-11],Deanc)
colnames(allanc) = c(paste("EVE_",colnames(res_anc_bm$reseval[,-11]),sep=""))#,paste("DE_",colnames(Deanc),sep=""))
write.csv(allanc,"EVE_anc.csv",quote=F)

allrec = cbind(res_rec_bm$reseval[,-11],Derec)
colnames(allrec) = c(paste("EVE_",colnames(res_rec_bm$reseval[,-11]),sep=""))#,paste("DE_",colnames(Derec),sep=""))
write.csv(allrec,"EVE_rec.csv",quote=F)

alltot = cbind(res_tot_bm$reseval[,-11],Detot)
colnames(alltot) = c(paste("EVE_",colnames(res_tot_bm$reseval[,-11]),sep=""))#,paste("DE_",colnames(Detot),sep=""))
write.csv(alltot,"EVE_tot.csv",quote=F)



##### Comparison of simulations with eve


testratio1.2_mur = testRatioEve(1.2, res_mur_bm, coldata_mur, name="mur")

testratio1.55_rec = testRatioEve(1.55, res_rec_bm, coldata_rec, name="rec")


