#library(tidyverse)
library(DESeq2)
library(outliers)
library(gplots)
library(ggplot2)
library(ggrepel)
library(ggnewscale)
library(ggsignif)
library(ggpubr)
library(pheatmap)
library(RColorBrewer)
library(reshape2)
library(corrplot)
library(VennDiagram)
library(plyr)
library(dplyr)
library(stringr)
library(cowplot)
library(clusterProfiler)
library(org.Mm.eg.db)
library(topGO)
library(UpSetR)
library(DOSE)
library(Rgraphviz)
library(gridExtra)
library(sva)
#library(pamr)
#library(limma)
library(ape)
library(ade4)
library(kableExtra)

################################
# AVAILABLE DATA TABLE
################################


# Mouse gene information
MusBackground2 <- read.csv('data/extra_files_for_DE/MusBackground2.tsv', header = TRUE, sep = "\t") # Kidney expressed genes
MusAssignIDs <- read.csv('data/extra_files_for_DE/MusAssignIDs.csv', header = TRUE, sep = "\t") # All mouse gene IDs

# Gene lists collected from various studies, grouped by theme.
# All files are tab-separated; `header` is spelled out in full (no partial
# argument matching) and TRUE/FALSE are used instead of the reassignable T/F.

# Kidney diseases
disease_list <- read.csv("data/extra_files_for_DE//Disease_gene_list.csv", sep = "\t", stringsAsFactors = FALSE, header = TRUE) # List of genes involved in kidney-associated diseases (from Park et al. 2018)
omim_list <- read.csv("data/extra_files_for_DE//omim_mouse_MGI_kidney_Ap22.csv", sep = "\t", stringsAsFactors = FALSE, header = FALSE) # OMIM entry search using "kidney" as key word

# Genes responding to dehydration stress
macmanes_list <- read.csv("data/extra_files_for_DE//MacManes_dehydration-genes_pval005-FDR001.csv", sep = "\t", stringsAsFactors = FALSE, header = FALSE) # List of DE genes from dehydration experiment from MacManes
#macmanes_list <- read.csv("data/extra_files_for_DE//MacManes_dehydration-genes.csv", sep="\t",stringsAsFactors=FALSE, h=F) #List of DE genes from dehydration experiment from MacManes

# Kidney gene markers
deconv_ParkCao_Com_list <- read.csv("data/extra_files_for_DE//MarkersListComShort.csv", sep = "\t", stringsAsFactors = FALSE, header = TRUE) # List of genes identified from single-cell studies
list_markers_biblio <- read.csv("data/extra_files_for_DE//List_Markers_all_biblio.csv", sep = "\t", stringsAsFactors = FALSE, header = TRUE) # List of markers associated with specific cell types, from the literature
# TODO: add Park marker list

# Genes identified in convergence analysis
bittner_list <- read.csv("data/extra_files_for_DE/Bittner_convergent_expression_disease.csv", sep = "\t", stringsAsFactors = FALSE, header = FALSE)

genes_VarFromPDB_df <- read.csv("data/extra_files_for_DE//VarFromPDB_gene_list.tsv", sep = "\t", stringsAsFactors = FALSE, header = TRUE)

# Genes identified in convergent analysis at the sequence level
gene_pelican <- read.csv("data/extra_files_for_DE/residuals_scores_2025.tsv", sep = "\t", stringsAsFactors = FALSE, header = TRUE)

# Single-cell (SC) data table
dMusParkMax2 <- read.csv("data/extra_files_for_DE/dMusParkMax2.tsv", sep = "\t", stringsAsFactors = FALSE, header = TRUE)


################################
# Venn Diagram function
################################

# Helper function to display a Venn diagram on the current graphics device.
#
# x   : list of vectors, one per set; passed to venn.diagram().
# ... : further arguments forwarded unchanged to VennDiagram::venn.diagram()
#       (e.g. category.names, fill).
#
# Called for its drawing side effect. Functions are called through their
# namespace so the helper no longer attaches a package from inside its body.
display_venn <- function(x, ...){
  grid::grid.newpage()
  # filename = NULL: build the diagram as an object instead of writing a file,
  # so that it can be drawn on whichever device the caller has opened.
  venn_object <- VennDiagram::venn.diagram(x, filename = NULL, ...)
  grid::grid.draw(venn_object)
}

################################
# Boxplots quality sequencing
################################

# Write a multi-page PDF comparing sequencing quality metrics between the two
# conditions stored in `cond_season`.
#
# coldata           : data frame with (at least) the columns cond_season,
#                     GC_content, Number_of_spots_in_million,
#                     Percent_identified_reads and read_length_class.
# Boxplots_filename : path of the PDF to create.
#
# Pages, in order: for each of the three metrics, one boxplot annotated with
# geom_signif() (labelled "wilcox") and one annotated with
# stat_compare_means(method = "t.test"); then a stacked bar chart of the
# read-length classes per condition.
#
# Returns NULL invisibly.
boxplot_seq_FUN <- function(coldata, Boxplots_filename) {

  pdf(Boxplots_filename, height = 8, width = 7)
  # Close the device even when one of the plots fails.
  on.exit(dev.off(), add = TRUE)

  # Colours are named so that they follow the condition itself rather than the
  # order of the factor levels (arid = orange, mesic = green).
  cond_colours <- c(arid = "#FFA500", mesic = "#5EA336")

  # Fresh annotation layers for each plot
  wilcox_layer <- function() {
    geom_signif(comparisons = list(c("arid", "mesic")),
                map_signif_level = TRUE, textsize = 5, color = "black")
  }
  ttest_layer <- function() {
    stat_compare_means(method = "t.test")
  }

  # Build and print one annotated boxplot.
  # Layer order (boxplot, test annotation, jittered points) is kept on purpose.
  print_metric_boxplot <- function(mapping, y_label, test_layer) {
    p <- ggplot(coldata, mapping) +
      geom_boxplot() +
      scale_fill_manual(values = cond_colours) +
      test_layer +
      ylab(y_label) +
      theme_bw() +
      geom_jitter() +
      theme(text = element_text(size = 30), axis.title.x = element_blank()) +
      theme(legend.position = 'none')
    print(p)
  }

  metrics <- list(
    list(mapping = aes(x = cond_season, y = GC_content, fill = cond_season),
         label = "GC content"),
    list(mapping = aes(x = cond_season, y = Number_of_spots_in_million, fill = cond_season),
         label = "Number of reads (million)"),
    list(mapping = aes(x = cond_season, y = Percent_identified_reads, fill = cond_season),
         label = "Percent of reads identified (known genome mapping)")
  )

  for (metric in metrics) {
    print_metric_boxplot(metric$mapping, paste(metric$label, "- wilcox"), wilcox_layer())
    print_metric_boxplot(metric$mapping, paste(metric$label, "- t-test"), ttest_layer())
  }

  ## Read length (stacked bar): number of samples per condition and class,
  ## shown as proportions (position = "fill").
  samples_for_barplot <- plyr::count(coldata, vars = c("cond_season", "read_length_class"))

  p7 <- ggplot(samples_for_barplot, aes(x = cond_season, y = freq, fill = read_length_class)) +
    geom_bar(position = "fill", stat = "identity") +
    scale_fill_manual(values = c("grey30", "red3", "dodgerblue")) +
    ylab("Average read length") +
    theme_bw()

  print(p7 + theme(text = element_text(size = 30), axis.title.x = element_blank()) + theme(legend.position = 'right'))

  invisible(NULL)
}


################################
# QUALITY FIGS AFTER DESEQ
################################

# Write the general quality-control PDF produced after the DESeq2 run.
#
# Arguments read by the body:
#   resTableRaw            : results table; columns baseMean and pvalue are used.
#   resTableMod            : results as a data frame with log2FoldChange, pvalue
#                            and sig columns; its first 10 rows are labelled on
#                            the volcano plot.
#   rld                    : transformed data; assay(rld) feeds the all-genes
#                            sample distance heatmap.
#   quality_plots_filename : path of the PDF to create.
#   ddsRes                 : fitted DESeq2 object (dispersion plot, colData).
#   resRaw                 : results object given to plotMA().
#   coldata                : sample table; row names are sample names and
#                            cond_season holds the condition.
#   var1_design, var2_design : colData column names used to annotate the
#                            pheatmap (var1_design may be NULL).
#   res_season_padj01_onl  : table of DE genes (row names = gene names).
#   ntd                    : transformed data used for the variable-genes heatmap.
#   resBasemean            : matrix of normalized counts (genes x samples).
# Arguments not used in the body (kept so existing calls still work):
#   condition, dds, plots_filename, se, Alltableok1.
#
# Returns NULL invisibly.
qualDE <- function(resTableRaw,resTableMod,condition,rld, quality_plots_filename, dds,ddsRes,resRaw,plots_filename, se,
                   coldata, var1_design, var2_design, Alltableok1, res_season_padj01_onl, ntd, resBasemean ) {

  pdf(file = quality_plots_filename, height = 15, width = 15)
  # Close the device even when one of the plots fails.
  on.exit(dev.off(), add = TRUE)

  ## MA plot with guide lines at log2 fold change = -0.4 and 0.4
  plotMA(resRaw, ylim = c(-10, 10))
  abline(h = c(-.4, .4), col = "dodgerblue", lwd = 2)

  ## Volcano plot; head() avoids NA rows when the table has fewer than 10 rows
  top_hits <- head(resTableMod, 10)
  volcano <- ggplot2::ggplot(resTableMod, ggplot2::aes(log2FoldChange, -log10(pvalue))) +
    ggplot2::geom_point(ggplot2::aes(col = sig)) +
    ggplot2::scale_color_manual(values = c("red", "black")) +
    ggplot2::ggtitle("Volcano Plot of DESeq2 analysis") +
    ggrepel::geom_text_repel(data = top_hits, ggplot2::aes(label = rownames(top_hits)))
  print(volcano)

  ## Dispersion plot
  plotDispEsts(ddsRes)

  ## p-value against mean normalized count
  plot(resTableRaw$baseMean + 1, -log10(resTableRaw$pvalue),
       log = "x", xlab = "mean of normalized counts",
       ylab = expression(-log[10](pvalue)),
       ylim = c(0, 30),
       cex = .4, col = rgb(0, 0, 0, .3))

  ## Stacked p-value histogram, split on baseMean > 10
  use <- resTableRaw$baseMean > 10
  h1 <- hist(resTableRaw$pvalue[!use], breaks = 0:50/50, plot = FALSE)
  h2 <- hist(resTableRaw$pvalue[use], breaks = 0:50/50, plot = FALSE)
  colori <- c(`do not pass` = "khaki", `pass` = "powderblue")
  barplot(height = rbind(h1$counts, h2$counts), beside = FALSE,
          col = colori, space = 0, main = "", ylab = "frequency")
  text(x = c(0, length(h1$counts)), y = 0, label = paste(c(0, 1)),
       adj = c(0.5, 1.7), xpd = NA)
  legend("topright", fill = rev(colori), legend = rev(names(colori)))

  # HEATMAP FIGURES

  blues <- colorRampPalette(rev(brewer.pal(9, "Blues")))(255)

  # Row label colours: orange for arid samples, green for all the others.
  condition_row_colours <- function(dist_matrix) {
    arid_samples <- row.names(coldata)[coldata$cond_season == "arid"]
    # NOTE(review): sample() only permutes the names, which has no effect on
    # the %in% test below; it is kept so the random number stream seen by
    # later steps is unchanged.
    sp_xeric <- sample(arid_samples, length(arid_samples))
    cols <- rep('#5EA336', nrow(dist_matrix))
    cols[row.names(dist_matrix) %in% sp_xeric] <- '#FFA500'
    cols
  }

  # Heatmap of Euclidean distances between samples (columns of expr_matrix).
  plot_sample_distances <- function(expr_matrix, main_title) {
    dist_matrix <- as.matrix(dist(t(expr_matrix)))
    heatmap.2(dist_matrix, trace = "none", col = blues,
              colRow = condition_row_colours(dist_matrix), main = main_title)
  }

  # All genes, all individuals
  plot_sample_distances(assay(rld), "Corr matrix with all genes")

  # DE genes only, all individuals; drop = FALSE keeps a matrix when a single
  # gene is differentially expressed
  DEresBasemean <- resBasemean[rownames(resBasemean) %in% rownames(res_season_padj01_onl), , drop = FALSE]
  plot_sample_distances(DEresBasemean, "Correlation with DE genes only")

  # Heatmap of log10 expression for DE genes, rows ordered by log2 fold change
  reslog <- log10(resBasemean)
  DEreslog <- merge(x = reslog, y = res_season_padj01_onl, by = 0, all = FALSE)
  DEreslog <- DEreslog[order(DEreslog$log2FoldChange), ]
  DEreslog <- subset(DEreslog, select = -c(baseMean,log2FoldChange,lfcSE,stat,pvalue,padj))
  row.names(DEreslog) <- DEreslog$Row.names
  DEreslogOK_mat <- as.matrix(DEreslog[-1])   # drop the Row.names column
  DEreslogOK_mat[!is.finite(DEreslogOK_mat)] <- 0   # log10(0) = -Inf
  heatmap(DEreslogOK_mat, Rowv = NA, scale = "row", main = "Heatmap DE genes only")

  # Column annotation for the pheatmap
  annotation_vars <- if (!is.null(var1_design)) c(var2_design, var1_design) else var2_design
  df <- as.data.frame(colData(ddsRes))[, annotation_vars, drop = FALSE]

  # Most variable genes (at most 50); head() avoids NA indices when fewer
  # than 50 genes are available
  selectvar <- head(order(rowVars(resBasemean), decreasing = TRUE), 50)
  p_var <- pheatmap(assay(ntd)[selectvar, ], cluster_rows = FALSE, show_rownames = TRUE,
                    cluster_cols = FALSE, annotation_col = df, main = "Variable genes")

  print(p_var)

  print("heatmaps done")

  invisible(NULL)
}


################################
# PCA Function colored by different factors for quality figure
################################

# Plot two principal components coloured by several sample annotations.
#
# pca         : object with $sdev and $x (as returned by prcomp()).
# pcs_to_plot : the two components to plot (indices into pca$x columns).
# intgroup.df : sample annotation, one row per row of pca$x, with columns
#               cond_season, species, sp_short_name, Batch_number,
#               read_length_class, Source_internal_or_external, RIN_class and,
#               when var1_design is not NULL, the column it names.
# var1_design : name of the batch-correction grouping column, or NULL.
# output_dir  : output prefix; file names are pasted directly onto it.
#
# Writes "<output_dir>PCA_colored_PC<a><b>.pdf" (11 pages: species, condition,
# correction group, source with/without labels, sequencing batch with/without
# labels, read length with/without labels, RIN class with/without labels) and
# "<output_dir>PCA_colored_PC<a><b>_fig_main.pdf" (condition plot, 12 x 10 cm).
#
# Returns NULL invisibly.
PCA_colored_FUN <- function(pca, pcs_to_plot, intgroup.df, var1_design, output_dir){

  # Contribution of each component to the total variance
  percentVar <- pca$sdev^2 / sum( pca$sdev^2 )

  d5 <- data.frame(PCa=pca$x[,pcs_to_plot[1]], PCb=pca$x[,pcs_to_plot[2]], group="cond_season", intgroup.df)

  pc_for_title <- as.character(paste0(pcs_to_plot[1], pcs_to_plot[2]))

  axis_label <- function(pc) {
    paste0("PC", pc, ": ", round(percentVar[pc] * 100), "% variance")
  }
  x_label <- axis_label(pcs_to_plot[1])
  y_label <- axis_label(pcs_to_plot[2])

  # Condition colours are named so that they follow the condition itself and
  # not the order of the factor levels (arid = orange, mesic = green).
  cond_colours <- c(arid = "#FFA500", mesic = "#5EA336")
  source_colours <- c("black", "red3")
  batch_colours <- c("dodgerblue2", "#E31A1C", # red
                     "green4",
                     "#6A3D9A", # purple
                     "#FF7F00", # orange
                     "black", "gold1",
                     "tomato4", "#FB9A99", # lt pink
                     "palegreen2",
                     "#CAB2D6", # lt purple
                     "#FDBF6F", # lt orange
                     "gray70", "khaki2", "maroon", "orchid1", "deeppink1")
  read_length_colours <- c("dodgerblue2", "#E31A1C", # red
                           "black", "gold1",
                           "palegreen2",
                           "#CAB2D6", # lt purple
                           "#FDBF6F", # lt orange
                           "gray70", "khaki2", "maroon", "orchid1", "deeppink1")
  rin_colours <- c("darkseagreen", "khaki2", "maroon", "orchid1", "deeppink1")

  # Species labels; sp_short_name is a column of d5 (it comes from intgroup.df)
  species_labels <- function(...) {
    geom_text_repel(aes(label = sp_short_name), ...)
  }

  # Common recipe for the pages of the multi-page PDF.
  # Layer order: points, optional ellipse, optional species labels.
  pca_plot <- function(mapping, point_size = 3, palette = NULL, labelled = TRUE,
                       label_size = 3, ellipse = FALSE, title = NULL,
                       legend_position = 'top', legend_title = NULL) {
    p <- ggplot(data = d5, mapping) + geom_point(size = point_size)
    if (ellipse) p <- p + stat_ellipse(type = "norm", level = 0.8)
    if (labelled) p <- p + species_labels(size = label_size)
    if (!is.null(palette)) p <- p + scale_color_manual(values = palette)
    if (!is.null(title)) p <- p + ggtitle(title)
    if (!is.null(legend_title)) p <- p + labs(colour = legend_title)
    p +
      xlab(x_label) +
      ylab(y_label) +
      theme_bw() +
      theme(legend.position = legend_position) +
      coord_fixed()
  }

  pdf(file = paste0(output_dir,"PCA_colored_PC",pc_for_title,".pdf"), width = 10, height = 8, bg = "white")
  # Close this device (and not whichever one happens to be current) on exit,
  # including when a plot fails.
  pdf_device <- dev.cur()
  on.exit(dev.off(pdf_device), add = TRUE)

  # Coloured by species, shape by condition
  print(pca_plot(aes(x = PCa, y = PCb, shape = cond_season, colour = species),
                 title = "colored by species", legend_position = 'none'))

  # Coloured by condition (mesic - arid): compact version saved on its own
  condition_mapping <- aes(x = PCa, y = PCb, colour = cond_season)

  plot2_tmp_main <- ggplot(data = d5, condition_mapping) +
    geom_point() +
    scale_color_manual(values = cond_colours) +
    stat_ellipse(type = "norm", level = 0.8) +
    xlab(x_label) +
    ylab(y_label) +
    ggtitle("colored by condition") +
    theme_bw() +
    theme(legend.position='top') +
    species_labels(max.overlaps = Inf, size = 2.5, min.segment.length = 0.1)

  ggsave(plot2_tmp_main,filename=paste0(output_dir,"PCA_colored_PC",pc_for_title,"_fig_main.pdf"),units="cm",width=12,height=10)

  # Coloured by condition: page of the multi-page PDF
  print(pca_plot(condition_mapping, palette = cond_colours, ellipse = TRUE,
                 label_size = 5.5, title = "colored by condition"))

  # Coloured by batch-correction family group (no colour when var1_design is NULL)
  family_mapping <- if (is.null(var1_design)) {
    aes(x = PCa, y = PCb)
  } else {
    aes(x = PCa, y = PCb, colour = .data[[var1_design]])
  }
  print(pca_plot(family_mapping, title = "colored by correction family group",
                 legend_title = var1_design))

  # Coloured by internal vs external source, with then without labels
  source_mapping <- aes(x = PCa, y = PCb, colour = Source_internal_or_external)
  print(pca_plot(source_mapping, palette = source_colours))
  print(pca_plot(source_mapping, palette = source_colours, labelled = FALSE))

  # Coloured by sequencing batch, with then without labels
  batch_mapping <- aes(x = PCa, y = PCb, colour = Batch_number)
  print(pca_plot(batch_mapping, palette = batch_colours))
  print(pca_plot(batch_mapping, palette = batch_colours, labelled = FALSE, point_size = 5))

  # Coloured by read length class, with then without labels
  read_length_mapping <- aes(x = PCa, y = PCb, colour = read_length_class)
  print(pca_plot(read_length_mapping, palette = read_length_colours))
  print(pca_plot(read_length_mapping, palette = read_length_colours, labelled = FALSE, point_size = 5))

  # Coloured by RIN class, with then without labels
  rin_mapping <- aes(x = PCa, y = PCb, colour = RIN_class)
  print(pca_plot(rin_mapping, palette = rin_colours))
  print(pca_plot(rin_mapping, palette = rin_colours, labelled = FALSE, point_size = 5))

  invisible(NULL)
}



################################
# CLUSTER PROFILER FOR NETWORK FIGURES
################################

# GO enrichment of the DE genes with clusterProfiler::enrichGO().
#
# results_DE    : table of DE genes; row names are mouse gene symbols.
# Alltableok    : table whose row names (gene symbols) define the background
#                 (universe) for the enrichment.
# table_CP_name : path of the tab-separated file receiving the result table.
#
# Returns the object produced by enrichGO().
GOwithCP <- function(results_DE, Alltableok, table_CP_name) {

  # enrichGO() is run directly on gene symbols (keyType = "SYMBOL"), so the
  # row names are used as they are. The former symbol -> ENTREZID lookups were
  # never used, and their unqualified select() call depended on the order in
  # which dplyr and the annotation packages were attached.
  de_symbols <- row.names(results_DE)
  universe_symbols <- row.names(Alltableok)

  DE_tot_ego <- enrichGO(gene          = de_symbols,
                         universe      = universe_symbols,
                         keyType       = "SYMBOL",
                         OrgDb         = org.Mm.eg.db,
                         ont           = "ALL",
                         pAdjustMethod = "BH",
                         pvalueCutoff  = 0.1,
                         readable      = TRUE)

  # Convert explicitly to a data frame before writing
  write.table(as.data.frame(DE_tot_ego), file = table_CP_name,
              sep = "\t", quote = FALSE)

  return(DE_tot_ego)
}


################################
# Dendrograms expression
################################

# Distance tree based on 1-Spearman distance

computeExpressionDistance <- function (d) {
  # Spearman rank correlation between every pair of columns of `d`, turned
  # into a dissimilarity: 0 for identical rankings, 2 for opposite rankings.
  spearman_rho <- cor(x = d, method = "spearman")
  dissimilarity <- 1 - spearman_rho
  dissimilarity
}

# Build a neighbour-joining tree from expression data (1 - Spearman distance
# between columns), bootstrap it, and save the tree with its support values.
#
# data         : expression matrix with genes in rows and one column per tip
#                of the tree (the caller passes per-species means).
# metaD_sh     : data frame with columns species (matching the column names
#                of `data`) and cond_season ("arid" / "mesic").
# title        : currently unused.
# geneSet      : label inserted in the output file name.
# n_bootstraps : number of bootstrap replicates.
# output_dir   : output prefix; the file name is pasted directly onto it.
#
# Writes "<output_dir>dendro_NJ_Boot1000_<geneSet>.pdf" and returns
# list(tree, support), where support is the result of prop.clades().
buildBootstrappedTree <- function (data, metaD_sh,title="", geneSet, n_bootstraps=1000, output_dir) {

  # Tree from a genes x tips expression matrix
  tree_from_expression <- function(expr) nj(computeExpressionDistance(expr))

  # Reference tree
  trw <- tree_from_expression(data)

  # Colour of each tip, looked up by tip label so that colours stay aligned
  # with the tips whatever the row order of metaD_sh
  tip_condition <- as.character(metaD_sh$cond_season[match(trw$tip.label, metaD_sh$species)])
  condition_colours <- c(arid = "#FFA500", mesic = "#5EA336")
  varCol <- unname(condition_colours[tip_condition])
  varCol[is.na(varCol)] <- "black"   # tips without a known condition

  # Bootstraps. boot.phylo() resamples the COLUMNS of `x`, so the matrix is
  # passed transposed (tips in rows, genes in columns) to resample genes, and
  # transposed back before the distances are computed.
  bootstraps <- boot.phylo(phy = trw, x = t(data),
                           FUN = function(xx) tree_from_expression(t(xx)),
                           B = n_bootstraps, trees = TRUE)
  bstrees <- bootstraps$trees

  ## get proportions of each bipartition:
  boot <- prop.clades(trw, bstrees)

  pdf(file = paste0(output_dir,"dendro_NJ_Boot1000_",geneSet,".pdf"))
  # Close the device even when plotting fails
  on.exit(dev.off(), add = TRUE)

  layout(1)
  par(mar = rep(2, 4))
  plot.phylo(trw, tip.color = varCol)
  drawSupportOnEdges(boot)
  legend("bottomright", legend = c("Bootstrap"), pch = 22,
         pt.bg = c("green"), pt.cex = 2.5)

  list(trw, boot)
}





################################################################################################
# TADAM RUN EXPRESSION ANALYSES
################################################################################################

# Run the whole expression analysis for one dataset.
#
# data_expr is a list with the elements read below:
#   Alltable    : raw count table (genes in rows, samples in columns).
#   coldata     : sample table with, among others, the columns ID, ID_final,
#                 species, sp_short_name, cond_season, Batch_number,
#                 read_length_class, Source_internal_or_external, RIN_class.
#   var1_design : name of the coldata column used as batch for ComBat-seq.
#   var2_design : right-hand side of the DESeq2 design formula.
#   output_dir  : output prefix; file names are pasted directly onto it.
#   pcs_to_plot : the two principal components to plot.
#   dataset     : dataset name; "murinae" skips the batch correction.
#   test        : if TRUE, print the head of the normalized count table.
#
# Steps: count table filtering, sequencing boxplots, batch correction,
# DESeq2, PCA figures, quality plots, GO enrichment, expression dendrograms,
# overlap with literature gene lists and with the Pelican results. The
# literature tables (disease_list, omim_list, genes_VarFromPDB_df,
# macmanes_list, deconv_ParkCao_Com_list, list_markers_biblio, gene_pelican)
# are read from the enclosing environment.
#
# Called for its side effects (tables and figures written under output_dir).
make_exp_analyses_all <- function(data_expr) {

  print("Load data")

  All_table <- data_expr$Alltable
  coldata <- data_expr$coldata
  var1_design <- data_expr$var1_design
  var2_design <- data_expr$var2_design
  output_dir <- data_expr$output_dir
  pcs_to_plot <- data_expr$pcs_to_plot
  dataset <- data_expr$dataset
  test <- data_expr$test

  # Run `draw()` on a JPEG device that is always closed afterwards, and
  # return what `draw()` returned.
  save_jpeg <- function(path, draw) {
    jpeg(path, width = 9, height = 7, units = "in", res = 200)
    on.exit(dev.off(), add = TRUE)
    draw()
  }

  # Count Table preparation and filtering

  print("Count Table preparation")

  All_table <- All_table[, colnames(All_table) %in% coldata$ID, drop = FALSE]

  # Remove mitochondrial genes (Co1, Co2, Cytb, Nd1, Nd2, Nd5, Nd6)
  mt_genes <- grep(pattern = "mt-", rownames(All_table))
  Alltableok <- if (length(mt_genes) > 0) All_table[-mt_genes, , drop = FALSE] else All_table

  # Keep genes without missing values and with a non-zero count in every sample
  Alltableok1 <- round(Alltableok[complete.cases(Alltableok), , drop = FALSE])
  Alltableok1 <- Alltableok1[rowSums(Alltableok1 == 0) == 0, , drop = FALSE]

  # Prepare coldata

  coldata <- coldata[ which(coldata$ID %in% colnames(Alltableok1)), ]
  # Reorder so that coldata rows and count table columns are in the same order
  coldata <- coldata[ order(match(coldata$ID, colnames(Alltableok1))), ]

  coldata$Batch_number <- as.factor(coldata$Batch_number)
  coldata$cond_season <- factor(coldata$cond_season, levels = c("mesic","arid"))

  if (dataset != "murinae") {
    coldata[[var1_design]] <- factor(coldata[[var1_design]])
  }

  rownames(coldata) <- coldata$ID_final

  colnames(Alltableok1) <- coldata$ID_final

  #################################################################################
  ## Generate boxplot with sequencing information

  print("boxplot sequencing figures")

  BF <- paste0(output_dir, "Boxplot_sequencing.pdf")

  boxplot_seq_FUN(coldata = coldata, Boxplots_filename = BF)

  #################################################################################
  ## Batch correction with Combat-Seq

  print("Run correction for all sets except Murinae")

  if (dataset != "murinae") {
    adjusted <- ComBat_seq(as.matrix(Alltableok1), batch=coldata[[var1_design]], group=coldata$cond_season)
  } else {
    adjusted <- Alltableok1
  }

  #################################################################################
  ## DE seq

  print("Run DEseq2")

  design <- as.formula(paste("~ ", var2_design))

  ddsInput <- DESeqDataSetFromMatrix(countData = as.matrix(round(adjusted)),
                                     colData = coldata,
                                     design = design)

  dds <- DESeq(ddsInput)

  rld <- rlogTransformation(dds, blind=TRUE)
  ntd <- normTransform(dds)

  se <- SummarizedExperiment(log2(counts(dds, normalized=TRUE) + 1),
                             colData=colData(dds))
  resBasemean <- counts(dds, normalized=TRUE)
  if (test) {
    print(head(resBasemean))
  }

  write.table(resBasemean, file=paste0(output_dir,"Table_resBasemean.tsv"), quote = FALSE, row.names = TRUE, sep = "\t")

  res <- results(dds,  lfcThreshold=.4, altHypothesis="greaterAbs")
  summary(res)
  res_raw <- results(dds)

  res <- res[order(res$padj),]
  res_df <- as.data.frame(res)

  # Named results_tbl so that it does not shadow DESeq2::results()
  results_tbl <- as.data.frame(dplyr::mutate(as.data.frame(res), sig=ifelse(res$padj<0.1, "FDR<0.1", "Not Sig")),
                               row.names=rownames(res))

  write.table(results_tbl, file=paste0(output_dir,"Table_DE_results_cond_season.tsv"), quote = FALSE, row.names = TRUE, sep = "\t")

  # which() drops genes whose padj is NA; plain logical indexing would keep
  # them as all-NA rows named "NA", "NA.1", ... in the DE gene list
  res_season_padj01_onl <- res_df[which(res_df$padj < 0.1), ]
  print("head(res_season_padj01_onl)")
  print(head(res_season_padj01_onl))
  write.table(res_season_padj01_onl, file=paste0(output_dir,"Table_DE_results_cond_season_thres01.tsv"), quote = FALSE, row.names = TRUE, sep = "\t")

  # TODO: add plots of gene counts from resBasemean for the top 10 abs(LFC)

  print("DE done")

  #################################################################################
  ## PCA after correction

  print("PCA figures by colors")

  log_norm_counts <- log2(counts(dds, normalized=TRUE) + 1)
  pca <- prcomp(t(log_norm_counts), scale. = TRUE)

  intgroup.df <- as.data.frame(coldata[, c("cond_season","species", "sp_short_name", var1_design, "ID_final",
                                           "Batch_number", "read_length_class",
                                           "Source_internal_or_external", "RIN_class"), drop=FALSE])

  PCA_colored_FUN(pca, pcs_to_plot, intgroup.df, var1_design, output_dir)

  print("PCA done")

  #################################################################################
  ## Quality plots general

  print("Quality")

  quality_plots_filename <- paste0(output_dir,"quality_plots.pdf")

  title_fig <- paste(var2_design, output_dir)

  qualDE(resTableRaw=res,resTableMod=results_tbl,condition=title_fig,rld=rld,ddsRes=dds,
         resRaw=res_raw, quality_plots_filename = quality_plots_filename, se = se, coldata = coldata,
         var1_design = var1_design, var2_design = var2_design, Alltableok1 = Alltableok1,
         resBasemean = resBasemean, res_season_padj01_onl = res_season_padj01_onl, ntd = ntd)

  #################################################################################
  ## RUN ClusterProfiler

  print("Analyse Cluster Profiler")

  CP_name <- paste0(output_dir,"Table_ClusterProfiler_BH_allGO.tsv")

  # The background is the filtered count table (genes expressed in all individuals)
  GO_fam_season <- GOwithCP(results_DE=res_season_padj01_onl,
                            Alltableok = Alltableok1,
                            table_CP_name = CP_name)

  print("Cluster Pro done")

  #################################################################################
  ## RUN dendrograms based on expression

  print("Dendrograms")

  DE.list <- row.names(res_season_padj01_onl)

  # Mean expression per species: sum the samples of each species, then divide
  # by the number of samples of that species
  groups <- data.frame(ID = coldata$ID_final,
                       sp = coldata$species)

  new.df.mean <- t(rowsum(t(resBasemean), group = groups$sp))

  # Group sizes are looked up by name so that each column is divided by the
  # size of its own species
  group_sizes <- table(groups$sp)
  new.df.mean <- sweep(new.df.mean, 2, as.vector(group_sizes[colnames(new.df.mean)]), "/")

  # Mean expression per species, DE genes only
  new.df.mean.de <- new.df.mean[row.names(new.df.mean) %in% DE.list, , drop = FALSE]

  # Condition of each species (first sample of each species)
  coldata_short <- coldata[,c("species", "cond_season")]
  coldata_short2_df <- as.data.frame(coldata_short[!duplicated(coldata_short$species), ])

  # Using all genes
  tree_bs_all <- buildBootstrappedTree(new.df.mean, coldata_short2_df, "All genes, expression data",
                                       "allGenes", 1000, output_dir)

  # Using DE genes only
  tree_bs_de <- buildBootstrappedTree(new.df.mean.de, coldata_short2_df, "DE genes only, expression data",
                                      "DEGenes", 1000, output_dir)

  print("dendrograms done")

  #################################################################################
  ## Cross-reference with literature gene lists

  print("Croisement biblio")

  # Tables being crossed:
  # disease_list: genes involved in kidney-associated diseases (from Park et al. 2018)
  # omim_list: genes extracted from the OMIM database with the "kidney" keyword in mouse
  # macmanes_list: genes identified in dehydration analyses (MacManes 2017 paper)
  # deconv_ParkCao_Com_list: genes identified from single-cell studies (cell-type specific genes)
  # list_markers_biblio: genes from various references, more or less specific to a kidney cell type
  # genes_VarFromPDB_df: genes extracted with the VarFromPDB package using the "kidney" keyword

  # Kidney-associated diseases or disorders: Park disease list, PDB and OMIM.
  # The assignments below only modify local copies of the tables.
  disease_list <- disease_list$MGI
  disease_list <- as.data.frame(disease_list, stringsAsFactors=FALSE)
  disease_list$diseaseList <- 1
  colnames(disease_list) <- c("MGI","diseaseList")

  omim_list$omim <- 1
  colnames(omim_list) <- c("MGI","omim")

  genes_VarFromPDB_df$PDB <- 1
  colnames(genes_VarFromPDB_df) <- c("MGI","PDB")

  DISorders0 <- merge(disease_list, omim_list, by = c("MGI"), all = TRUE)
  DISorders <- merge(DISorders0, genes_VarFromPDB_df, by = c("MGI"), all = TRUE)
  DISorders$disorders <- 1
  DISorders <- subset(DISorders, select = c(MGI, disorders))

  # Genes associated with the response to water stress
  macmanes_list$dehydration_response <- 1
  colnames(macmanes_list) <- c("MGI","dehydration_response")

  # Genes expressed in kidney (more or less cell-type specific)
  deconv_ParkCao_Com_list <- deconv_ParkCao_Com_list[-2]
  deconv_ParkCao_Com_list$SC <- 1
  colnames(deconv_ParkCao_Com_list) <- c("MGI","SC")

  list_markers_biblio$markerBiblio <- 1
  colnames(list_markers_biblio) <- c("MGI","markerBiblio")

  kidney_markers <- merge(deconv_ParkCao_Com_list, list_markers_biblio, by = c("MGI"), all = TRUE)
  kidney_markers$kidney_markers <- 1
  kidney_markers <- subset(kidney_markers, select = c(MGI, kidney_markers))

  print("prep table ok")

  # Merge all tables
  CompTable1 <- merge(x=DISorders,y=macmanes_list,by=c("MGI"), all = TRUE)
  CompTable2 <- merge(x=CompTable1,y=kidney_markers,by=c("MGI"), all = TRUE)

  # Some genes appear on several rows (e.g. Umod); keep the first row of each
  uCom <- unique(CompTable2$MGI)
  CompTableList <- CompTable2[match(uCom, CompTable2$MGI),]

  write.table(CompTableList, file=paste0(output_dir,"Table_biblio_disease_SC_gene_list.tsv"), quote = FALSE, row.names = TRUE, sep = "\t")

  print("prep compTableList ok")

  print("Literature info Venn Diagram")

  venn_sets <- list(DISorders = DISorders$MGI, macmanes_list = macmanes_list$MGI, kidney_markers = kidney_markers$MGI,
                    res_season_padj01_onl = row.names(res_season_padj01_onl))

  # venn() both draws the diagram and returns the table of counts, so one call
  # on the JPEG device is enough (a second call would draw on the default device)
  v.table <- save_jpeg(paste0(output_dir,"plot_biblio_VennDiagram.jpeg"),
                       function() venn(venn_sets))
  write.table(v.table, file = paste0(output_dir,"Table_vennDiagram_intersect.txt"), sep = "\t", quote = FALSE)
  print(v.table)

  save_jpeg(paste0(output_dir,"plot_biblio_VennDiagram2.jpeg"), function() {
    display_venn(
      venn_sets,
      category.names = c("Disorders" , "Dehydration " , "Kidney markers", "DE"),
      fill = c("#999999", "#E69F00", "#56B4E9", "#E64B35B2")
    )
  })

  print("Croisement Pelican results")

  # Gene name = part of the alignment name before the first underscore
  gene_pelican <- gene_pelican |>
    mutate(gene_name = str_extract(alignment, "^[^_]+"))

  # n_top rows with the highest and n_top rows with the lowest gtf_pval_residual
  n_top <- 1000
  res.pel <- gene_pelican |>
    arrange(desc(gtf_pval_residual)) |>
    slice_head(n = n_top) |>
    bind_rows(
      gene_pelican |>
        arrange(gtf_pval_residual) |>
        slice_head(n = n_top)
    )

  df_DE_pel <- as.data.frame(intersect(row.names(res_season_padj01_onl), res.pel$gene_name))

  write.table(df_DE_pel, file=paste0(output_dir,"Table_intersect_DE_Pelican_top1000sites.tsv"), quote = FALSE, row.names = TRUE, sep = "\t")

  print("THE END")
}
 

