library(tidyverse)
library(DESeq2)
library(outliers)
library(ggplot2)
library(pheatmap)
library(RColorBrewer)
library(gplots)
library(ggrepel)
library(reshape2)
library(corrplot)
library(VennDiagram)
library(plyr)
library(dplyr)
library(stringr)
library(cowplot)
library(parallel)
library(clusterProfiler)
library(org.Mm.eg.db)
library(topGO)
library(UpSetR)
library(DOSE)
library(Rgraphviz)
library(gridExtra)
library(ade4)
#library(kableExtra)
library(sva)
library(pamr)
library(limma)

################################
# AVAILABLE DATA TABLE
################################

# 
# #Mouse gene information
# MusBackground2 <- read.csv('data/extra_files_for_DE/MusBackground2.tsv', h=T, sep = "\t") #Kidney expressed genes
# MusAssignIDs <- read.csv('data/extra_files_for_DE/MusAssignIDs.csv', h=T, sep = "\t") #All mouse gene IDs
# 
# #List from various studies
# disease_list <- read.csv("data/extra_files_for_DE//Disease_gene_list.csv", sep="\t",stringsAsFactors=FALSE, h=T) #List of genes involved in kidney-associated diseases (from Park et al. 2018)
# omim_list <- read.csv("data/extra_files_for_DE//omim_mouse_MGI_kidney.csv", sep="\t",stringsAsFactors=FALSE, h=F) #omim entry search using "kidney" as key word
# macmanes_list <- read.csv("data/extra_files_for_DE//MacManes_dehydration-genes_pval005-FDR001.csv", sep="\t",stringsAsFactors=FALSE, h=F) #List of DE genes from dehydration experiment from MacManes
# #macmanes_list <- read.csv("data/extra_files_for_DE//MacManes_dehydration-genes.csv", sep="\t",stringsAsFactors=FALSE, h=F) #List of DE genes from dehydration experiment from MacManes
# deconv_ParkCao_Com_list <- read.csv("data/extra_files_for_DE//MarkersListComShort.csv", sep="\t",stringsAsFactors=FALSE, h=T) #List of genes identified from single cells studies
# list_markers_biblio <- read.csv("data/extra_files_for_DE//List_Markers_all_biblio.csv", sep="\t",stringsAsFactors=FALSE, h=T) #List of marker associated to specific cell types, from biblio
# genes_VarFromPDB_df <- read.csv("data/extra_files_for_DE//VarFromPDB_gene_list.tsv", sep="\t",stringsAsFactors=FALSE, h=T)
# gene_seq_conv_intersection <- read.csv("data/extra_files_for_DE/table_gene_conv_summary_false_t.csv", sep=",",stringsAsFactors=FALSE, h=T)
# gene_seq_conv_pcoc <- read.csv("data/extra_files_for_DE/table_gene_conv_pcoc_0.999.csv", sep=",",stringsAsFactors=FALSE, h=T)
# gene_seq_conv_tdg09 <- read.csv("data/extra_files_for_DE/table_gene_conv_tdg09_0.999.csv", sep="\t",stringsAsFactors=FALSE, h=T)
# 
# dMusParkMax2 <- read.csv("data/extra_files_for_DE//dMusParkMax2.tsv", sep="\t",stringsAsFactors=FALSE, h=T) #Get SC data table

################################
# VENN DIAGRAM
################################
# Draw a Venn diagram for one to four sets (single, pairwise, triple or quad).
#
# a:   vector identifying the sets to compare; its length selects the diagram
#      type. Every area / intersection size is obtained by calling
#      clusterNumber() on the matching subset of `a`.
# ...: extra arguments forwarded to the VennDiagram draw.*.venn() function.
#
# Returns the value of the draw.*.venn() call, or the string "Oops" when
# length(a) is not between 1 and 4. A new grid page is started in every case.
#
# NOTE: clusterNumber() is not defined in this part of the file; it must exist
# before plotCondition() is called.
plotCondition <- function(a, ...) {
  grid.newpage()

  # Size of the intersection of the sets found at positions `idx` of `a`.
  n <- function(idx) clusterNumber(a[idx])

  # Local fallback value. The previous `exists("out")` test also looked in the
  # enclosing environments, so an unrelated global `out` could be returned.
  out <- "Oops"
  n_sets <- length(a)

  if (n_sets == 1) {
    out <- draw.single.venn(clusterNumber(a), ...)
  } else if (n_sets == 2) {
    out <- draw.pairwise.venn(n(1), n(2), n(1:2), ...)
  } else if (n_sets == 3) {
    # Intersection order expected by draw.triple.venn: n12, n23, n13, n123.
    out <- draw.triple.venn(n(1), n(2), n(3),
                            n(1:2), n(2:3), n(c(1, 3)),
                            clusterNumber(a), ...)
  } else if (n_sets == 4) {
    # Order expected by draw.quad.venn: n12, n13, n14, n23, n24, n34,
    # n123, n124, n134, n234, n1234.
    out <- draw.quad.venn(n(1), n(2), n(3), n(4),
                          n(1:2), n(c(1, 3)), n(c(1, 4)),
                          n(2:3), n(c(2, 4)), n(3:4),
                          n(1:3), n(c(1, 2, 4)), n(c(1, 3, 4)), n(2:4),
                          clusterNumber(a), ...)
  }

  return(out)
}

# WARNING: plotCondition() calls a helper function "clusterNumber", which must be defined before plotCondition() is called.


################################
# PCA FUNCTION DEFINITION THIBAULT
################################

# PCA scatter plot on an arbitrary pair of principal components.
#
# object:     object providing assay() and colData() (the transformed counts).
# intgroup:   name(s) of colData columns used to build the colour grouping.
# ntop:       number of most variable rows kept for the PCA.
# returnData: when TRUE, return list(data, plot); the data carries a
#             "percentVar" attribute. Otherwise nothing visible is returned.
# pcs:        the two component indices to display (x axis, y axis).
# scale:      forwarded to prcomp() as `scale.`.
#
# The plot is always printed on the current graphics device.
plotPCA.mystyle <- function(object, intgroup, ntop, returnData=FALSE, pcs = c(1,2),scale)
{
  # Exactly two components are needed: one per axis.
  stopifnot(length(pcs) == 2)

  # Rank the rows by variance and keep the `ntop` most variable ones.
  expr_mat <- assay(object)
  row_variance <- rowVars(expr_mat)
  n_kept <- min(ntop, length(row_variance))
  top_rows <- order(row_variance, decreasing = TRUE)[seq_len(n_kept)]

  # PCA with samples as observations.
  pca_fit <- prcomp(t(expr_mat[top_rows, ]), scale. = scale)

  # Share of the total variance carried by each component.
  component_var <- pca_fit$sdev^2
  var_share <- component_var / sum(component_var)

  sample_info <- colData(object)
  if (!all(intgroup %in% names(sample_info))) {
    stop("the argument 'intgroup' should specify columns of colData(dds)")
  }
  intgroup_df <- as.data.frame(sample_info[, intgroup, drop = FALSE])

  # Several grouping columns are collapsed into a single combined factor.
  if (length(intgroup) > 1) {
    grouping <- factor(apply(intgroup_df, 1, paste, collapse = " : "))
  } else {
    grouping <- sample_info[[intgroup]]
  }

  # Plot data: the two requested components plus the sample annotation.
  scores <- pca_fit$x
  pca_df <- data.frame(PCa = scores[, pcs[1]],
                       PCb = scores[, pcs[2]],
                       group = grouping,
                       intgroup_df,
                       name = colnames(object))

  # Axis title for component `k`, with its rounded variance share.
  axis_title <- function(k) {
    paste0("PC", k, ": ", round(var_share[k] * 100), "% variance")
  }

  pca_plot <- ggplot(data = pca_df,
                     aes_string(x = "PCa", y = "PCb", color = "group")) +
    geom_point(size = 3) +
    xlab(axis_title(pcs[1])) +
    ylab(axis_title(pcs[2])) +
    coord_fixed() +
    geom_text(aes_string(x = "PCa", y = "PCb", label = "name"),
              color = "black") +
    theme_bw()
  print(pca_plot)

  if (!returnData) {
    return(invisible(NULL))
  }
  attr(pca_df, "percentVar") <- var_share
  list(pca_df, pca_plot)
}


########################################
# BCA FUNCTION MARIE Between analyses
########################################

# Between-class analyses (ade4::bca) on a PCA of the samples.
#
# object:      object providing assay() and colData().
# var1_design: colData column used as class factor for the first analysis
#              (printed as "phylogenetic group" in the original comments).
# var2_design: colData column used as class factor for the second analysis
#              (printed as "environment" in the original comments).
# ntop, returnData, pcs: kept for signature compatibility; apart from the
#              length check on `pcs` they are not used.
# scale:       forwarded to dudi.pca().
#
# A third analysis always uses the colData column "study_nb".
# The `ratio` element of each bca() result is printed in the order
# var2, var1, study; the last printed ratio is returned invisibly.
plotPCA.mystyle2 <-  function(object, var1_design, var2_design, ntop, returnData=FALSE, pcs = c(1,2),scale=TRUE)
{
  stopifnot(length(pcs) == 2)

  sample_info <- colData(object)

  # Validate the requested columns before running the (costly) PCA.
  intgroup <- c(var1_design, var2_design)
  if (!all(intgroup %in% names(sample_info))) {
    stop("the argument 'intgroup' should specify columns of colData(dds)")
  }
  if (!"study_nb" %in% names(sample_info)) {
    stop("colData(object) must contain a 'study_nb' column", call. = FALSE)
  }

  # PCA on samples (rows) x genes (columns), keeping as many axes as samples.
  pca <- dudi.pca(t(assay(object)), scale = scale,
                  nf = ncol(assay(object)), scannf = FALSE)

  # Between-class analyses; `scannf` is spelled out instead of relying on
  # partial matching of `scan`.
  bca_var1 <- bca(pca, fac = factor(sample_info[, var1_design]),
                  scannf = FALSE, nf = 8)
  bca_var2 <- bca(pca, fac = factor(sample_info[, var2_design]),
                  scannf = FALSE, nf = 1)
  bca_var3 <- bca(pca, fac = factor(sample_info[, "study_nb"]),
                  scannf = FALSE, nf = 4)

  # ratio for var2_design (environment)
  print(bca_var2$ratio)
  # ratio for var1_design (phylogenetic group)
  print(bca_var1$ratio)
  # ratio for study origin
  print(bca_var3$ratio)
}




################################
# QUALITY FIGS AFTER DESEQ
################################

# Write the quality-control figures of a DESeq2 analysis into one PDF.
#
# resTableRaw:   result table with `baseMean` and `pvalue` columns.
# resTableMod:   result table with `log2FoldChange`, `pvalue` and `sig`
#                columns; its first 10 rows are labelled on the volcano plot.
# rld:           transformed data; assay(rld) feeds the sample distances and
#                plotPCA.mystyle().
# quality_plots_filename: path of the PDF to create.
# ddsRes:        fitted object used by plotDispEsts() and colData().
# resRaw:        object passed to plotMA().
# se:            object passed to DESeqTransform(); the part of each column
#                name before the first "_" is used to connect samples
#                (presumably the species name - TODO confirm).
# coldata:       sample table; row names are used as labels and the
#                `cond_season` column marks "arid" samples.
# var1_design:   colData column name, or NULL when the design has one factor.
# var2_design:   colData column name.
# Alltableok1:   only nrow() is used (number of genes for the "total" PCA).
# res_season_padj01_onl: table of DE genes (row names = genes) with the
#                DESeq2 result columns.
# ntd:           transformed data whose assay() is shown in the heatmaps.
# resBasemean:   matrix of per-sample counts (genes in rows).
# condition, dds, plots_filename: unused, kept for signature compatibility.
#
# Returns the value of dev.off() for the PDF device.
qualDE <- function(resTableRaw,resTableMod,condition,rld, quality_plots_filename, dds,ddsRes,resRaw,plots_filename, se,
                   coldata, var1_design, var2_design, Alltableok1, res_season_padj01_onl, ntd, resBasemean ) {
  pdf(quality_plots_filename,height = 15, width = 15)
  # Close the PDF device even when a plotting step fails, so that an error
  # does not leave an open device / locked file behind.
  pdf_device <- dev.cur()
  on.exit(if (pdf_device %in% dev.list()) dev.off(pdf_device), add = TRUE)

  ## MA plot, with guide lines at +/- 0.4
  plotMA(resRaw, ylim = c(-10, 10))
  abline(h = c(-.4, .4), col = "dodgerblue", lwd = 2)

  ## Volcano plot; head() avoids all-NA rows when the table has < 10 rows
  top_hits <- head(resTableMod, 10)
  volcano <- ggplot2::ggplot(resTableMod, ggplot2::aes(log2FoldChange, -log10(pvalue))) +
    ggplot2::geom_point(ggplot2::aes(col = sig)) +
    ggplot2::scale_color_manual(values = c("red", "black")) +
    ggplot2::ggtitle("Volcano Plot of DESeq2 analysis") +
    ggrepel::geom_text_repel(data = top_hits, ggplot2::aes(label = rownames(top_hits)))
  print(volcano)

  ## Dispersion plot
  plotDispEsts(ddsRes)

  ## p-value against mean expression
  plot(resTableRaw$baseMean+1, -log10(resTableRaw$pvalue),
       log="x", xlab="mean of normalized counts",
       ylab=expression(-log[10](pvalue)),
       ylim=c(0,30),
       cex=.4, col=rgb(0,0,0,.3))

  ## Stacked p-value histogram: genes with baseMean <= 10 vs > 10
  use <- resTableRaw$baseMean > 10
  h1 <- hist(resTableRaw$pvalue[!use], breaks=0:50/50, plot=FALSE)
  h2 <- hist(resTableRaw$pvalue[use], breaks=0:50/50, plot=FALSE)
  colori <- c(`do not pass`="khaki", `pass`="powderblue")
  barplot(height = rbind(h1$counts, h2$counts), beside = FALSE,
          col = colori, space = 0, main = "", ylab="frequency")
  text(x = c(0, length(h1$counts)), y = 0, label = paste(c(0,1)),
       adj = c(0.5,1.7), xpd=NA)
  legend("topright", fill=rev(colori), legend=rev(names(colori)))

  ## PCA on PC1/PC2
  # Prefix of each sample name, used to connect samples with a line.
  sp_onl <- vapply(strsplit(as.character(colnames(se)), "_"), `[[`, character(1), 1)

  # c() drops a NULL var1_design, so this covers both design shapes.
  pca_groups <- c(var2_design, var1_design)
  shape_col <- if (is.null(var1_design)) var2_design[1] else var1_design[1]
  transformed <- DESeqTransform(se)

  # One PCA figure. Both figures share this code so that the shape mapping
  # cannot diverge again: the second plot used `shape = shape_col`, which maps
  # the constant string instead of the column it names.
  draw_group_pca <- function(ntop, shape_values, title) {
    pca_data <- plotPCA(transformed, intgroup = pca_groups, ntop = ntop, returnData = TRUE)
    percent_var <- round(100 * attr(pca_data, "percentVar"))
    pca_plot <- ggplot(pca_data, aes(PC1, PC2, color=NULL, shape=.data[[shape_col]])) +
      geom_point(size=5) +
      scale_shape_manual(values=shape_values) +
      xlab(paste0("PC1: ",percent_var[1],"% variance")) +
      ylab(paste0("PC2: ",percent_var[2],"% variance")) +
      geom_text_repel(aes(label=rownames(coldata)),hjust=.5, vjust=-.8, size=5) +
      geom_line(aes(colour=sp_onl), size=1)+
      ggtitle(title) +
      coord_fixed()
    print(pca_plot)
  }

  draw_group_pca(500, 1:9, "PCA n500")
  draw_group_pca(nrow(Alltableok1), 1:10, "PCA nTotal")

  ## Higher components; plotPCA.mystyle() prints the plot itself
  PCA34 <- plotPCA.mystyle(rld, intgroup= c(var2_design), ntop = nrow(Alltableok1), returnData=FALSE, pcs = c(3,4),scale=TRUE)
  print(PCA34)

  PCA56 <- plotPCA.mystyle(rld, intgroup= c(var2_design), ntop = nrow(Alltableok1), returnData=FALSE, pcs = c(5,6),scale=TRUE)
  print(PCA56)
  print("PCA done")

  ## Heatmaps
  colours <- colorRampPalette( rev(brewer.pal(9, "Blues")) )(255)

  # Sample-to-sample distances, all genes
  sampleDistMatrix <- as.matrix(dist(t(assay(rld))))
  heatmap.2( sampleDistMatrix, trace="none", col=colours, main = "Corr matrix with all genes")

  # Sample-to-sample distances, DE genes only
  DEresBasemean <- resBasemean[rownames(resBasemean) %in% rownames(res_season_padj01_onl),]
  sampleDistMatrix <- as.matrix(dist(t(DEresBasemean)))

  # Arid samples are labelled in red. The former sample() call only shuffled
  # this vector (irrelevant for %in%) and consumed random-number state.
  sp_arid <- row.names(coldata)[coldata$cond_season == "arid"]
  cols <- rep('black', nrow(sampleDistMatrix))
  cols[row.names(sampleDistMatrix) %in% sp_arid] <- 'red'

  heatmap.2( sampleDistMatrix, trace="none", col=colours, colRow = cols, main = "Correlation with DE genes only")

  # Expression (log10) of the DE genes, ordered by log2 fold change
  reslog <- log10(resBasemean)
  DEreslog <- merge(x=reslog, y=res_season_padj01_onl, by=0, all=FALSE)
  DEreslog <- DEreslog[ order(DEreslog$log2FoldChange), ]
  DEreslog <- subset(DEreslog, select = -c(baseMean,log2FoldChange,lfcSE,stat,pvalue,padj))
  row.names(DEreslog) <- DEreslog$Row.names
  DEreslogOK_mat <- as.matrix(DEreslog[-1])
  # log10(0) gives -Inf; such cells are shown as 0
  DEreslogOK_mat[!is.finite(DEreslogOK_mat)] <- 0
  heatmap(DEreslogOK_mat,Rowv=NA, scale="row", main = "Heatmap DE genes only")

  # Column annotation; drop = FALSE keeps a data frame for a single column
  df <- as.data.frame(colData(ddsRes))[, c(var2_design, var1_design), drop = FALSE]

  # 50 most expressed genes; head() avoids NA indices with < 50 genes
  select <- head(order(rowMeans(resBasemean), decreasing=TRUE), 50)
  p_exp <- pheatmap(assay(ntd)[select,], cluster_rows=FALSE, show_rownames=TRUE,
                    cluster_cols=FALSE, annotation_col=df, main = "50 highest expressed genes" )
  print(p_exp)

  # 50 most variable genes
  selectvar <- head(order(rowVars(resBasemean), decreasing=TRUE), 50)
  p_var <- pheatmap(assay(ntd)[selectvar,], cluster_rows=FALSE, show_rownames=TRUE,
                    cluster_cols=FALSE, annotation_col=df, main = "Variable genes" )
  print(p_var)

  print("heatmaps done")

  dev.off(pdf_device)

}



################################
# MAKE STAT TABLE
################################

# Per-gene descriptive statistics (variance, mean, range, outlier tests) for
# the "mesic" and "arid" samples, joined with the DESeq2 results.
#
# resBasemean: count matrix, genes in rows; column names must match
#              coldata$final_name_short.
# coldata:     sample table with columns cond_season, final_name_short,
#              sp_short_name and species.
# results:     DESeq2 result table (row names = genes) with a `padj` column.
#
# Returns a data frame with one row per gene present in both inputs and with
# a non-NA padj: the statistics below followed by the columns of `results`.
make_stat_table_per_gene <- function(resBasemean,coldata, results) {
  # The sample annotation is the same for every gene: build it once instead
  # of once per row.
  group <- coldata[, c("cond_season", "final_name_short", "sp_short_name", "species")]
  group$species <- as.character(group$species)
  species_env <- unique(group[, c("species", "cond_season")])

  # Grubbs outlier test p-value; an NA p-value counts as "no outlier".
  grubbs_p <- function(values) {
    p_value <- grubbs.test(values)$p.value
    if (is.na(p_value)) 1 else p_value
  }

  stat_var_df <- bind_rows(apply(resBasemean, 1, function(x) {
    # Attach the annotation to the counts of this gene (joined on sample name)
    count_g <- merge(data.frame(count = x), group, by.x = 0, by.y = "final_name_short")
    mesic_counts <- count_g$count[count_g$cond_season == "mesic"]
    arid_counts <- count_g$count[count_g$cond_season == "arid"]

    # Variances, normalised by the variance over all samples
    var_tot <- var(count_g$count)
    var_mes_norm <- var(mesic_counts) / var_tot
    var_xer_norm <- var(arid_counts) / var_tot
    ratio_var <- max(var_xer_norm, var_mes_norm) / min(var_xer_norm, var_mes_norm)

    # Means, normalised by the mean over all samples
    mean_tot <- mean(count_g$count)
    mean_mes_norm <- mean(mesic_counts) / mean_tot
    mean_xer_norm <- mean(arid_counts) / mean_tot

    # Mean per species, then outlier tests on the species means
    mean_per_sp <- tapply(count_g$count, count_g$species, mean, na.rm = TRUE)
    df_mean_per_sp <- merge(data.frame(mean_per_sp = mean_per_sp), species_env,
                            by.x = 0, by.y = "species")
    sp_means <- df_mean_per_sp$mean_per_sp
    sp_env <- df_mean_per_sp$cond_season

    data.frame(var_mes_norm = var_mes_norm, var_xer_norm = var_xer_norm, var_tot = var_tot,
               mean_mes_norm = mean_mes_norm, mean_xer_norm = mean_xer_norm, mean_tot = mean_tot,
               outlier_mesic = grubbs_p(sp_means[sp_env == "mesic"]),
               outlier_arid = grubbs_p(sp_means[sp_env == "arid"]),
               outlier_tot = grubbs_p(sp_means),
               ratio_var = ratio_var,
               min_mesic = min(mesic_counts), max_mesic = max(mesic_counts),
               min_arid = min(arid_counts), max_arid = max(arid_counts))
  }), .id = "gene")

  # Keep the genes that have a DESeq2 result with a defined adjusted p-value
  deseq_var_df <- merge(stat_var_df, results, by.x = "gene", by.y = 0)
  deseq_var_df <- deseq_var_df[!is.na(deseq_var_df$padj), ]

  return(deseq_var_df)
}




################################
# PLOT GENE
################################


# Two-panel figure for one gene: counts per environment (panel a) and counts
# against the `Bio17.without` column of coldata (panel b).
#
# gene:        row name of the gene in resBasemean (also the title of panel a).
# coldata:     sample table joined to the counts on row names; the columns
#              cond_season, family, Bio17.without and final_name_short are used.
# resBasemean: count matrix, genes in rows and samples in columns.
# titre:       title of panel b.
# dds, var1_design, var2_design: unused, kept for signature compatibility.
#
# Returns the combined cowplot figure; horizontal lines show the mean count of
# the "mesic" (turquoise) and "arid" (coral) samples.
plot_gene = function(gene, dds, coldata, resBasemean, var1_design, var2_design, titre="") {

  # Counts of the requested gene, one row per sample
  gene_counts <- as.data.frame(resBasemean[gene, ])
  names(gene_counts) <- "count"
  annotated <- merge(gene_counts, coldata, by = 0)

  # Group means, computed before cond_season is turned into a factor
  is_mesic <- annotated$cond_season == "mesic"
  is_arid <- annotated$cond_season == "arid"
  mean_mesic <- mean(annotated$count[is_mesic])
  mean_arid <- mean(annotated$count[is_arid])

  annotated$cond_season <- factor(annotated$cond_season, levels = c("arid", "mesic"))

  # Panel a: counts per environment, coloured by family
  by_environment <- ggplot(annotated,
                           aes(x = cond_season, y = count,
                               color = family, shape = cond_season)) +
    geom_point(size = 2, position = position_dodge(0.6)) +
    ggtitle(gene) +
    geom_hline(yintercept = mean_mesic, color = "turquoise3") +
    geom_hline(yintercept = mean_arid, color = "coral1")

  # Panel b: counts against Bio17.without, coloured by environment
  by_bio17 <- ggplot(annotated,
                     aes(x = Bio17.without, y = count, color = cond_season,
                         label = final_name_short, shape = cond_season)) +
    geom_point(size = 2) +
    geom_hline(yintercept = mean_mesic, color = "turquoise3") +
    geom_hline(yintercept = mean_arid, color = "coral1") +
    ggtitle(titre)

  no_legend <- theme(legend.position = "none")
  plot_grid(by_environment + no_legend,
            by_bio17 + no_legend,
            labels = c("a", "b"),
            rel_widths = c(0.3, 0.7))

}



################################
# CATEGORIZATION
################################

# Assign expression-pattern categories to the significant genes.
#
# deseq_var_df: data frame with the columns padj, mean_xer_norm, mean_mes_norm,
#               min_arid, min_mesic, max_arid, max_mesic, var_mes_norm,
#               var_xer_norm, outlier_tot, outlier_mesic and outlier_arid.
# padj_v:       genes with padj < padj_v are kept.
#
# Returns the kept rows with these added columns:
#   up, down, constraint, diversification : logical category flags
#   outlier         : outlier tests below their thresholds
#   unclassified    : none of the four categories applies
#   multi_cat       : number of categories that apply
#   cat_and_outlier : outlier that also belongs to at least one category
# A zero-row result (no gene below the cutoff) is returned with the same
# columns instead of raising an error.
make_categories <- function (deseq_var_df, padj_v = 0.1) {
  res_df <- subset(deseq_var_df, padj < padj_v)

  # Logical column that is TRUE where `cond` is TRUE; an NA condition counts as
  # "not met". Starting from rep(FALSE, nrow) also works with zero rows,
  # whereas assigning a scalar to a column of an empty data frame errors.
  flag <- function(cond) {
    out <- rep(FALSE, nrow(res_df))
    out[cond] <- TRUE
    out
  }

  # Higher in arid: larger mean, minimum more than 1.5x, and larger maximum
  res_df$up <- flag(res_df$mean_xer_norm > res_df$mean_mes_norm &
                      res_df$min_arid > 1.5 * res_df$min_mesic &
                      res_df$max_arid > res_df$max_mesic)

  # Higher in mesic: mirror of `up`
  res_df$down <- flag(res_df$mean_xer_norm < res_df$mean_mes_norm &
                        res_df$min_mesic > 1.5 * res_df$min_arid &
                        res_df$max_arid < res_df$max_mesic)

  # Constraint: normalised mesic variance more than twice the arid one
  res_df$constraint <- flag((2 * res_df$max_mesic > res_df$max_arid) &
                              res_df$min_mesic <= 1.5 * res_df$min_arid &
                              (res_df$var_mes_norm > 2 * res_df$var_xer_norm))

  # Diversification: normalised arid variance more than twice the mesic one
  res_df$diversification <- flag((2 * res_df$max_arid > res_df$max_mesic) &
                                   res_df$min_arid <= 1.5 * res_df$min_mesic &
                                   (res_df$var_xer_norm > 2 * res_df$var_mes_norm))

  # Outliers: global test and at least one per-environment test are extreme
  res_df$outlier <- flag(res_df$outlier_tot < 1e-05 &
                           (res_df$outlier_mesic < 1e-4 | res_df$outlier_arid < 1e-4))

  # Unclassified: none of the four categories
  res_df$unclassified <- flag(!res_df$up & !res_df$down &
                                !res_df$constraint & !res_df$diversification)

  # Number of categories per gene (summing logicals gives integers)
  res_df$multi_cat <- as.integer(res_df$up + res_df$down +
                                   res_df$constraint + res_df$diversification)

  res_df$cat_and_outlier <- flag(res_df$outlier & res_df$multi_cat > 0)

  return(res_df)
}


################################
# RANDOM LABEL
################################


# Permutation step: shuffle the environment labels between species, rerun
# DESeq2 and count the genes falling in each category.
#
# x: unused; lets the function be called once per iteration index.
#
# Reads `coldata`, `var1_design` and `AlltableMat` from the calling
# environment (they are not parameters), and calls make_stat_table_per_gene()
# and make_categories().
#
# Returns a one-row data frame with the number of genes per category (up,
# down, constraint, diversification, unclassified) plus
#   identical_design_ind : samples whose shuffled label equals the true one
#   identical_design_sp  : species whose shuffled label equals the true one
# On any error the error is printed and a row of zeros is returned.
Random_label <- function(x){
  tryCatch({
    # Loaded here so the function also works in a fresh worker process
    library(DESeq2)
    library(dplyr)
    library(outliers)

    # One label per species; the shuffle keeps the number of arid and mesic
    # species unchanged.
    x_len <- length(unique(coldata$sp_short_name[coldata$cond_season == "arid"]))
    m_len <- length(unique(coldata$sp_short_name[coldata$cond_season == "mesic"]))
    species_env <- unique(coldata[, c("sp_short_name", "cond_season")])
    true_etiquettes <- species_env$cond_season
    unique_sp <- species_env$sp_short_name
    random_etiquettes <- sample(c(rep("arid", x_len), rep("mesic", m_len)), replace = FALSE)

    # plyr is not attached above, so the call is namespace-qualified
    coldata$newEnv <- plyr::mapvalues(coldata$sp_short_name, from = unique_sp, to = random_etiquettes)

    # ~ var1_design + newEnv, or ~ newEnv when var1_design is NULL
    design_new <- reformulate(c(var1_design, "newEnv"))

    ddsInput <- DESeqDataSetFromMatrix(countData = AlltableMat,
                                       colData = coldata,
                                       design = design_new)

    dds <- DESeq(ddsInput)
    resBasemean <- counts(dds, normalized = TRUE)
    res <- results(dds, lfcThreshold = .4, altHypothesis = "greaterAbs")
    res <- res[order(res$padj), ]

    results_df <- as.data.frame(
      dplyr::mutate(as.data.frame(res),
                    sig = ifelse(res$padj < 0.1, "FDR<0.1", "Not Sig")),
      row.names = rownames(res)
    )

    # which() drops genes with an NA padj instead of creating all-NA rows
    candidates <- results_df[which(results_df$padj < 0.2), ]
    deseq_var_df <- make_stat_table_per_gene(resBasemean, coldata, candidates)
    df_cat <- make_categories(deseq_var_df)

    category_cols <- c("up", "down", "constraint", "diversification", "unclassified")
    nb_gene_by_cat_df <- as.data.frame(t(colSums(df_cat[, category_cols])))

    # How close the shuffled design is to the real one
    nb_gene_by_cat_df$identical_design_ind <-
      sum(as.character(coldata$newEnv) == as.character(coldata$cond_season))
    nb_gene_by_cat_df$identical_design_sp <-
      sum(as.character(true_etiquettes) == as.character(random_etiquettes))

    nb_gene_by_cat_df

  }, error=function(err){
    print("ERROR")
    print(err)
    data.frame(up=0,down=0, constraint=0, diversification=0, unclassified=0, identical_design_ind=0,identical_design_sp=0)
  })
}


################################################
# FIGURE CROSS GENES KNOWN IN BIBLIO AND OTHER STUDIES
################################################


# Bar plots of the log2 fold change of the DE genes, with one marker per
# reference list (literature / other studies) in which the gene appears.
#
# df_cat:        categorised DE genes, with the columns gene, log2FoldChange,
#                padj, up, down, constraint, diversification and unclassified.
# CompTableList: reference table keyed by `MGI`, with the columns disorders,
#                dehydration, SC, markerBiblio, seq_conv_intersection, pcoc and
#                tdg09 (presumably 0/1 membership flags - TODO confirm).
#
# Side effects: writes "table_croisement_biblio.tsv" under the global
# `output_dir` (not a parameter) and prints both figures.
#
# Returns list(DE_public_ref_leg, DE_public_ref_leg_facet): the figure with all
# genes, and the figure with one panel per non-empty category.
barplot_DE_refCross <- function(df_cat,CompTableList) {

  reference_cols <- c("disorders", "dehydration", "SC", "markerBiblio",
                      "seq_conv_intersection", "pcoc", "tdg09")
  # Distance between the end of the bar and the marker of each reference list
  marker_offsets <- c(0.15, 0.22, 0.33, 0.44, 0.55, 0.66, 0.77)
  category_cols <- c("up", "down", "constraint", "diversification", "unclassified")

  # Add the reference-list columns to the DE genes
  lfc_Cond_DE_melt_l2fc <- merge(df_cat, CompTableList, by.x = "gene", by.y = "MGI", all.x=TRUE, all.y = FALSE)

  # Each reference column becomes the y position of its marker: just beyond
  # the end of the bar, on the side of the fold change.
  direction <- ifelse(lfc_Cond_DE_melt_l2fc$log2FoldChange > 0, 1, -1)
  for (i in seq_along(reference_cols)) {
    ref <- reference_cols[i]
    lfc_Cond_DE_melt_l2fc[[ref]] <-
      (lfc_Cond_DE_melt_l2fc$log2FoldChange + marker_offsets[i] * direction) *
      lfc_Cond_DE_melt_l2fc[[ref]]
  }

  # Gene labels are right-aligned for positive and left-aligned for other bars
  lfc_Cond_DE_melt_l2fc$just <- ifelse(lfc_Cond_DE_melt_l2fc$log2FoldChange > 0, 1, 0)

  id_cols <- c("gene", reference_cols, "just", "log2FoldChange", "padj")
  lfc_Cond_DE_melt_l2fc_s <- lfc_Cond_DE_melt_l2fc[, c("gene", category_cols, reference_cols,
                                                       "just", "log2FoldChange", "padj")]
  print("lfc_Cond_DE_melt_l2fc_s ok")
  write.table(lfc_Cond_DE_melt_l2fc, file = paste0(output_dir,"table_croisement_biblio.tsv"), sep = "\t", quote = FALSE)

  # Long format: one row per gene and category, keeping the categories that apply
  lfc_Cond_DE_melt_l2fc_l <- melt(lfc_Cond_DE_melt_l2fc_s, id.vars = id_cols, variable.name = "cat")
  lfc_Cond_DE_melt_l2fc_l <- lfc_Cond_DE_melt_l2fc_l[which(lfc_Cond_DE_melt_l2fc_l$value), ]
  print("lfc_Cond_DE_melt_l2fc_s ok")

  # Marker layers, one per reference list (fresh layers on every call)
  reference_markers <- function() {
    list(
      geom_point(aes(y = disorders), col = "black", shape = 6, size = 3, position=position_dodge(0.9)),
      geom_point(aes(y = dehydration), col = "grey", shape = 15, size = 3, position=position_dodge(0.9)),
      geom_point(aes(y = SC), col = "blue", shape = 17, size = 3, position=position_dodge(0.9)),
      geom_point(aes(y = markerBiblio), col = "green", shape = 18, size = 3, position=position_dodge(0.9)),
      geom_point(aes(y = seq_conv_intersection), col = "orange", shape = 19, size = 3, position=position_dodge(0.9)),
      geom_point(aes(y = pcoc), col = "red", shape = 19, size = 3, position=position_dodge(0.9)),
      geom_point(aes(y = tdg09), col = "darkgoldenrod", shape = 19, size = 3, position=position_dodge(0.9))
    )
  }

  # Horizontal bar plot shared by the global figure and the per-category
  # panels. The limits are set on the same scale as the axis name: a separate
  # ylim() replaces the scale and drops the "L2FC" title.
  build_barplot <- function(plot_df, title, bar_width = NULL, y_limits = NULL) {
    ggplot(plot_df, aes(x=reorder(gene, log2FoldChange), y=log2FoldChange)) +
      theme_classic() +
      theme(legend.position="bottom") +
      geom_bar(stat="identity", position=position_dodge(0.5), fill = "white", col = "black", width = bar_width) +
      reference_markers() +
      guides(shape = "none", col= "none") +
      scale_x_discrete("", labels=NULL) +
      scale_y_continuous(name="L2FC", limits = y_limits) +
      geom_text(aes(x=gene, y=0, label=paste0(" ",gene," "),hjust=just), size= 2.2) +
      ggtitle(title) +
      coord_flip()
  }

  # Figure with all DE genes
  print("DE_public_ref")
  DE_public_ref <- build_barplot(lfc_Cond_DE_melt_l2fc_l,
                                 "Are DE genes known in the kidney literatures and present in sequence analyses?")
  print("DE_public_ref done")

  # Stand-alone legend explaining the markers, taken from a dummy plot
  df_list <- data.frame(list = factor(reference_cols, levels = reference_cols),
                        shape = c(6, 15, 17, 18, 19,19, 19),
                        col = c("black","grey","blue","green","orange", "red", "darkgoldenrod"),
                        stringsAsFactors = FALSE)
  p_list <- ggplot(df_list, aes(x=shape, y=shape, col=factor(list), shape=factor(list))) +
    geom_point()+
    scale_shape_manual(values = df_list$shape, name="Studies")+
    scale_color_manual(values = df_list$col, name="Studies")

  legend_grob <- get_legend(p_list + theme(legend.position="top",
                                           legend.text  = element_text(size=10),
                                           legend.title = element_text(size=0),
                                           legend.background = element_rect(fill="white", size=0.5, linetype="solid",
                                                                            colour ="black")) +
                              guides( colour = guide_legend(override.aes = list(alpha = 1,size = 3), nrow=1))
  )

  DE_public_ref_leg <- plot_grid(DE_public_ref + theme(legend.position="none"),
                                 plot_grid(NULL,legend_grob ,NULL,rel_widths = c( 0.3, 1,0.3)),
                                 ncol = 1,
                                 nrow = 2,
                                 rel_heights = c( 5, 1),
                                 hjust = 0, vjust = 0, align="hv")

  print(DE_public_ref_leg)

  # One panel per category, all with the same fold-change limits
  print("DE_public_ref_list_per_cat")
  panel_order <- c("down","up","constraint", "diversification", "unclassified")
  lfc_limits <- c(min(lfc_Cond_DE_melt_l2fc_l$log2FoldChange), max(lfc_Cond_DE_melt_l2fc_l$log2FoldChange))
  DE_public_ref_list_per_cat <- lapply(panel_order, function (x){
    print(x)
    tmp_df <- lfc_Cond_DE_melt_l2fc_l[which(lfc_Cond_DE_melt_l2fc_l$cat == x), , drop = FALSE]
    print(dim(tmp_df))
    if (nrow(tmp_df) == 0) {
      return(NULL)
    }
    build_barplot(tmp_df, paste0(x), bar_width = 0.8, y_limits = lfc_limits)
  })
  print("DE_public_ref_list_per_cat done")

  # Drop the empty categories. Filter() keeps the category names; they were
  # previously overwritten with the integer positions returned by which().
  names(DE_public_ref_list_per_cat) <- panel_order
  DE_public_ref_list_per_cat <- Filter(Negate(is.null), DE_public_ref_list_per_cat)

  facet_cat_biblio <- plot_grid(plotlist = DE_public_ref_list_per_cat, ncol = length(DE_public_ref_list_per_cat))
  print("facet_cat_biblio done")

  DE_public_ref_leg_facet <- plot_grid(facet_cat_biblio + theme(legend.position="none"),
                                       plot_grid(NULL,legend_grob ,rel_widths = c( 0.4, 1)),
                                       ncol = 1,
                                       nrow = 2,
                                       rel_heights = c( 3, 1),
                                       hjust = 0, vjust = 0, align="hv")
  print("DE_public_ref_leg_facet done")

  print(DE_public_ref_leg_facet)

  return(list(DE_public_ref_leg = DE_public_ref_leg, DE_public_ref_leg_facet = DE_public_ref_leg_facet))


}



################################
# CELL LOCALIZATION FIGURE
################################

# Bar plot comparing the cell-type distribution of the DE genes with a
# reference distribution.
#
# DE_list:             table whose row names are the DE gene names.
# dMusParkMax_stat_df: reference table with a `Cell` column and a `prop_real`
#                      column (proportion of each cell type in the reference).
#
# Reads the globals `dMusParkMax2` (gene -> cell type table, joined on its
# `name` column) and `output_dir`. The plot is printed, saved as
# "cell_localization_enrichment.pdf" under `output_dir`, and returned.
histo_DE_CellLoc_prop <- function(DE_list, dMusParkMax_stat_df) {

  # DE genes as a one-column table, then the cell type of each gene
  de_genes <- as.data.frame(rownames(DE_list))
  names(de_genes) <- "MGI"
  de_cells <- merge(de_genes, dMusParkMax2, by.x = "MGI", by.y = "name", all = FALSE)
  names(de_cells) <- c("MGI", "Cell")

  # Number and percentage of DE genes per cell type
  cell_counts <- as.data.frame(table(de_cells$Cell))
  total_genes <- sum(cell_counts$Freq)
  cell_counts$prop <- (100 * cell_counts$Freq) / total_genes
  names(cell_counts) <- c("Cell", "count", "prop")

  # Ratio between the DE proportion and the reference proportion
  fq_df <- merge(cell_counts, dMusParkMax_stat_df, by = "Cell", all = TRUE)
  fq_df$ratio <- fq_df$prop / fq_df$prop_real

  # The dashed red line marks a ratio of 1 (same proportion as the reference)
  DE_cell_real_bar <- ggplot(fq_df, aes(x = Cell, y = ratio)) +
    geom_bar(stat = "identity", position = position_dodge(0.9)) +
    geom_hline(yintercept = 1, color = "red", linetype = "dashed", size = 1) +
    scale_x_discrete("Cell types in kidney") +
    scale_y_continuous(name = "Proportion DE/Proportion Real SC") +
    ggtitle(paste0(output_dir, "proportion cellular localization in kidney")) +
    coord_flip()

  print(DE_cell_real_bar)

  ggsave(filename = paste0(output_dir, "cell_localization_enrichment.pdf"),
         plot = DE_cell_real_bar,
         limitsize = FALSE)

  DE_cell_real_bar
}





################################
# GO ANALYSIS FUNCTION
################################


# Map MGI gene symbols to Entrez IDs with org.Mm.eg.db, keeping the first
# Entrez ID returned for each symbol. Returns a one-column data frame
# (ENTREZID) whose row names are the symbols.
.symbol_to_entrez <- function(symbols) {
  # Qualified call so that dplyr::select can never be picked up by mistake
  mapping <- AnnotationDbi::select(org.Mm.eg.db, keys = symbols,
                                   columns = "ENTREZID", keytype = "SYMBOL")
  mapping <- mapping[!duplicated(mapping$SYMBOL), ]
  row.names(mapping) <- mapping$SYMBOL
  mapping[-1]
}

# Bar plot of the (at most) 20 most populated level-3 GO groups of one ontology.
.group_go_barplot <- function(gene_ids, OrgDb, ont, axis_label, legend_position) {
  ggo <- clusterProfiler::groupGO(gene     = gene_ids,
                                  OrgDb    = OrgDb,
                                  ont      = ont,
                                  level    = 3,
                                  readable = TRUE)
  ggo_df <- as.data.frame(ggo)[, -5] # drop the gene ID column
  ggo_df <- ggo_df[order(ggo_df$Count, decreasing = TRUE), ]
  ggo_df$Description <- factor(x = ggo_df$Description,
                               levels = ggo_df$Description[order(ggo_df$Count)])

  # head() instead of [1:20, ]: no all-NA rows when fewer than 20 groups exist
  ggplot(head(ggo_df, 20), aes(x = Description, y = Count, fill = Count)) +
    geom_bar(stat = "identity", color = "black") + theme_bw() +
    coord_flip() + ylab("Count") + xlab(axis_label) +
    scale_fill_gradient(low = "red", high = "blue",
                        space = "Lab", na.value = "grey50", guide = "colourbar") +
    theme(legend.position = legend_position)
}

# Run the clusterProfiler GO / DO / KEGG analyses for a set of genes and write
# all figures into one PDF.
#
# Arguments:
#   results      data frame of DESeq2 results with row names = MGI symbols and
#                the columns baseMean, log2FoldChange, lfcSE, stat, pvalue,
#                padj and sig.
#   name_GO_pdf  path of the PDF that receives the figures. When at least two
#                terms are enriched, a second file with the suffix
#                "_only_enri.pdf" is written as well.
#   Alltableok   table whose row names (MGI symbols) define the background
#                ("long") gene universe.
#   gene         NULL, or a data frame with an MGI column listing the genes to
#                test. When NULL, the genes of `results` with padj < 0.1 are
#                tested.
#
# Reads the global table `MusAssignIDs` (columns ClusterID, EnsGeneID, MGI).
# Returns NULL; the function is called for its side effects (PDF files).
GOwithCP <- function(results,name_GO_pdf, Alltableok, gene) {

  ## Gene annotations - add Mgi, ENTREZ_ID and Ensembl ID to the DESeq2 results for clusterProfiler

  # Step 1: get a table with Ensembl, Mgi and cluster ID (first match per gene is kept)
  results_anno1 <- merge(x = results, y = MusAssignIDs[, c("ClusterID", "EnsGeneID", "MGI")],
                         by.x = "row.names", by.y = c("MGI"), all = FALSE)
  u <- unique(results_anno1$Row.names)
  results_anno2 <- results_anno1[match(u, results_anno1$Row.names), ]
  row.names(results_anno2) <- results_anno2$Row.names
  results_anno2 <- results_anno2[-1]

  # Step 2: get the Entrez ID associated with each Mgi symbol
  entrezID <- .symbol_to_entrez(row.names(results_anno2))

  # Step 3: combine all information in a single data frame
  results_anno3 <- merge(x = results_anno2, y = entrezID, by = "row.names")
  results_anno <- subset(results_anno3,
                         select = c(Row.names, baseMean, log2FoldChange, lfcSE, stat,
                                    pvalue, padj, sig, EnsGeneID, ENTREZID))
  row.names(results_anno) <- results_anno$Row.names
  names(results_anno)[names(results_anno) == "EnsGeneID"] <- "ENSEMBL"
  names(results_anno)[names(results_anno) == "Row.names"] <- "SYMBOL"

  ## "Short" gene list: log2 fold changes named by Entrez ID, for the genes of `results`
  resultListPrep <- subset(results_anno, select = c("ENTREZID", "log2FoldChange"))
  geneList <- resultListPrep[, 2]
  names(geneList) <- as.character(resultListPrep[, 1])
  geneListShort <- sort(geneList, decreasing = TRUE)

  ## "Long" gene list: Entrez IDs of every gene of `Alltableok`
  entrezID2 <- .symbol_to_entrez(row.names(Alltableok))
  geneListLong <- as.character(na.omit(entrezID2$ENTREZID))

  # Choose the universe ("long" or "short"). enrichGO() expects a vector of
  # gene IDs as universe, so the "short" mode uses the names of the
  # fold-change vector and not the fold changes themselves.
  geneList_universe_tag <- "long" # "short"
  if (geneList_universe_tag == "short") {
    geneList_universe <- unique(names(geneListShort)[!is.na(names(geneListShort))])
  } else {
    geneList_universe <- geneListLong
  }

  ## Genes to test
  if (! is.null(gene)) {
    entrezIDgene <- .symbol_to_entrez(as.character(gene$MGI))
    geneDE <- as.character(na.omit(entrezIDgene$ENTREZID))
  } else {
    # Possibility to modify padj 0.05. Genes with NA padj give NA and are dropped by na.omit()
    geneDE <- as.character(na.omit(results_anno$ENTREZID[results_anno$padj < 0.1]))
  }

  OrgDb <- org.Mm.eg.db # can also be other organisms

  ## clusterProfiler
  # ontology categories: BiologicalProcess, MolecularFunction, CellularComponent (BP, MF or CC)

  pdf(name_GO_pdf, width = 20, height = 10)
  # Close the PDF device even when one of the analyses below fails
  pdf_device <- dev.cur()
  on.exit(dev.off(pdf_device), add = TRUE)

  ### Group GO for the three ontologies
  PggoBP <- .group_go_barplot(geneDE, OrgDb, "BP", "Biological Process", "none")
  PggoMF <- .group_go_barplot(geneDE, OrgDb, "MF", "Molecular Function", "none")
  PggoCC <- .group_go_barplot(geneDE, OrgDb, "CC", "Cellular Compound", "right")

  Pggo_all <- grid.arrange(PggoBP, PggoMF, PggoCC, nrow = 3)
  print(Pggo_all)

  #Maybe do a grouped circular barplot https://www.r-graph-gallery.com/297-circular-barplot-with-groups/

  ### GO over-representation test - enrichGO without any cutoff
  ego <- clusterProfiler::enrichGO(gene          = geneDE,
                                   OrgDb         = OrgDb,
                                   universe      = geneList_universe,
                                   pvalueCutoff  = 1,
                                   qvalueCutoff  = 1)

  dotego <- clusterProfiler::dotplot(ego, showCategory = 25)
  print(dotego)
  Gorill_ego <- plotGOgraph(ego) #Should be gseGO or enrichGO, similar picture as GOrilla
  print(Gorill_ego)

  #upsetplot(ego, n=10) #need package UpSetR

  ### Remove redundant GO terms and network concept
  ego2 <- simplify(ego)
  ego2 <- setReadable(ego2, OrgDb = org.Mm.eg.db) #To convert ENTREZID to SYMBOL

  # circular view; fold changes are only available for the "short" gene list
  if (geneList_universe_tag == "short") {
    cnetego2 <- cnetplot(ego2, foldChange = geneListShort, circular = TRUE, colorEdge = TRUE)
  } else {
    cnetego2 <- cnetplot(ego2, circular = TRUE, colorEdge = TRUE)
  }

  print(cnetego2)
  heatego2 <- heatplot(ego2)
  print(heatego2)
  ego2_compare <- enrichplot::pairwise_termsim(ego2)
  emaego2 <- emapplot(ego2_compare)
  print(emaego2)

  ### enrichGO in the three ontologies, significant terms only (separate file)
  goAll_enriched <- clusterProfiler::enrichGO(gene          = geneDE,
                                              OrgDb         = OrgDb,
                                              universe      = geneList_universe,
                                              ont           = "ALL",
                                              pAdjustMethod = "BH",
                                              pvalueCutoff  = 0.05,
                                              qvalueCutoff  = 0.2,
                                              readable      = TRUE)
  goAll_df <- as.data.frame(goAll_enriched)
  print("dim(goAll_df)")
  print(dim(goAll_df))
  if (nrow(goAll_df) > 1) {
    bargoall <- barplot(goAll_enriched, drop = TRUE, split = "ONTOLOGY", showCategory = 200) +
      facet_grid(ONTOLOGY ~ ., scales = "free_y", switch = "x", space = "free_y")
    # Anchored pattern: only the file extension is rewritten, never a
    # directory name that happens to contain "pdf"
    ggsave(filename = sub("\\.pdf$", "_only_enri.pdf", name_GO_pdf), plot = bargoall,
           height = nrow(goAll_df) * 0.4 + 3, units = "cm", limitsize = FALSE)
  }

  ### enrichGO in the three ontologies, without any cutoff
  goAll <- clusterProfiler::enrichGO(gene          = geneDE,
                                     OrgDb         = OrgDb,
                                     universe      = geneList_universe,
                                     ont           = "ALL",
                                     pAdjustMethod = "BH",
                                     pvalueCutoff  = 1,
                                     qvalueCutoff  = 1,
                                     readable      = TRUE)

  #bargoall <- barplot(goAll, drop= TRUE, showCategory=20)
  #print(bargoall)
  dotgoall <- dotplot(goAll, split = "ONTOLOGY") +
    facet_grid(ONTOLOGY ~ ., scales = "free", switch = "x")
  print(dotgoall)

  #goAll2 <- simplify(goAll)
  goAll2 <- setReadable(goAll, OrgDb = org.Mm.eg.db) #To convert ENTREZID to SYMBOL

  if (geneList_universe_tag == "short") {
    cnetAllOnt2 <- cnetplot(goAll2, foldChange = geneListShort, circular = TRUE, colorEdge = TRUE)
  } else {
    cnetAllOnt2 <- cnetplot(goAll2, circular = TRUE, colorEdge = TRUE)
  }

  print(cnetAllOnt2)

  heatAllOnt2 <- heatplot(goAll2)
  print(heatAllOnt2)
  goAll2_compare <- enrichplot::pairwise_termsim(goAll2)
  emaAllOnt2 <- emapplot(goAll2_compare)
  print(emaAllOnt2)

  ### dotplot for the disease ontology enrichment analysis
  if (geneList_universe_tag == "short") {
    deg <- names(geneListShort)[abs(geneListShort) > 0.5]
    do <- enrichDO(deg)
  } else {
    do <- enrichDO(geneDE)
  }

  if (!is.null(do)) {
    dotenrich <- dotplot(do, x = "count", showCategory = 20) #colorBy="qvalue"
    print(dotenrich)
  }

  ### KEGG analysis
  # The tables are printed explicitly: a bare expression is not auto-printed
  # inside a function, so these results used to be computed and then lost.
  kk <- enrichKEGG(gene         = geneDE,
                   organism     = "mmu",
                   pvalueCutoff = 0.1)
  print(head(as.data.frame(kk)))

  mkk <- enrichMKEGG(gene = geneDE,
                     organism = "mmu")
  print(head(as.data.frame(mkk)))

  NULL
}





################################
# TADAM RUN EXPRESSION ANALYSES
################################

# Build the title of a gene plot from the category flags of one gene, e.g.
# "[ up]". `gene_row` is a one-row data frame of the category table. The flags
# are read as real logicals (a missing or NA flag counts as FALSE).
.category_title <- function(gene_row) {
  flags <- c("up", "down", "constraint", "diversification", "outlier", "unclassified")
  labels <- vapply(flags, function(flag) {
    if (isTRUE(as.logical(gene_row[[flag]]))) flag else ""
  }, character(1))
  gsub("  ", "", paste(c("[", labels, "]"), collapse = " "))
}

# Draw one plot_gene() figure per gene of a category and save them, four per
# row, in "<output_dir>/plots_liste_gene_DE_<category>.pdf".
# Returns the list of plots (an empty list when the category has no gene).
.plot_category_genes <- function(df_cat, category, output_dir, dds, resBasemean,
                                 coldata, var1_design, var2_design) {
  selected <- which(df_cat[[category]])
  if (length(selected) == 0) {
    return(list())
  }
  print(category)

  # Rows are visited by index: apply() on a data frame would turn every value
  # into a (padded) string and the logical flags would be lost.
  gene_plots <- lapply(selected, function(i) {
    plot_gene(c(gene = as.character(df_cat$gene[i])),
              .category_title(df_cat[i, , drop = FALSE]),
              dds = dds, resBasemean = resBasemean, coldata = coldata,
              var1_design = var1_design, var2_design = var2_design)
  })
  names(gene_plots) <- rownames(df_cat)[selected]

  print(paste0("lp_", category))
  ggsave(plot_grid(plotlist = gene_plots, ncol = 4),
         filename = paste0(output_dir, "/plots_liste_gene_DE_", category, ".pdf"),
         # one 5-unit band per started row of four plots (never zero)
         height = 5 * ceiling(length(gene_plots) / 4),
         width = 42,
         limitsize = FALSE
  )
  gene_plots
}

# Run the whole expression analysis: DESeq2 test, quality plots, gene
# categorization, optional label resampling, cross-reference with published
# gene lists, cell localization and GO analyses.
#
# Argument:
#   data_expr  list with the elements Alltable (count table, genes x samples),
#              coldata (sample table with at least the columns ID,
#              cond_season, sp_short_name, ind_ID and study_nb),
#              var1_design (NULL or name of a first design variable),
#              var2_design (name of the tested design variable), output_dir
#              (prefix of every output file), resampling (logical), test
#              (logical, keep only the first 2000 genes), nRand (number of
#              resamplings) and nCPU (number of worker processes).
#
# Besides `data_expr`, the function reads objects that must exist in an
# enclosing environment: the helper functions qualDE, make_stat_table_per_gene,
# make_categories, plot_gene, Random_label, barplot_DE_refCross,
# histo_DE_CellLoc_prop and GOwithCP, and the reference tables disease_list,
# omim_list, genes_VarFromPDB_df, macmanes_list, deconv_ParkCao_Com_list,
# list_markers_biblio, gene_seq_conv_intersection, gene_seq_conv_pcoc,
# gene_seq_conv_tdg09 and dMusParkMax2.
#
# The function is called for its side effects (tables and figures written
# under output_dir); it returns, invisibly, the last progress message.
make_exp_analyses_all <- function(data_expr) {

  print("Load data")

  All_table <- data_expr$Alltable
  coldata <- data_expr$coldata
  var1_design <- data_expr$var1_design
  var2_design <- data_expr$var2_design
  output_dir <- data_expr$output_dir
  resampling <- data_expr$resampling
  test <- data_expr$test
  nRand <- data_expr$nRand
  nCPU <- data_expr$nCPU

  if (! is.null(var1_design)) {
    design <- as.formula(paste("~ ", var1_design, "+", var2_design))
  } else {
    design <- as.formula(paste("~ ", var2_design))
  }

  All_table <- All_table[, colnames(All_table) %in% coldata$ID]

  mt_genes <- grep(pattern = "mt-", rownames(All_table))
  if (length(mt_genes) > 0) {
    Alltableok <- All_table[-mt_genes, ] #Remove 7 mitochondrial genes (Co1, Co2, Cytb, Nd1, Nd2, Nd5, Nd6)
  } else {
    Alltableok <- All_table
  }

  # Keep the genes without missing value and round the counts to integers
  Alltableok1 <- round(Alltableok[complete.cases(Alltableok), ])

  coldata <- coldata[ which(coldata$ID %in% colnames(All_table)), ]
  coldata <- coldata[ order(match(coldata$ID, colnames(Alltableok1))), ] #Reorder table such coldata and matrix has same order
  coldata$cond_season <- factor(coldata$cond_season, levels = c("arid", "mesic"))

  rownames(coldata) <- paste0(coldata$sp_short_name, "_", coldata$ind_ID)
  coldata$final_name_short <- rownames(coldata)

  colnames(Alltableok1) <- coldata$final_name_short
  coldata$study_nb <- as.factor(coldata$study_nb)

  AlltableMat <- as.matrix(Alltableok1)

  if (test) {
    # At most the first 2000 genes; also works on tables with fewer rows
    AlltableMat <- AlltableMat[seq_len(min(2000, nrow(AlltableMat))), , drop = FALSE]
  }

  print(dim(AlltableMat))

  ## DE seq

  ddsInput <- DESeqDataSetFromMatrix(countData = AlltableMat,
                                     colData = coldata,
                                     design = design)

  dds <- DESeq(ddsInput)
  rld <- rlogTransformation(dds, blind = TRUE)
  #vst <- vst(dds, blind=FALSE)
  ntd <- normTransform(dds)
  #vsd <- varianceStabilizingTransformation(dds, blind=TRUE)
  se <- SummarizedExperiment(log2(counts(dds, normalized = TRUE) + 1),
                             colData = colData(dds))
  resBasemean <- counts(dds, normalized = TRUE)
  if (test) {
    print(head(resBasemean))
  }

  res <- results(dds, lfcThreshold = .4, altHypothesis = "greaterAbs")
  summary(res)
  res_raw <- results(dds)

  res <- res[order(res$padj), ]
  res_df <- as.data.frame(res)

  results <- as.data.frame(dplyr::mutate(as.data.frame(res), sig=ifelse(res$padj<0.1, "FDR<0.1", "Not Sig")), row.names=rownames(res))

  write.table(results, file = paste0(output_dir, "env_seasonal_MvsX_online_cds.tsv"), quote = FALSE, row.names = TRUE, sep = "\t")

  # which() drops the genes whose padj is NA; a plain logical index would
  # turn each of them into an all-NA row of the DE table.
  res_season_padj01_onl <- res_df[which(res_df$padj < 0.1), ]
  print(head(res_season_padj01_onl))
  write.table(res_season_padj01_onl, file = paste0(output_dir, "env_seasonal_MvsX_online_cds_DE01.tsv"), quote = FALSE, row.names = TRUE, sep = "\t")


  ## Quality

  if (TRUE) {
    print("Quality")

    quality_plots_filename <- paste0(output_dir, "/quality_plots.pdf")

    title_fig <- paste(var2_design, output_dir)
    qualDE(resTableRaw = res, resTableMod = results, condition = title_fig, rld = rld, ddsRes = dds,
           resRaw = res_raw, quality_plots_filename = quality_plots_filename, se = se, coldata = coldata,
           var1_design = var1_design, var2_design = var2_design, Alltableok1 = Alltableok1,
           resBasemean = resBasemean, res_season_padj01_onl = res_season_padj01_onl, ntd = ntd)
  }


  ## Categorization
  if (TRUE) {
    print("Categorization")

    deseq_var_df <- make_stat_table_per_gene(resBasemean, coldata, results)
    write.table(deseq_var_df, file = paste0(output_dir, "stat_var.tsv"), sep = "\t", quote = FALSE)
    #print(head(deseq_var_df))
    deseq_var_df_corr <- cor(deseq_var_df[, !colnames(deseq_var_df) %in% c("gene", "sig")])

    print("mat coor done")
    deseq_var_df_corr[is.na(deseq_var_df_corr)] <- 0

    corrplot(deseq_var_df_corr, type = "upper", order = "hclust", na.label = "NA")

    deseq_var_df_l <- melt(deseq_var_df)
    stats_hist <- ggplot(deseq_var_df_l, aes(x = value, fill = variable)) +
      geom_histogram(bins = 100) + facet_wrap(variable ~ ., scales = "free")

    ggsave(filename = paste0(output_dir, "plots_stats_norm_genes.pdf"), plot = stats_hist,
           height = 10, width = 10, limitsize = FALSE)
    print("plot coor done")

    df_cat <- make_categories(deseq_var_df)
    print("make_categories done")
    write.table(df_cat, file = paste0(output_dir, "categories_table.tsv"), sep = "\t", quote = FALSE)
    print(head(df_cat))

    print("df_cat done")

    # One PDF of gene plots per category
    if (TRUE) {
      lp_up <- .plot_category_genes(df_cat, "up", output_dir, dds, resBasemean,
                                    coldata, var1_design, var2_design)
      lp_down <- .plot_category_genes(df_cat, "down", output_dir, dds, resBasemean,
                                      coldata, var1_design, var2_design)
      lp_constraint <- .plot_category_genes(df_cat, "constraint", output_dir, dds, resBasemean,
                                            coldata, var1_design, var2_design)
      lp_diversification <- .plot_category_genes(df_cat, "diversification", output_dir, dds,
                                                 resBasemean, coldata, var1_design, var2_design)
      lp_unclassified <- .plot_category_genes(df_cat, "unclassified", output_dir, dds, resBasemean,
                                              coldata, var1_design, var2_design)
    }

  }

  print("categorization done")

  ## Resampling
  if (resampling) {
    print("Resampling")
    resampling_function <- Random_label
    print("Random_label_v2")

    nb_gene_by_cat_real <- apply(df_cat[, c("up", "down", "constraint", "diversification", "unclassified")], 2, sum)
    nb_gene_by_cat_real_df <- as.data.frame(nb_gene_by_cat_real)
    nb_gene_by_cat_real_dft <- as.data.frame(t(nb_gene_by_cat_real_df))
    nb_gene_by_cat_real_dft$Tot <- apply(nb_gene_by_cat_real_dft, 1, sum)
    print(nb_gene_by_cat_real_dft)

    #Run the function Random label nRand times
    print("start resampling")
    clus <- makeCluster(nCPU) #Parallelisation
    analysis_env <- environment()
    # The cluster is stopped even when the export or a resampling fails
    multi_RandLab_raw <- tryCatch({
      clusterExport(clus, envir = analysis_env,
                    varlist = c("coldata", "var1_design", "var2_design", "design", "AlltableMat",
                                "mapvalues", "make_stat_table_per_gene", "make_categories"))
      parLapply(clus, seq_len(nRand), resampling_function)
    }, finally = stopCluster(clus))
    #print(head(multi_RandLab_raw))
    multi_RandLab <- as.data.frame(do.call(rbind, multi_RandLab_raw))
    #print("multi_RandLab ok")
    #print(multi_RandLab)

    multi_RandLab$Tot <- apply(multi_RandLab[, c("up", "down", "constraint", "diversification", "unclassified")], 1, sum)
    write.table(multi_RandLab, file = paste0(output_dir, "multi_RandLab_", nRand, "_season.tsv"), quote = FALSE, sep = "\t")

    # Useful if you use the sampling function Random_label (disabled)
    if (FALSE) {
      #Determine the number of simulations that has the same design season as real
      print("Number of simulations with initial design")
      multi_RandLab_iden <- multi_RandLab[ which(multi_RandLab$identical_design_ind > 0), ]
      #print(multi_RandLab_iden)
      print("multi_RandLab_iden")
      multi_RandLab_iden$identical_design2 <- 1
      find_nb_identical_design_ok <- nRand - sum(multi_RandLab_iden$identical_design2)
      print(find_nb_identical_design_ok)

      #Some stats regarding the number of line that change or not during the sampling
      print("Stats on number of sp changing in the sampled design")
      print(table(multi_RandLab$identical_design_sp))
      print("Stats on number of ind changing in the sampled design")
      print(table(multi_RandLab$identical_design_ind))

      multi_RandLab_melt <- melt(multi_RandLab, id.vars = c("identical_design_ind", "identical_design_sp"), value.name = "nb_DE")
      plot_dist_design <- ggplot(multi_RandLab_melt, aes(x = identical_design_ind, y = nb_DE)) +
        geom_point() + facet_grid(variable ~ .)
      ggsave(filename = paste0(output_dir, "Random_label_plot_dist_design_nb_DE.pdf"),
             plot = plot_dist_design, limitsize = FALSE)
    }

    #Generate figures with random simulation
    multi_RandLab2 <- subset(multi_RandLab, select = -c(identical_design_ind, identical_design_sp, eff_seed, err))
    write.table(multi_RandLab2, file = paste0(output_dir, "multi_RandLab2_", nRand, "_season.tsv"), quote = FALSE, sep = "\t")

    print(head(multi_RandLab2))

    #How many simulation show number of DE genes over real
    print("How many simulation have DE genes over real data?")
    signif_random <- vapply(colnames(multi_RandLab2), function(col_name) {
      length(multi_RandLab2[, col_name][multi_RandLab2[, col_name] > nb_gene_by_cat_real_dft[, col_name]])
    }, integer(1), USE.NAMES = TRUE)
    signif_random_df <- as.data.frame(signif_random)
    print(signif_random_df)

    # 95th percentile of the simulated counts, per category
    signif_random_95 <- vapply(colnames(multi_RandLab2), function(col_name) {
      unname(quantile(multi_RandLab2[, col_name], c(0.95)))
    }, numeric(1), USE.NAMES = TRUE)
    df_signif_random_95 <- data.frame(variable = names(signif_random_95), value = signif_random_95)

    multi_RandLab_melt <- melt(multi_RandLab2, id.vars = 0, value.name = "type")

    nb_gene_by_cat_real_dft_melt <- melt(nb_gene_by_cat_real_dft)
    #Plot the number of TRUE genes in histogram
    RandHist <- ggplot(data = multi_RandLab_melt, aes(x = type, fill = variable)) +
      geom_histogram(position = "dodge2", col = "black", binwidth = 1) +
      labs(title = paste0("Random similation (", nRand, " times)"),
           x = "Number of estimated DE genes",
           y = "Number of times X DE genes is found") +
      geom_vline(data = nb_gene_by_cat_real_dft_melt,
                 aes(xintercept = value, color = variable),
                 linetype = "dashed", size = 1)

    ggsave(filename = paste0(output_dir, "Random_label_hist_All_season.pdf"),
           plot = RandHist, limitsize = FALSE)

    RandHist_facet <- ggplot(data = multi_RandLab_melt, aes(x = type, fill = variable)) +
      geom_histogram(position = "dodge2", col = "black", binwidth = 1) +
      labs(title = paste0("Random similation (", nRand, " times)"),
           x = "Number of estimated DE genes",
           y = "Number of times X DE genes is found") +
      geom_vline(data = df_signif_random_95, aes(xintercept = value),
                 col = "red", linetype = "dotted", size = 1) +
      geom_vline(data = nb_gene_by_cat_real_dft_melt, aes(xintercept = value),
                 col = "black", linetype = "dashed", size = 1) +
      facet_grid(variable ~ .)

    ggsave(filename = paste0(output_dir, "Random_label_hist_facet_season.pdf"),
           plot = RandHist_facet, limitsize = FALSE)

  }


  ## Cross-reference with published gene lists

  print("Croisement biblio")

  #Information regarding tables to cross
  #disease_list: 186 List of genes involved in kidney-associated diseases (from Park et al. 2018)
  #omim_list: 106 genes extracted from omim database with kidney keyword in mouse
  #macmanes_list: genes identified in dehydration analyses (macmanes 2017 paper)
  #deconv_ParkCao_Com_list: 198 List of genes identified from single cells studies (cell-type specific genes)
  #list_marker_biblio: 222 list of genes from various ref, identified as more or less specific to a cell type in kidney
  #PDB: 22genes, list of gene extracted using PDB package with kidney keyword containing hpo phenotype, ClinVar and Uniprot variants
  #Gene_seq_conv_tot: 122 genes found in both pcoc and tdg09 analyses (intersection)
  #gene_pcoc: 494 genes found in pcoc analyses
  #gene_tdg09: 745 genes found in tdg09 anlyses

  #Other table list preparation (the assignments below create local copies,
  #the tables of the enclosing environment are left untouched)
  disease_list <- disease_list$MGI
  disease_list <- as.data.frame(disease_list, stringsAsFactors = FALSE)
  disease_list$diseaseList <- 1
  colnames(disease_list) <- c("MGI", "diseaseList")

  omim_list$omim <- 1
  colnames(omim_list) <- c("MGI", "omim")

  genes_VarFromPDB_df$PDB <- 1
  colnames(genes_VarFromPDB_df) <- c("MGI", "PDB")

  DISorders0 <- merge(disease_list, omim_list, by = c("MGI"), all = TRUE)
  DISorders <- merge(DISorders0, genes_VarFromPDB_df, by = c("MGI"), all = TRUE)
  DISorders$disorders <- 1
  DISorders <- subset(DISorders, select = c(MGI, disorders))

  macmanes_list$macmanes <- 1
  colnames(macmanes_list) <- c("MGI", "dehydration")

  deconv_ParkCao_Com_list <- deconv_ParkCao_Com_list[-2]
  deconv_ParkCao_Com_list$SC <- 1
  colnames(deconv_ParkCao_Com_list) <- c("MGI", "SC")

  list_markers_biblio$markerBiblio <- 1
  colnames(list_markers_biblio) <- c("MGI", "markerBiblio")

  gene_seq_conv_intersection$common_pcoc_tdg09 <- FALSE
  gene_seq_conv_intersection$common_pcoc_tdg09[gene_seq_conv_intersection$pcoc & gene_seq_conv_intersection$tdg09] <- TRUE

  gene_seq_conv <- subset(gene_seq_conv_intersection, select = c("gene", "common_pcoc_tdg09"))
  gene_seq_conv <- gene_seq_conv[gene_seq_conv$common_pcoc_tdg09, ]
  gene_seq_conv$common_pcoc_tdg09 <- 1
  colnames(gene_seq_conv) <- c("MGI", "seq_conv_intersection")

  gene_seq_conv_pcoc$pcoc <- 1
  gene_seq_conv_pcoc <- subset(gene_seq_conv_pcoc, select = c(gene_name, pcoc))
  colnames(gene_seq_conv_pcoc) <- c("MGI", "pcoc")

  gene_seq_conv_tdg09$tdg09 <- 1
  gene_seq_conv_tdg09 <- subset(gene_seq_conv_tdg09, select = c(gene_name, tdg09))
  colnames(gene_seq_conv_tdg09) <- c("MGI", "tdg09")

  print("prep table ok")

  #Merging all tables to work on it
  CompTable1 <- merge(x = DISorders, y = macmanes_list, by = c("MGI"), all = TRUE)
  CompTable2 <- merge(x = CompTable1, y = deconv_ParkCao_Com_list, by = c("MGI"), all = TRUE)
  CompTable3 <- merge(x = CompTable2, y = list_markers_biblio, by = c("MGI"), all = TRUE)
  CompTable4 <- merge(x = CompTable3, y = gene_seq_conv, by = c("MGI"), all = TRUE)
  CompTable5 <- merge(x = CompTable4, y = gene_seq_conv_pcoc, by = c("MGI"), all = TRUE)
  # There are non-unique rows (as Umod); checked with 'CompTable5[grep("Umod", CompTable5$MGI),]' that
  # the duplicates are similar, so only the first row of each gene is kept
  CompTable6 <- merge(x = CompTable5, y = gene_seq_conv_tdg09, by = c("MGI"), all = TRUE)
  uCom <- unique(CompTable6$MGI)
  CompTableList <- CompTable6[match(uCom, CompTable6$MGI), ]

  print("prep compTableList ok")

  plot_season <- barplot_DE_refCross(df_cat, CompTableList)
  print(plot_season)

  ggsave(plot_season$DE_public_ref_leg, filename = paste0(output_dir, "plot_biblio_all_cat.pdf"), height = 0.35 * dim(df_cat)[1], width = 17, units = "cm", limitsize = FALSE)
  ggsave(plot_season$DE_public_ref_leg_facet, filename = paste0(output_dir, "plot_biblio_all_cat_facet.pdf"), height = (3 + 0.45 * dim(df_cat)[1]) / 3, width = 17, units = "cm", limitsize = FALSE)

  ## Kidney cell localization
  #Goal - represent the localization of DE genes in kidney cells, using single cell data (from Park 2018 paper)
  print("Cell localization enrichment")

  dMusParkMax_stat_df <- as.data.frame(table(dMusParkMax2$max.tissue))
  s <- sum(dMusParkMax_stat_df$Freq)
  dMusParkMax_stat_df$prop <- (100 * dMusParkMax_stat_df$Freq) / s #Calculate proportion of genes in cells (Over the max number of genes, X% have CD cells as max)
  colnames(dMusParkMax_stat_df) <- c("Cell", "fq_real", "prop_real")

  Cell_prop_figure <- histo_DE_CellLoc_prop(DE_list = res_season_padj01_onl, dMusParkMax_stat_df = dMusParkMax_stat_df)


  ## GO analyses

  print("Analyse GO")
  print(head(results))

  name_GO_pdf <- paste0(output_dir, "figures_GO.pdf")
  GO_fam_season <- GOwithCP(results = results, name_GO_pdf = name_GO_pdf,
                            Alltableok = Alltableok, gene = NULL)
  #Table background used is the count table before removing na lines (genes expressed in all ind)

  # Same analysis restricted to the genes of each category. A failure in one
  # category is reported as a warning and does not stop the other analyses.
  GO_fam_season_cat <- lapply(c("up", "down", "diversification", "constraint"), function(category) {
    print("Analyse GO")
    print(category)
    gene_cat <- df_cat$gene[which(df_cat[, category])]
    print(gene_cat)
    print("gene_cat_ok")
    # gene_cat is a vector: its length (not its dim) tells whether it is empty
    if (length(gene_cat) == 0) {
      print("absent category")
      return(NULL)
    }
    category_pdf <- paste0(output_dir, paste0("figures_", category, "_GO.pdf"))
    devices_before <- dev.list()
    tryCatch(
      GOwithCP(results = results[rownames(results) %in% gene_cat, ],
               name_GO_pdf = category_pdf, Alltableok = Alltableok, gene = NULL),
      error = function(e) {
        # Close any graphics device left open by the failed analysis
        for (device in setdiff(dev.list(), devices_before)) {
          dev.off(device)
        }
        warning("GO analysis failed for category '", category, "': ",
                conditionMessage(e), call. = FALSE)
        NULL
      }
    )
  })


  print("GO DE genes done")

  #Create table with list of DE genes and intersection genes from seq analyses
  DE_list_df <- as.data.frame(rownames(res_season_padj01_onl))
  colnames(DE_list_df) <- "MGI"
  gene <- merge(DE_list_df, gene_seq_conv, by = c("MGI"), all = TRUE)
  gene <- subset(gene, select = c(MGI))

  name_GO_pdf <- paste0(output_dir, "figures_GO_DEandintersection.pdf")
  GO_DE_seq <- GOwithCP(results = results, name_GO_pdf = name_GO_pdf, Alltableok = Alltableok,
                        gene = gene)

  print("GO DE + seq genes done")
}

#################################
# TADAM 2 RUN SIMULATION ANALYSES
#################################


# Shuffle expression values between the two species of each "couple".
#
# A couple tag of the form "sp1@sp2" (column `couple`) designates two species
# whose samples may exchange their values. For such a couple:
#   1. the same number of samples is drawn at random from each species (the
#      smaller of the two sample counts), the remaining samples are dropped;
#   2. gene by gene, a fair coin decides whether the two balanced sample sets
#      keep their own values or receive the values of the other species
#      (in a random sample order).
# A couple tag without "@" designates an unpaired species: its samples are
# copied unchanged.
#
# Args:
#   norm_table_tmp: genes x samples table (matrix or data.frame); its column
#     names must be the sample ids stored in `coldata_tmp$final_name_short`.
#   coldata_tmp: data.frame with the columns `couple`, `sp_short_name` and
#     `final_name_short`.
#
# Returns:
#   A data.frame with one row per gene and one column per retained sample.
#   Columns are grouped couple by couple, so their order is NOT the column
#   order of `norm_table_tmp`. Column names are the sample ids, kept verbatim.
random_permutation_persp_balanced <- function(norm_table_tmp, coldata_tmp){
  
  couple_tags <- as.character(coldata_tmp$couple)
  species <- as.character(coldata_tmp$sp_short_name)
  # ids are used to index named vectors, so they must be character, not factor
  sample_ids <- as.character(coldata_tmp$final_name_short)
  
  shuffle_list <- lapply(unique(couple_tags), function(couple){
    couple_v <- strsplit(couple, "@")[[1]]
    
    # two species in the tag: balance the sample sets, then swap gene by gene
    if (length(couple_v) == 2) {
      id_sp1 <- sample_ids[species == couple_v[1]]
      id_sp2 <- sample_ids[species == couple_v[2]]
      # same number of samples on both sides so that values can be exchanged
      minnb <- min(length(id_sp1), length(id_sp2))
      id_sp1_balance <- sample(id_sp1, minnb, replace = FALSE)
      id_sp2_balance <- sample(id_sp2, minnb, replace = FALSE)
      ids <- c(id_sp1_balance, id_sp2_balance)
      
      shuffled <- apply(norm_table_tmp, 1, function(gene){
        source_ids <- ids
        # draw 1: keep the values in place, draw 2: exchange sp1 and sp2
        if (sample(c(1, 2), size = 1) == 2) {
          new_s1 <- sample(id_sp2_balance, size = minnb, replace = FALSE)
          new_s2 <- sample(id_sp1_balance, size = minnb, replace = FALSE)
          source_ids <- c(new_s1, new_s2)
        }
        gene[source_ids]
      })
      
      # apply() returns samples x genes, the table is genes x samples
      shuffled_dat <- as.data.frame(t(shuffled))
      names(shuffled_dat) <- ids
      return(shuffled_dat)
    }
    
    # unpaired species: nothing to exchange
    if (length(couple_v) == 1) {
      ids <- sample_ids[species == couple_v[1]]
      # drop = FALSE keeps a one-sample species as a one-column table
      unchanged_dat <- as.data.frame(norm_table_tmp[, ids, drop = FALSE])
      names(unchanged_dat) <- ids
      return(unchanged_dat)
    }
    
    stop("Unexpected couple tag (expected 'sp' or 'sp1@sp2'): ", couple,
         call. = FALSE)
  })
  
  if (length(shuffle_list) == 0) {
    return(data.frame())
  }
  
  # cbind() on data frames keeps the column names untouched, whereas
  # as.data.frame(<list>) would pass them through make.names() and break the
  # match between sample ids and the row names of the sample sheet.
  do.call(cbind, shuffle_list)
}


# Pair arid and mesic species inside each permutation group and select the
# samples that will take part in the permutation.
#
# Inside every permutation group, arid species (cond_season == "arid") are
# randomly matched one-to-one with mesic species (cond_season == "mesic"); the
# number of pairs is the smaller of the two species counts. Both species of a
# pair receive the couple tag "arid_sp@mesic_sp"; unmatched species keep their
# own name as tag. For each pair, the same number of samples is then drawn at
# random from both species (e.g. 2 and 3 samples -> 2 samples each).
#
# Args:
#   coldata_tmp: sample sheet with the columns `sp_short_name`, `cond_season`,
#     `final_name_short` and the column named by `column_permutation`.
#   column_permutation: name of the column of `coldata_tmp` defining the
#     permutation groups. When NULL (default, kept for existing callers) the
#     value is taken from a `column_permutation` variable visible from the
#     calling function, then from the global environment.
#
# Returns:
#   A list with
#     inds_to_shuffle_list: named list (one entry per couple tag); paired
#       entries hold `id_sp1_balance` / `id_sp2_balance`, unpaired entries
#       hold `ids`;
#     inds_list: unnamed vector of all retained sample ids;
#     couple_l: the couple tags, in the order of `inds_to_shuffle_list`.
permut_inds_selection_by_permut_group <- function(coldata_tmp, column_permutation = NULL){
  
  # must be captured before anything else is evaluated
  caller_env <- parent.frame()
  
  # Historically `column_permutation` was read as a free variable, which made
  # the argument of the calling function invisible here. Look it up explicitly.
  if (is.null(column_permutation)) {
    column_permutation <- get0("column_permutation", envir = caller_env,
                               inherits = TRUE, ifnotfound = NULL)
  }
  if (is.null(column_permutation)) {
    column_permutation <- get0("column_permutation", envir = globalenv(),
                               inherits = TRUE, ifnotfound = NULL)
  }
  if (is.null(column_permutation)) {
    stop("column_permutation needed: name of the coldata column defining the permutation groups",
         call. = FALSE)
  }
  
  species <- as.character(coldata_tmp$sp_short_name)
  sample_ids <- coldata_tmp$final_name_short
  
  coldata_tmp$couple <- species
  coldata_tmp$permutation_groups <- coldata_tmp[, column_permutation]
  
  for (group in unique(coldata_tmp$permutation_groups)) {
    # which() drops NA comparisons, explicit vectors avoid any clash between
    # the loop variable and a column of the sample sheet
    in_group <- coldata_tmp$permutation_groups == group
    x_sp_l <- unique(species[which(in_group & coldata_tmp$cond_season == "arid")])
    m_sp_l <- unique(species[which(in_group & coldata_tmp$cond_season == "mesic")])
    
    min_sp <- min(length(x_sp_l), length(m_sp_l))
    if (min_sp > 0) {
      # random one-to-one matching of arid and mesic species
      x_sp_l <- sample(x_sp_l, min_sp)
      m_sp_l <- sample(m_sp_l, min_sp)
      
      for (i in seq_len(min_sp)) {
        pair <- c(x_sp_l[i], m_sp_l[i])
        coldata_tmp$couple[species %in% pair] <- paste0(pair[1], "@", pair[2])
      }
    }
  }
  
  # Then work couple by couple to pick the samples
  list_couple <- unique(coldata_tmp$couple)
  
  inds_to_shuffle_list <- lapply(list_couple, function(couple){
    couple_v <- strsplit(couple, "@")[[1]]
    
    if (length(couple_v) == 2) {
      id_sp1 <- sample_ids[species == couple_v[1]]
      id_sp2 <- sample_ids[species == couple_v[2]]
      # balance the number of samples of both species
      minnb <- min(length(id_sp1), length(id_sp2))
      list(id_sp1_balance = sample(id_sp1, minnb, replace = FALSE),
           id_sp2_balance = sample(id_sp2, minnb, replace = FALSE))
    } else if (length(couple_v) == 1) {
      # unpaired species: all its samples are kept, none is permuted
      list(ids = sample_ids[species == couple_v[1]])
    } else {
      NULL
    }
  })
  names(inds_to_shuffle_list) <- list_couple
  
  list(inds_to_shuffle_list = inds_to_shuffle_list,
       inds_list = unlist(inds_to_shuffle_list, use.names = FALSE),
       couple_l = list_couple)
}


# Shuffle expression values between the two species of each couple, using a
# sample selection prepared beforehand.
#
# Args:
#   norm_table_tmp: genes x samples table (matrix or data.frame) whose column
#     names are sample ids.
#   inds_to_shuffle_list: named list, one entry per couple tag, shaped like
#     list("sp1@sp2" = list(id_sp1_balance = ..., id_sp2_balance = ...),
#          "sp3"     = list(ids = ...), ...)
#     as returned in `$inds_to_shuffle_list` by
#     permut_inds_selection_by_permut_group().
#
# For a paired entry ("sp1@sp2"), each gene is handled independently: with a
# probability of 0.5 the values stay in place, otherwise the sp1 samples
# receive the values of the sp2 samples and vice versa (in a random sample
# order). The swap concerns the whole couple at once. Entries without "@" are
# copied unchanged.
#
# Returns:
#   A data.frame with one row per gene and one column per listed sample,
#   grouped couple by couple; column names are the sample ids, kept verbatim.
random_permutation_from_inds_to_shuffle_list <- function(norm_table_tmp, inds_to_shuffle_list){
  
  shuffle_list <- lapply(names(inds_to_shuffle_list), function(couple){
    couple_v <- strsplit(couple, "@")[[1]]
    
    # only shuffle if there are 2 species in the couple tag
    if (length(couple_v) == 2) {
      id_sp1_balance <- inds_to_shuffle_list[[couple]]$id_sp1_balance
      id_sp2_balance <- inds_to_shuffle_list[[couple]]$id_sp2_balance
      ids <- c(id_sp1_balance, id_sp2_balance)
      nb_inds <- min(length(id_sp1_balance), length(id_sp2_balance))
      
      # Disabled alternative kept for reference: permute only a random subset
      # of the individuals of each species instead of the whole couple.
      # suffle_genes_couple <- apply(norm_table_tmp,1,function(gene){
      #   s_order = c(id_sp1_balance, id_sp2_balance)
      #   names(s_order) = s_order
      #   
      #   nb_inds_to_permut = sum(sample(c(0,1),size=nb_inds, replace = T))
      #   
      #   new_s1 = sample(id_sp2_balance, size = nb_inds_to_permut, replace = F)
      #   new_s2 = sample(id_sp1_balance, size = nb_inds_to_permut, replace = F)
      #   
      #   s_order[new_s1] = new_s2
      #   s_order[new_s2] = new_s1
      # 
      #   return(gene[s_order])
      # })
      
      shuffled <- apply(norm_table_tmp, 1, function(gene){
        source_ids <- ids
        # draw 1: keep the values in place, draw 2: exchange sp1 and sp2
        if (sample(c(1, 2), size = 1) == 2) {
          new_s1 <- sample(id_sp2_balance, size = nb_inds, replace = FALSE)
          new_s2 <- sample(id_sp1_balance, size = nb_inds, replace = FALSE)
          source_ids <- c(new_s1, new_s2)
        }
        gene[source_ids]
      })
      
      # apply() returns samples x genes, the table is genes x samples
      shuffled_dat <- as.data.frame(t(shuffled))
      names(shuffled_dat) <- ids
      return(shuffled_dat)
    }
    
    if (length(couple_v) == 1) {
      ids <- inds_to_shuffle_list[[couple]]$ids
      # drop = FALSE keeps a one-sample species as a one-column table
      unchanged_dat <- as.data.frame(norm_table_tmp[, ids, drop = FALSE])
      names(unchanged_dat) <- ids
      return(unchanged_dat)
    }
    
    stop("Unexpected couple tag (expected 'sp' or 'sp1@sp2'): ", couple,
         call. = FALSE)
  })
  
  if (length(shuffle_list) == 0) {
    return(data.frame())
  }
  
  # cbind() on data frames keeps the column names untouched, whereas
  # as.data.frame(<list>) would pass them through make.names() and the
  # caller could no longer select columns by sample id.
  do.call(cbind, shuffle_list)
}






# Run one permutation replicate: pair arid and mesic species, shuffle the
# expression values inside each pair, run DESeq2 on the shuffled table and on
# the matching non-shuffled ("true") table, and count the significant genes
# per category for both.
#
# The sample sheet is read from the variable `coldata` of the enclosing
# (global) environment; its row names must be the sample ids
# (`final_name_short`). Helper functions `random_permutation_persp_balanced`,
# `make_stat_table_per_gene` and `make_categories` are defined elsewhere in
# this file.
#
# Args:
#   x: unused; lets the function be called through lapply()-like iterators.
#   seed: RNG seed of the replicate; drawn at random when NULL.
#   norm_table: normalized count table (genes x samples).
#   out_table: when TRUE also return the DESeq2 objects and the tables used.
#   design: design formula given to DESeqDataSetFromMatrix().
#   column_permutation: name of the `coldata` column defining the groups
#     inside which species are paired.
#
# Returns:
#   list(nb_gene_by_cat_df_shuffled, res_compa) and, when out_table is TRUE,
#   additionally dds_shuflled, dds_reduced_true, coldata_new and
#   norm_table_shuffled. `nb_gene_by_cat_df_shuffled` has one "shuffling" row
#   and one "true" row; if the replicate fails it has a single row of zeros
#   whose `type` is the error message (the elements not yet computed are NULL).
Random_label_new <- function(x, seed = NULL, norm_table = NULL, out_table = FALSE, design = NULL, column_permutation = NULL){
  
  category_cols <- c("up", "down", "constraint", "diversification", "unclassified")
  
  # Everything that can be returned is initialised here so that a failure
  # inside tryCatch() still leaves a well-defined return value.
  dds_reduced_true <- NULL
  dds_shuflled <- NULL
  res_compa <- NULL
  coldata_new <- NULL
  norm_table_shuffled <- NULL
  
  # one 0/1 flag per sample: 1 = sample used in this replicate
  present_inds <- rep(1, nrow(coldata))
  names(present_inds) <- rownames(coldata)
  present_inds_df <- as.data.frame(t(present_inds))
  
  # Missing arguments are only reported here; the replicate then fails inside
  # tryCatch() and is returned as an error row instead of stopping a batch.
  if (is.null(norm_table)) {
    print(norm_table)
    print("ERROR norm_table needed!")
  }
  
  if (is.null(column_permutation)) {
    print(column_permutation)
    print("ERROR column_permutation needed!")
  }
  
  if (is.null(design)) {
    print(design)
    print("ERROR design needed!")
  }
  
  eff_seed <- if (!is.null(seed)) seed else sample(1:2^15, 1)
  set.seed(eff_seed)
  couple_l_str <- ""
  # right-hand side of the (one-sided) design formula, NA when design is NULL
  design_label <- as.character(design)[2]
  
  # Fit DESeq2 on an already normalized table: size factors are forced to 1 so
  # that no further normalisation is applied.
  fit_deseq <- function(count_table, coldata_used) {
    dds <- DESeqDataSetFromMatrix(countData = count_table,
                                  colData = coldata_used,
                                  design = design)
    unit_size_factors <- rep(1, ncol(count_table))
    names(unit_size_factors) <- colnames(count_table)
    sizeFactors(dds) <- unit_size_factors
    dds <- DESeq(dds)
    
    res <- results(dds, lfcThreshold = .4, altHypothesis = "greaterAbs")
    res <- res[order(res$padj), ]
    res_df <- as.data.frame(dplyr::mutate(as.data.frame(res),
                                          sig = ifelse(res$padj < 0.1, "FDR<0.1", "Not Sig")),
                            row.names = rownames(res))
    list(dds = dds,
         norm_counts = counts(dds, normalized = TRUE),
         res_df = res_df)
  }
  
  # One-row table with the number of significant genes (padj < 0.1) per
  # category, plus the names of these genes.
  count_categories <- function(norm_counts, coldata_used, res_df, label) {
    # which() ignores NA padj; a bare logical index would add all-NA rows
    sig_rows <- which(res_df$padj < 0.1)
    if (length(sig_rows) > 0) {
      stat_df <- make_stat_table_per_gene(norm_counts, coldata_used, res_df[sig_rows, ])
      cat_df <- make_categories(stat_df)
      genes <- stat_df$gene
      nb_df <- as.data.frame(t(apply(cat_df[, category_cols], 2, sum)))
      type_label <- label
    } else {
      genes <- c()
      nb_df <- data.frame(up = 0, down = 0, constraint = 0, diversification = 0, unclassified = 0)
      type_label <- paste0(label, " (no sig gene)")
    }
    nb_df$eff_seed <- eff_seed
    nb_df$type <- type_label
    list(genes = genes, nb_df = nb_df)
  }
  
  # Row returned when the replicate fails; same columns, in the same order,
  # as a successful replicate.
  make_error_row <- function(err) {
    err_df <- data.frame(up = 0, down = 0, constraint = 0, diversification = 0, unclassified = 0)
    err_df$eff_seed <- eff_seed
    err_df$type <- conditionMessage(err)
    err_df$common_genes_sign <- 0
    err_df$design <- design_label
    err_df$couple <- couple_l_str
    cbind(err_df, as.data.frame(present_inds_df))
  }
  
  # The value of tryCatch() is assigned: an assignment made inside the error
  # handler would stay local to the handler and be lost.
  nb_gene_by_cat_df_shuffled <- tryCatch({
    # loaded here so that the function also works on fresh parallel workers
    library(DESeq2)
    library(dplyr)
    library(outliers)
    
    # Define for each arid species a "twin" mesic species inside the same
    # permutation group; counts will be permuted between twins.
    coldata$couple <- coldata$sp_short_name
    coldata$permutation_groups <- coldata[, column_permutation]
    
    for (group in unique(coldata$permutation_groups)) {
      in_group <- coldata$permutation_groups == group
      x_sp_l <- unique(coldata$sp_short_name[which(in_group & coldata$cond_season == "arid")])
      m_sp_l <- unique(coldata$sp_short_name[which(in_group & coldata$cond_season == "mesic")])
      
      min_sp <- min(length(x_sp_l), length(m_sp_l))
      if (min_sp > 0) {
        x_sp_l <- sample(x_sp_l, min_sp)
        m_sp_l <- sample(m_sp_l, min_sp)
        
        for (i in seq_len(min_sp)) {
          pair <- c(x_sp_l[i], m_sp_l[i])
          coldata$couple[coldata$sp_short_name %in% pair] <- paste0(pair[1], "@", pair[2])
        }
      }
    }
    
    # Run permutation
    norm_table_shuffled <- random_permutation_persp_balanced(norm_table, coldata[, c("final_name_short", "couple", "sp_short_name")])
    
    couple_l <- unique(coldata$couple)
    couple_l_str <- paste0(couple_l[grepl("@", couple_l, fixed = TRUE)], collapse = ";")
    
    # The balancing step may have removed samples: restrict the sample sheet
    # to the retained ones and flag the removed ones as absent.
    kept_ids <- colnames(norm_table_shuffled)
    if (!all(kept_ids %in% rownames(coldata))) {
      stop("sample ids of the shuffled table do not match rownames(coldata)")
    }
    coldata_new <- coldata[kept_ids, ]
    present_inds_df[!colnames(present_inds_df) %in% kept_ids] <- 0
    
    # DESeq2 on the shuffled table
    fit_shuffled <- fit_deseq(norm_table_shuffled, coldata_new)
    dds_shuflled <- fit_shuffled$dds
    res_shuffled_df <- fit_shuffled$res_df
    shuffled <- count_categories(fit_shuffled$norm_counts, coldata_new, res_shuffled_df, "shuffling")
    
    # DESeq2 on the true table restricted to the same samples, so that both
    # analyses have the same replicates in each couple
    fit_true <- fit_deseq(norm_table[, kept_ids], coldata_new)
    dds_reduced_true <- fit_true$dds
    results_df_reduced_true <- fit_true$res_df
    true_run <- count_categories(fit_true$norm_counts, coldata_new, results_df_reduced_true, "true")
    
    # gene-level comparison of both analyses
    results_df_reduced_true$gene <- rownames(results_df_reduced_true)
    res_shuffled_df$gene <- rownames(res_shuffled_df)
    
    res_compa <- dplyr::left_join(results_df_reduced_true, res_shuffled_df, by = "gene", suffix = c(".true", ".shuffled"))
    res_compa <- res_compa[, c("gene", "log2FoldChange.true", "pvalue.true", "padj.true", "sig.true", "log2FoldChange.shuffled", "pvalue.shuffled", "padj.shuffled", "sig.shuffled")]
    res_compa$eff_seed <- eff_seed
    
    # summary table: shuffled row first, true row second
    true_run$nb_df$common_genes_sign <- sum(true_run$genes %in% shuffled$genes)
    shuffled$nb_df$common_genes_sign <- sum(shuffled$genes %in% true_run$genes)
    
    summary_df <- rbind(shuffled$nb_df, true_run$nb_df)
    summary_df$design <- design_label
    summary_df$couple <- couple_l_str
    cbind(summary_df, as.data.frame(present_inds_df))
    
  }, error = function(err){
    print("ERROR")
    print(err)
    make_error_row(err)
  })
  
  if (out_table) {
    return(list(dds_shuflled = dds_shuflled,
                dds_reduced_true = dds_reduced_true,
                coldata_new = coldata_new,
                norm_table_shuffled = norm_table_shuffled,
                nb_gene_by_cat_df_shuffled = nb_gene_by_cat_df_shuffled,
                res_compa = res_compa))
  }
  
  list(nb_gene_by_cat_df_shuffled = nb_gene_by_cat_df_shuffled,
       res_compa = res_compa)
}



# Run one permutation replicate on a fixed selection of samples, optionally
# after replacing the truly significant genes by non-significant genes of
# similar expression level.
#
# Steps: (1) pair species and select samples with
# permut_inds_selection_by_permut_group(); (2) run DESeq2 on the true table
# restricted to these samples; (3) if `replace_signi_genes` is TRUE, remove
# the significant genes and append, for each of them, a copy (suffix "_dup")
# of a randomly drawn non-significant gene from the same expression bin;
# (4) shuffle the table inside each couple and run DESeq2 again; (5) count
# the significant genes per category for both analyses.
#
# The sample sheet is read from the variable `coldata` of the enclosing
# (global) environment; its row names must be the sample ids. Helper functions
# `permut_inds_selection_by_permut_group`,
# `random_permutation_from_inds_to_shuffle_list`, `make_stat_table_per_gene`
# and `make_categories` are defined elsewhere in this file.
#
# Args:
#   x: unused; lets the function be called through lapply()-like iterators.
#   seed: RNG seed of the replicate; drawn at random when NULL.
#   norm_table: normalized count table (genes x samples).
#   out_table: when TRUE also return the DESeq2 objects and the tables used.
#   design: design formula given to DESeqDataSetFromMatrix().
#   column_permutation: name of the `coldata` column defining the groups
#     inside which species are paired.
#   replace_signi_genes: see step (3).
#
# Returns:
#   list(nb_gene_by_cat_df_shuffled, res_compa) and, when out_table is TRUE,
#   additionally dds_shuflled, dds_reduced_true, coldata_new and
#   norm_table_shuffled. `res_compa` is currently always NULL (comparison
#   disabled). `nb_gene_by_cat_df_shuffled` has one "shuffling" row and one
#   "true" row; if the replicate fails it has a single row of zeros whose
#   `type` is the error message.
Random_label_with_removed_signi_genes <- function(x, seed = NULL, norm_table = NULL, out_table = FALSE, design = NULL, column_permutation = NULL, replace_signi_genes = FALSE){
  
  category_cols <- c("up", "down", "constraint", "diversification", "unclassified")
  # gene-level comparison true vs shuffled; switched off in the original code
  build_res_compa <- FALSE
  
  # Everything that can be returned is initialised here so that a failure
  # inside tryCatch() still leaves a well-defined return value.
  dds_reduced_true <- NULL
  dds_shuflled <- NULL
  res_compa <- NULL
  coldata_new <- NULL
  norm_table_shuffled <- NULL
  
  # one 0/1 flag per sample: 1 = sample used in this replicate
  present_inds <- rep(0, nrow(coldata))
  names(present_inds) <- rownames(coldata)
  present_inds_df <- as.data.frame(t(present_inds))
  
  # Missing arguments are only reported here; the replicate then fails inside
  # tryCatch() and is returned as an error row instead of stopping a batch.
  if (is.null(norm_table)) {
    print(norm_table)
    print("ERROR norm_table needed!")
  }
  
  if (is.null(column_permutation)) {
    print(column_permutation)
    print("ERROR column_permutation needed!")
  }
  
  if (is.null(design)) {
    print(design)
    print("ERROR design needed!")
  }
  
  eff_seed <- if (!is.null(seed)) seed else sample(1:2^15, 1)
  set.seed(eff_seed)
  couple_l_str <- ""
  # right-hand side of the (one-sided) design formula, NA when design is NULL
  design_label <- as.character(design)[2]
  
  # Fit DESeq2 on an already normalized table: size factors are forced to 1 so
  # that no further normalisation is applied.
  fit_deseq <- function(count_table, coldata_used) {
    dds <- DESeqDataSetFromMatrix(countData = count_table,
                                  colData = coldata_used,
                                  design = design)
    unit_size_factors <- rep(1, ncol(count_table))
    names(unit_size_factors) <- colnames(count_table)
    sizeFactors(dds) <- unit_size_factors
    dds <- DESeq(dds)
    
    res <- results(dds, lfcThreshold = .4, altHypothesis = "greaterAbs")
    res <- res[order(res$padj), ]
    res_df <- as.data.frame(dplyr::mutate(as.data.frame(res),
                                          sig = ifelse(res$padj < 0.1, "FDR<0.1", "Not Sig")),
                            row.names = rownames(res))
    list(dds = dds,
         norm_counts = counts(dds, normalized = TRUE),
         res_df = res_df)
  }
  
  # One-row table with the number of significant genes (padj < 0.1) per
  # category, their total and their split by log2 fold change sign, plus the
  # names of these genes.
  count_categories <- function(norm_counts, coldata_used, res_df, label) {
    # which() ignores NA padj; a bare logical index would add all-NA rows
    sig_rows <- which(res_df$padj < 0.1)
    if (length(sig_rows) > 0) {
      stat_df <- make_stat_table_per_gene(norm_counts, coldata_used, res_df[sig_rows, ])
      cat_df <- make_categories(stat_df)
      genes <- stat_df$gene
      nb_df <- as.data.frame(t(apply(cat_df[, category_cols], 2, sum)))
      type_label <- label
    } else {
      genes <- c()
      nb_df <- data.frame(up = 0, down = 0, constraint = 0, diversification = 0, unclassified = 0)
      type_label <- paste0(label, " (no sig gene)")
    }
    nb_df$eff_seed <- eff_seed
    nb_df$type <- type_label
    
    sig_lfc <- res_df$log2FoldChange[sig_rows]
    nb_df$Total <- length(genes)
    # NOTE(review): ".P" counts negative and ".N" non-negative fold changes;
    # presumably this follows the direction of the contrast - to confirm.
    nb_df$LFC.XM.P <- sum(sig_lfc < 0, na.rm = TRUE)
    nb_df$LFC.XM.N <- sum(sig_lfc >= 0, na.rm = TRUE)
    
    list(genes = genes, nb_df = nb_df)
  }
  
  # Row returned when the replicate fails; same columns, in the same order,
  # as a successful replicate.
  make_error_row <- function(err) {
    err_df <- data.frame(up = 0, down = 0, constraint = 0, diversification = 0, unclassified = 0)
    err_df$eff_seed <- eff_seed
    err_df$type <- conditionMessage(err)
    err_df$Total <- 0
    err_df$LFC.XM.P <- 0
    err_df$LFC.XM.N <- 0
    err_df$common_genes_sign <- 0
    err_df$design <- design_label
    err_df$couple <- couple_l_str
    cbind(err_df, as.data.frame(present_inds_df))
  }
  
  # The value of tryCatch() is assigned: an assignment made inside the error
  # handler would stay local to the handler and be lost.
  nb_gene_by_cat_df_shuffled <- tryCatch({
    # loaded here so that the function also works on fresh parallel workers
    library(DESeq2)
    library(dplyr)
    library(outliers)
    
    # species pairing and sample selection
    permut_selection <- permut_inds_selection_by_permut_group(coldata)
    inds_list <- permut_selection$inds_list
    
    couple_l <- permut_selection$couple_l
    couple_l_str <- paste0(couple_l[grepl("@", couple_l, fixed = TRUE)], collapse = ";")
    
    # Some samples were removed to get the same number of replicates in both
    # species of each couple: restrict the sample sheet accordingly.
    coldata_new <- coldata[inds_list, ]
    present_inds_df[inds_list] <- 1
    
    # First run the true scenario to get the significant genes
    fit_true <- fit_deseq(norm_table[, inds_list], coldata_new)
    dds_reduced_true <- fit_true$dds
    results_df_reduced_true <- fit_true$res_df
    true_run <- count_categories(fit_true$norm_counts, coldata_new, results_df_reduced_true, "true")
    genes_sign_reduced <- true_run$genes
    
    table_to_shuffle <- norm_table
    
    if (replace_signi_genes && length(genes_sign_reduced) > 0) {
      
      percentage_gene_de <- length(genes_sign_reduced) / nrow(norm_table)
      
      # Expression bins: 5% wide by default, wider when many genes are
      # significant so that a bin filled with significant genes still holds
      # enough other genes to sample from. At least 2 break points are needed
      # by cut(). (seq(length.out = k) gives k - 1 bins.)
      split_time <- max(2, floor(1 / max(0.05, percentage_gene_de * 2.1)))
      
      basemean <- rowMeans(norm_table[, inds_list])
      # unique(): tied quantiles would otherwise make cut() fail
      bin_breaks <- unique(quantile(basemean, probs = seq(0, 1, length.out = split_time)))
      q_expression <- cut(basemean, breaks = bin_breaks, include.lowest = TRUE)
      names(q_expression) <- names(basemean)
      
      # number of significant genes in each bin
      q_expression_signi_genes_table <- table(q_expression[genes_sign_reduced])
      
      # candidate genes: all but the significant ones
      q_expression_without_signi <- q_expression[!names(q_expression) %in% genes_sign_reduced]
      
      # per bin, draw as many candidate genes as there are significant genes
      dup_genes <- unlist(lapply(names(q_expression_signi_genes_table), function(quant) {
        nb_gene_to_sample <- q_expression_signi_genes_table[quant]
        genes_to_sample <- names(q_expression_without_signi)[which(q_expression_without_signi == quant)]
        sample(genes_to_sample, size = nb_gene_to_sample, replace = FALSE)
      }))
      
      # new table: significant genes removed, drawn genes appended as copies;
      # drop = FALSE keeps a table even when a single gene is selected
      norm_table_no_signi <- norm_table[!rownames(norm_table) %in% genes_sign_reduced, , drop = FALSE]
      dup_rows <- norm_table_no_signi[dup_genes, , drop = FALSE]
      rownames(dup_rows) <- paste0(rownames(dup_rows), "_dup")
      
      table_to_shuffle <- rbind(norm_table_no_signi, dup_rows)
    }
    
    # shuffle each gene between the selected samples of each couple
    norm_table_shuffled <- random_permutation_from_inds_to_shuffle_list(table_to_shuffle, permut_selection$inds_to_shuffle_list)
    
    # DESeq2 on the shuffled table
    fit_shuffled <- fit_deseq(norm_table_shuffled[, inds_list], coldata_new)
    dds_shuflled <- fit_shuffled$dds
    res_shuffled_df <- fit_shuffled$res_df
    shuffled <- count_categories(fit_shuffled$norm_counts, coldata_new, res_shuffled_df, "shuffling")
    genes_sign_shuffled <- shuffled$genes
    
    if (build_res_compa) {
      results_df_reduced_true$gene <- rownames(results_df_reduced_true)
      res_shuffled_df$gene <- rownames(res_shuffled_df)
      
      res_compa <- dplyr::left_join(results_df_reduced_true, res_shuffled_df, by = "gene", suffix = c(".true", ".shuffled"))
      res_compa <- res_compa[, c("gene", "log2FoldChange.true", "pvalue.true", "padj.true", "sig.true", "log2FoldChange.shuffled", "pvalue.shuffled", "padj.shuffled", "sig.shuffled")]
      res_compa$eff_seed <- eff_seed
    }
    
    # summary table: shuffled row first, true row second
    true_run$nb_df$common_genes_sign <- sum(genes_sign_reduced %in% genes_sign_shuffled)
    shuffled$nb_df$common_genes_sign <- sum(genes_sign_shuffled %in% genes_sign_reduced)
    
    summary_df <- rbind(shuffled$nb_df, true_run$nb_df)
    summary_df$design <- design_label
    summary_df$couple <- couple_l_str
    cbind(summary_df, as.data.frame(present_inds_df))
    
  }, error = function(err){
    print("ERROR")
    print(err)
    make_error_row(err)
  })
  
  if (out_table) {
    return(list(dds_shuflled = dds_shuflled,
                dds_reduced_true = dds_reduced_true,
                coldata_new = coldata_new,
                norm_table_shuffled = norm_table_shuffled,
                nb_gene_by_cat_df_shuffled = nb_gene_by_cat_df_shuffled,
                res_compa = res_compa))
  }
  
  list(nb_gene_by_cat_df_shuffled = nb_gene_by_cat_df_shuffled,
       res_compa = res_compa)
}


calcul_pvalue_from_res_shuffling = function(res_shuffling) {
  samp <- unique(res_shuffling)
  df_table_wider <- samp %>%
    mutate(type = gsub(" (no sig gene)","", type, fixed = T)) %>%
    dplyr::select(c(up:LFC.XM.N)) %>%
    pivot_wider(names_from = c(type),
                values_from = c(up:unclassified,Total:LFC.XM.N)) 
  
  if (nrow(df_table_wider) >= 10) {
    
    pwTotal <- wilcox.test(df_table_wider$Total_shuffling, df_table_wider$Total_true, paired = TRUE)$p.value
    pwUp  <- wilcox.test(df_table_wider$up_shuffling, df_table_wider$up_true, paired = TRUE)$p.value
    pwDo  <- wilcox.test(df_table_wider$down_shuffling, df_table_wider$down_true, paired = TRUE)$p.value
    pwCo  <- wilcox.test(df_table_wider$constraint_shuffling, df_table_wider$constraint_true, paired = TRUE)$p.value
    pwDi  <- wilcox.test(df_table_wider$diversification_shuffling, df_table_wider$diversification_true, paired = TRUE)$p.value
    pwLFC.XM.P  <- wilcox.test(df_table_wider$LFC.XM.P_shuffling, df_table_wider$LFC.XM.P_true, paired = TRUE)$p.value
    pwLFC.XM.N  <- wilcox.test(df_table_wider$LFC.XM.N_shuffling, df_table_wider$LFC.XM.N_true, paired = TRUE)$p.value
    
    pwvalues=c(pwTotal,pwUp,pwDo,pwCo,pwDi,pwLFC.XM.P,pwLFC.XM.N)
    names(pwvalues)=c("Total","Up","Down","Conserved","Diversification","LFC.XM.P","LFC.XM.N")
    
  } else {
    pwvalues=c(1,1,1,1,1,1,1)
    names(pwvalues)=c("Total","Up","Down","Conserved","Diversification","LFC.XM.P","LFC.XM.N")
  }
  
  pTotal <- table(df_table_wider$Total_shuffling>df_table_wider$Total_true)
  pTotal <- ifelse(is.na(pTotal["TRUE"]),0, pTotal["TRUE"]/sum(pTotal))
  pUp <- table(df_table_wider$up_shuffling>df_table_wider$up_true)
  pUp <-  ifelse(is.na(pUp["TRUE"]),0, pUp["TRUE"]/sum(pUp))
  pDo <- table(df_table_wider$down_shuffling>df_table_wider$down_true)
  pDo <-  ifelse(is.na(pDo["TRUE"]),0, pDo["TRUE"]/sum(pDo))
  pCo <- table(df_table_wider$constraint_shuffling>df_table_wider$constraint_true)
  pCo <-  ifelse(is.na(pCo["TRUE"]),0, pCo["TRUE"]/sum(pCo))
  pDi <- table(df_table_wider$diversification_shuffling>df_table_wider$diversification_true)
  pDi <-  ifelse(is.na(pDi["TRUE"]),0, pDi["TRUE"]/sum(pDi))
  
  pLFC.XM.P <- table(df_table_wider$diversification_shuffling>df_table_wider$diversification_true)
  pLFC.XM.P <-  ifelse(is.na(pLFC.XM.P["TRUE"]),0, pLFC.XM.P["TRUE"]/sum(pLFC.XM.P))
  pLFC.XM.N <- table(df_table_wider$LFC.XM.N_shuffling>df_table_wider$LFC.XM.N_true)
  pLFC.XM.N <-  ifelse(is.na(pLFC.XM.N["TRUE"]),0, pLFC.XM.N["TRUE"]/sum(pLFC.XM.N))
  
  pvalues=c(pTotal,pUp,pDo,pCo,pDi,pLFC.XM.P,pLFC.XM.N)
  names(pvalues)=c("Total","Up","Down","Conserved","Diversification","LFC.XM.P","LFC.XM.N")
  
  msTotal=mean(df_table_wider$Total_shuffling, na.rm = TRUE)
  msUp=mean(df_table_wider$up_shuffling, na.rm = TRUE)
  msLo=mean(df_table_wider$down_shuffling, na.rm = TRUE)
  msCo=mean(df_table_wider$constraint_shuffling, na.rm = TRUE)
  msDi=mean(df_table_wider$diversification_shuffling, na.rm = TRUE)
  
  msLFC.XM.P=mean(df_table_wider$LFC.XM.P_shuffling, na.rm = TRUE)
  msLFC.XM.N=mean(df_table_wider$LFC.XM.N_shuffling, na.rm = TRUE)
  
  msvalues=c(msTotal,msUp,msLo,msCo,msDi,msLFC.XM.P,msLFC.XM.N)
  names(msvalues)=c("Total","Up","Down","Conserved","Diversification","LFC.XM.P","LFC.XM.N")
  
  mtTotal=mean(df_table_wider$Total_true, na.rm = TRUE)
  mtUp=mean(df_table_wider$up_true, na.rm = TRUE)
  mtLo=mean(df_table_wider$down_true, na.rm = TRUE)
  mtCo=mean(df_table_wider$constraint_true, na.rm = TRUE)
  mtDi=mean(df_table_wider$diversification_true, na.rm = TRUE)
  
  mtLFC.XM.P=mean(df_table_wider$LFC.XM.P_true, na.rm = TRUE)
  mtLFC.XM.N=mean(df_table_wider$LFC.XM.N_true, na.rm = TRUE)
  
  mtvalues=c(mtTotal,mtUp,mtLo,mtCo,mtDi,mtLFC.XM.P,mtLFC.XM.N)
  names(mtvalues)=c("Total","Up","Down","Conserved","Diversification","LFC.XM.P","LFC.XM.N")
  
  dTotal=mean(df_table_wider$Total_true-df_table_wider$Total_shuffling, na.rm = TRUE)
  dUp=mean(df_table_wider$up_true-df_table_wider$up_shuffling, na.rm = TRUE)
  dLo=mean(df_table_wider$down_true-df_table_wider$down_shuffling, na.rm = TRUE)
  dCo=mean(df_table_wider$constraint_true-df_table_wider$constraint_shuffling, na.rm = TRUE)
  dDi=mean(df_table_wider$diversification_true-df_table_wider$diversification_shuffling, na.rm = TRUE)
  
  dLFC.XM.P=mean(df_table_wider$LFC.XM.P_true-df_table_wider$LFC.XM.P_shuffling, na.rm = TRUE)
  dLFC.XM.N=mean(df_table_wider$LFC.XM.N_true-df_table_wider$LFC.XM.N_shuffling, na.rm = TRUE)
  
  dvalues=c(dTotal,dUp,dLo,dCo,dDi,dLFC.XM.P,dLFC.XM.N)
  names(dvalues)=c("Total","Up","Down","Conserved","Diversification","LFC.XM.P","LFC.XM.N")
  
  sddTotal=sd(df_table_wider$Total_true-df_table_wider$Total_shuffling, na.rm = TRUE)
  sddUp=sd(df_table_wider$up_true-df_table_wider$up_shuffling, na.rm = TRUE)
  sddLo=sd(df_table_wider$down_true-df_table_wider$down_shuffling, na.rm = TRUE)
  sddCo=sd(df_table_wider$constraint_true-df_table_wider$constraint_shuffling, na.rm = TRUE)
  sddDi=sd(df_table_wider$diversification_true-df_table_wider$diversification_shuffling, na.rm = TRUE)
  
  sddLFC.XM.P=sd(df_table_wider$LFC.XM.P_true-df_table_wider$LFC.XM.P_shuffling, na.rm = TRUE)
  sddLFC.XM.N=sd(df_table_wider$LFC.XM.N_true-df_table_wider$LFC.XM.N_shuffling, na.rm = TRUE)
  
  sddvalues=c(sddTotal,sddUp,sddLo,sddCo,sddDi,sddLFC.XM.P,sddLFC.XM.N)
  names(sddvalues)=c("Total","Up","Down","Conserved","Diversification","LFC.XM.P","LFC.XM.N")
  
  sdtTotal=sd(df_table_wider$Total_true, na.rm = TRUE)
  sdtUp=sd(df_table_wider$up_true, na.rm = TRUE)
  sdtLo=sd(df_table_wider$down_true, na.rm = TRUE)
  sdtCo=sd(df_table_wider$constraint_true, na.rm = TRUE)
  sdtDi=sd(df_table_wider$diversification_true, na.rm = TRUE)
  
  sdtLFC.XM.P=sd(df_table_wider$LFC.XM.P_true, na.rm = TRUE)
  sdtLFC.XM.N=sd(df_table_wider$LFC.XM.N_true, na.rm = TRUE)
  
  sdtvalues=c(sdtTotal,sdtUp,sdtLo,sdtCo,sdtDi,sdtLFC.XM.P,sdtLFC.XM.N)
  names(sdtvalues)=c("Total","Up","Down","Conserved","Diversification","LFC.XM.P","LFC.XM.N")
  
  sdsTotal=sd(df_table_wider$Total_shuffling, na.rm = TRUE)
  sdsUp=sd(df_table_wider$up_shuffling, na.rm = TRUE)
  sdsLo=sd(df_table_wider$down_shuffling, na.rm = TRUE)
  sdsCo=sd(df_table_wider$constraint_shuffling, na.rm = TRUE)
  sdsDi=sd(df_table_wider$diversification_shuffling, na.rm = TRUE)
  
  sdsLFC.XM.P=sd(df_table_wider$LFC.XM.P_shuffling, na.rm = TRUE)
  sdsLFC.XM.N=sd(df_table_wider$LFC.XM.N_shuffling, na.rm = TRUE)
  
  sdsvalues=c(sdsTotal,sdsUp,sdsLo,sdsCo,sdsDi,sdsLFC.XM.P,sdsLFC.XM.N)
  names(sdsvalues)=c("Total","Up","Down","Conserved","Diversification","LFC.XM.P","LFC.XM.N")
  
  q25_tTotal=quantile(df_table_wider$Total_true, 0.25)
  q25_tUp=quantile(df_table_wider$up_true, 0.25)
  q25_tLo=quantile(df_table_wider$down_true, 0.25)
  q25_tCo=quantile(df_table_wider$constraint_true, 0.25)
  q25_tDi=quantile(df_table_wider$diversification_true, 0.25)
  
  q25_tLFC.XM.P=quantile(df_table_wider$LFC.XM.P_true, 0.25)
  q25_tLFC.XM.N=quantile(df_table_wider$LFC.XM.N_true, 0.25)
  
  q25_tvalues=c(q25_tTotal,q25_tUp,q25_tLo,q25_tCo,q25_tDi,q25_tLFC.XM.P,q25_tLFC.XM.N)
  names(q25_tvalues)=c("Total","Up","Down","Conserved","Diversification","LFC.XM.P","LFC.XM.N")
  
  q25_sTotal=quantile(df_table_wider$Total_shuffling, 0.25)
  q25_sUp=quantile(df_table_wider$up_shuffling, 0.25)
  q25_sLo=quantile(df_table_wider$down_shuffling, 0.25)
  q25_sCo=quantile(df_table_wider$constraint_shuffling, 0.25)
  q25_sDi=quantile(df_table_wider$diversification_shuffling, 0.25)
  
  q25_sLFC.XM.P=quantile(df_table_wider$LFC.XM.P_shuffling, 0.25)
  q25_sLFC.XM.N=quantile(df_table_wider$LFC.XM.N_shuffling, 0.25)
  
  q25_svalues=c(q25_sTotal,q25_sUp,q25_sLo,q25_sCo,q25_sDi,q25_sLFC.XM.P,q25_sLFC.XM.N)
  names(q25_svalues)=c("Total","Up","Down","Conserved","Diversification","LFC.XM.P","LFC.XM.N")
  
  
  q50_tTotal=quantile(df_table_wider$Total_true, 0.50)
  q50_tUp=quantile(df_table_wider$up_true, 0.50)
  q50_tLo=quantile(df_table_wider$down_true, 0.50)
  q50_tCo=quantile(df_table_wider$constraint_true, 0.50)
  q50_tDi=quantile(df_table_wider$diversification_true, 0.50)
  
  q50_tLFC.XM.P=quantile(df_table_wider$LFC.XM.P_true, 0.50)
  q50_tLFC.XM.N=quantile(df_table_wider$LFC.XM.N_true, 0.50)
  
  q50_tvalues=c(q50_tTotal,q50_tUp,q50_tLo,q50_tCo,q50_tDi,q50_tLFC.XM.P,q50_tLFC.XM.N)
  names(q50_tvalues)=c("Total","Up","Down","Conserved","Diversification","LFC.XM.P","LFC.XM.N")
  
  q50_sTotal=quantile(df_table_wider$Total_shuffling, 0.50)
  q50_sUp=quantile(df_table_wider$up_shuffling, 0.50)
  q50_sLo=quantile(df_table_wider$down_shuffling, 0.50)
  q50_sCo=quantile(df_table_wider$constraint_shuffling, 0.50)
  q50_sDi=quantile(df_table_wider$diversification_shuffling, 0.50)
  
  q50_sLFC.XM.P=quantile(df_table_wider$LFC.XM.P_shuffling, 0.50)
  q50_sLFC.XM.N=quantile(df_table_wider$LFC.XM.N_shuffling, 0.50)
  
  q50_svalues=c(q50_sTotal,q50_sUp,q50_sLo,q50_sCo,q50_sDi,q50_sLFC.XM.P,q50_sLFC.XM.N)
  names(q50_svalues)=c("Total","Up","Down","Conserved","Diversification","LFC.XM.P","LFC.XM.N")
  
  
  q75_tTotal=quantile(df_table_wider$Total_true, 0.75)
  q75_tUp=quantile(df_table_wider$up_true, 0.75)
  q75_tLo=quantile(df_table_wider$down_true, 0.75)
  q75_tCo=quantile(df_table_wider$constraint_true, 0.75)
  q75_tDi=quantile(df_table_wider$diversification_true, 0.75)
  
  q75_tLFC.XM.P=quantile(df_table_wider$LFC.XM.P_true, 0.75)
  q75_tLFC.XM.N=quantile(df_table_wider$LFC.XM.N_true, 0.75)
  
  q75_tvalues=c(q75_tTotal,q75_tUp,q75_tLo,q75_tCo,q75_tDi,q75_tLFC.XM.P,q75_tLFC.XM.N)
  names(q75_tvalues)=c("Total","Up","Down","Conserved","Diversification","LFC.XM.P","LFC.XM.N")
  
  q75_sTotal=quantile(df_table_wider$Total_shuffling, 0.75)
  q75_sUp=quantile(df_table_wider$up_shuffling, 0.75)
  q75_sLo=quantile(df_table_wider$down_shuffling, 0.75)
  q75_sCo=quantile(df_table_wider$constraint_shuffling, 0.75)
  q75_sDi=quantile(df_table_wider$diversification_shuffling, 0.75)
  
  q75_sLFC.XM.P=quantile(df_table_wider$LFC.XM.P_shuffling, 0.75)
  q75_sLFC.XM.N=quantile(df_table_wider$LFC.XM.N_shuffling, 0.75)
  
  q75_svalues=c(q75_sTotal,q75_sUp,q75_sLo,q75_sCo,q75_sDi,q75_sLFC.XM.P,q75_sLFC.XM.N)
  names(q75_svalues)=c("Total","Up","Down","Conserved","Diversification","LFC.XM.P","LFC.XM.N")
  
  
  
  l=list(pvalues,pwvalues,msvalues,mtvalues,dvalues,sddvalues,sdtvalues,sdsvalues,q25_tvalues,q25_svalues,q50_tvalues, q50_svalues,q75_tvalues, q75_svalues)
  names(l)=c("pvalues","pvalues_wilc_paired","mean_shuffling_values","mean_true_values","delta_values","sd_delta_values","sd_true_values","sd_shuffling_values",
             "q25_true_values","q25_shuffling_values",
             "q50_true_values","q50_shuffling_values",
             "q75_true_values","q75_shuffling_values")
  res = l %>% as.data.frame() %>%
    rownames_to_column("type") %>%
    pivot_wider(names_from = c(type),
                values_from = c(pvalues:q75_shuffling_values),names_sep = "."
    )
  return(res)
}

# Run (or reload) the label-shuffling simulations for one expression dataset,
# compare the shuffled results with the true ones and plot the comparison.
#
# Arguments:
#   data_expr            list describing the dataset. Fields read here:
#                        Alltable, coldata, var1_design, var2_design,
#                        output_dir, test, nRand, nCPU, column_permutation,
#                        dataset_name.
#   replace_signi_genes  forwarded to Random_label_with_removed_signi_genes();
#                        also selects the "_rep_signi" output prefix and
#                        disables the two "common significant genes" plots.
#   run_whole_dataset    if TRUE, DESeq2 is run on the true labels and the
#                        number of genes per category is printed, even when
#                        the simulation results are already saved on disk.
#   adjust_table         if TRUE (and var1_design is not NULL), counts are
#                        adjusted with ComBat_seq (batch = var1_design,
#                        group = var2_design) and the design is reduced to
#                        ~ var2_design.
#
# Side effects (all paths start with <output_dir>/res_shuffling_<dataset>_...):
#   *.RData          simulation results (only written when it does not exist)
#   *.exec_time.txt  duration of the simulations
#   *_pval.tsv       the returned table
#   *_plot_global.pdf, *_plot_per_tirage.pdf  (only when <= 12 couples)
#
# Returns the p-value table: one "All_data" row plus one row per `couple`,
# as computed by calcul_pvalue_from_res_shuffling(), with a `dataset` column.
make_exp_analyses_only_simu = function(data_expr, replace_signi_genes=FALSE, run_whole_dataset = TRUE, adjust_table = TRUE) {
  
  print(paste("Load data", data_expr$dataset_name))
  
  print(paste("output_dir", data_expr$output_dir))
  
  All_table = data_expr$Alltable
  coldata = data_expr$coldata
  var1_design = data_expr$var1_design
  var2_design = data_expr$var2_design
  output_dir = data_expr$output_dir
  test  = data_expr$test
  nRand = data_expr$nRand
  nCPU = data_expr$nCPU
  column_permutation = data_expr$column_permutation
  dataset_name = data_expr$dataset_name
  
  if (replace_signi_genes) {
    output_prefix = paste0(output_dir,"/res_shuffling_",dataset_name,"_",nRand,"simu_rep_signi")
    dataset_name_2 = paste0(dataset_name,"_replace_signi")
  } else {
    output_prefix = paste0(output_dir,"/res_shuffling_",dataset_name,"_",nRand,"simu")
    dataset_name_2 = dataset_name
  }
  
  RDATA_output = paste0(output_prefix,".RData")
  
  # The true-label DESeq2 run is needed either on request or because the
  # simulations (which start from its normalised counts) must be computed.
  if (run_whole_dataset || ! file.exists(RDATA_output)) {
    if (! is.null(var1_design)){
      design = as.formula(paste("~ ", var1_design, "+", var2_design))
      if (adjust_table) {
        # The batch variable is handled by ComBat_seq, so it leaves the design.
        design = as.formula(paste("~ ", var2_design))
      }
    } else {
      adjust_table = FALSE
      design = as.formula(paste("~ ", var2_design))
    }
    
    All_table = All_table[,colnames(All_table) %in% coldata$ID]
    
    mt_genes = grep(pattern = "mt-", rownames(All_table))
    if (length(mt_genes) > 0) {
      Alltableok <- All_table[-mt_genes,] #Remove 7 mitochondrial genes (Co1, Co2, Cytb, Nd1, Nd2, Nd5, Nd6)
    } else {
      Alltableok = All_table
    }
    
    
    Alltableok1 <- round(Alltableok[complete.cases(Alltableok),])
    Alltableok1[is.na(Alltableok1)]=0
    # Keep only genes with a non-zero count in every sample.
    n=apply(Alltableok1,1,function(x){sum(x==0)})
    #counts=counts[n<ncol(counts),]
    Alltableok1=Alltableok1[n==0,]
    
    
    
    coldata <- coldata[ which(coldata$ID %in% colnames(All_table)), ]
    coldata <- coldata[ order(match(coldata$ID, colnames(Alltableok1))), ]#Reorder table such coldata and matrix has same order
    coldata$cond_season = factor(coldata$cond_season, levels = c("mesic","arid"))
    
    coldata[,column_permutation] = factor(coldata[,column_permutation])
    
    rownames(coldata) <- paste0(coldata$sp_short_name,"_", coldata$ID_number)
    coldata$final_name_short <- rownames(coldata)
    
    colnames(Alltableok1) <- coldata$final_name_short
   # coldata$study_nb <- as.factor(coldata$study_nb)
    
    AlltableMat <- as.matrix(Alltableok1)
    
    if (test){
      # Test mode: work on at most the first 2000 genes (fewer if the
      # filtered table is smaller, instead of failing on the subscript).
      AlltableMat=AlltableMat[seq_len(min(2000, nrow(AlltableMat))), , drop = FALSE]
    }
    
    print("dim(AllMat)")
    print(dim(AlltableMat))
    
    if (adjust_table) {
      print(paste0("Adjust table with Combat using batch=",
                   var1_design, " (",
                   paste0(unique(coldata[,var1_design]), collapse = ","),")",
                   " & group=", var2_design, " (",
                   paste0(unique(coldata[,var2_design]), collapse = ","),")")
      )
      print(paste0("Design: ", design))
      AlltableMat_adjusted <- ComBat_seq(AlltableMat, batch=coldata[,var1_design], group=coldata[,var2_design])
      AlltableMat_ok <- AlltableMat_adjusted
    } else {
      print("No adjustement")
      print(paste0("Design: ", design))
      AlltableMat_ok <- AlltableMat
    }
    
    ## DE seq
    ddsInput <- DESeqDataSetFromMatrix(countData = AlltableMat_ok,
                                       colData = coldata,
                                       design = design)
    
    dds_total_true <- DESeq(ddsInput)
    resBasemean_total_true <- counts(dds_total_true, normalized = TRUE)
    
    if(run_whole_dataset) {
      res_total_true <- results(dds_total_true,  lfcThreshold=.4, altHypothesis="greaterAbs")
      
      print(summary(res_total_true))
      
      res_total_true_df <- as.data.frame(res_total_true)
      results_df_total_true = as.data.frame(
        dplyr::mutate(as.data.frame(res_total_true_df),
                      sig=ifelse(res_total_true_df$padj<0.1, "FDR<0.1", "Not Sig")),
        row.names=rownames(res_total_true_df))
      
      deseq_var_df_total_true = make_stat_table_per_gene(resBasemean_total_true, coldata, results_df_total_true[results_df_total_true$padj < 0.1,])
      df_cat_total_true = make_categories(deseq_var_df_total_true)
      
      nb_gene_by_cat_total_total_true = apply(df_cat_total_true[, c("up", "down", "constraint", "diversification", "unclassified")] , 2, sum)
      nb_gene_by_cat_total_total_true = as.data.frame(t(nb_gene_by_cat_total_total_true))
      
      nb_gene_by_cat_total_total_true$Total = sum(results_df_total_true$sig == "FDR<0.1", na.rm = TRUE)
      nb_gene_by_cat_total_total_true$LFC.XM.P = sum(subset(results_df_total_true, sig == "FDR<0.1")$log2FoldChange < 0, na.rm = TRUE)
      nb_gene_by_cat_total_total_true$LFC.XM.N = sum(subset(results_df_total_true, sig == "FDR<0.1")$log2FoldChange >= 0, na.rm = TRUE)
      
      nb_gene_by_cat_total_total_true$type="total true"
      nb_gene_by_cat_total_total_true$design=as.character(design)[2]
      nb_gene_by_cat_total_total_true$dataset_name = dataset_name
      
      print(nb_gene_by_cat_total_total_true)
    }
  }
  
  if ( ! file.exists(RDATA_output)) {
    
    print(paste("no simu in ", RDATA_output))
    ### prep simu
    
    # One resampling job; `x` is the index handed out by parLapply().
    run_resampling_function = function(x) {
      Random_label_with_removed_signi_genes(x,
                                            norm_table = round(resBasemean_total_true,0),
                                            seed = NULL,
                                            design = design,
                                            column_permutation = column_permutation, 
                                            replace_signi_genes = replace_signi_genes)
    } 
    
    
    # Run nRand resampling jobs on nCPU workers and stack their two outputs
    # (per-category gene counts and comparison results) into data frames.
    run_in_parallel <- function(nRand, run_resampling_function,nCPU=5) {
      print(paste("start resampling ", nRand))
      clus <- makeCluster(nCPU) #Parallelisation
      # Always release the workers, including when a job or the
      # post-processing below raises an error.
      on.exit(stopCluster(clus), add = TRUE)
      clusterExport(clus,  envir=environment(), varlist=c("Random_label_new","random_permutation_persp_balanced","coldata",
                                                          "var1_design", "var2_design","design",
                                                          "column_permutation","resBasemean_total_true",
                                                          "mapvalues", "make_stat_table_per_gene",
                                                          "make_categories","run_resampling_function",
                                                          "permut_inds_selection_by_permut_group","random_permutation_from_inds_to_shuffle_list",
                                                          "Random_label_with_removed_signi_genes","replace_signi_genes"
      ))
      
      multi_RandLab_raw = parLapply(clus, seq_len(nRand), run_resampling_function)
      
      multi_RandLab_raw_nb_gene_by_cat_df_shuffled = lapply(multi_RandLab_raw, function(l){l$nb_gene_by_cat_df_shuffled})
      multi_RandLab_raw_res_compa = lapply(multi_RandLab_raw, function(l){l$res_compa})
      
      multi_RandLab_nb_gene_by_cat_df_shuffled = as.data.frame(do.call(rbind, multi_RandLab_raw_nb_gene_by_cat_df_shuffled))
      multi_RandLab_res_compa = as.data.frame(do.call(rbind, multi_RandLab_raw_res_compa))
      
      list(nb_gene_by_cat_df_shuffled =multi_RandLab_nb_gene_by_cat_df_shuffled,
           res_compa=multi_RandLab_res_compa)
      
    }
    
    start_time <- Sys.time()
    
    res_shuffling_raw = run_in_parallel(nRand, run_resampling_function, nCPU=nCPU)
    
    shuffling_res_compa = res_shuffling_raw$res_compa
    res_shuffling = res_shuffling_raw$nb_gene_by_cat_df_shuffled
    
    # Columns whose name contains "_<digit>" are pasted together into a
    # single identifier of the draw ("tirage").
    inds = colnames(res_shuffling)[grep("_[0-9]",colnames(res_shuffling))]
    res_shuffling$tirage = apply(res_shuffling[inds],1, paste0, collapse="")
    
    end_time <- Sys.time()
    
    save(res_shuffling,shuffling_res_compa,file=RDATA_output)
    
    duration = (end_time - start_time)
    
    print(duration)
    
    duration_str = strftime(as.POSIXct("00:00:00", format="%H:%M:%S") + 
                              duration, format="%H:%M:%S")
    
    writeLines(paste("Execution time :", duration_str), paste0(output_prefix,".exec_time.txt"))
    
  } else {
    
    print( paste(RDATA_output, "exists") )
    print( paste("Load simulations saved in", RDATA_output))
    
    # Same path as the one written after a fresh simulation run; read it
    # directly instead of shelling out to `cat`.
    output_time_file = paste0(output_prefix,".exec_time.txt")
    if (file.exists(output_time_file)) {
      print(readLines(output_time_file))
    }
    
    # Restores `res_shuffling` and `shuffling_res_compa`.
    load(RDATA_output)
    print(dim(res_shuffling))
    print(dim(shuffling_res_compa))
    inds = colnames(res_shuffling)[grep("_[0-9]",colnames(res_shuffling))]
    #res_shuffling = subset(res_shuffling, type %in% c("shuffling","true","shuffling (no sig gene)"))
    #print(dim(res_shuffling))
  }
  # compute the p-values over all draws
  
  pval_simu_df = calcul_pvalue_from_res_shuffling(res_shuffling)
  pval_simu_df$couple = "All_data"
  
  
  # compute the p-values separately for each couple
  pval_simu_per_tirage_df  = res_shuffling %>% group_by(couple) %>% group_modify(~calcul_pvalue_from_res_shuffling(.x))
  
  pval_simu_df =
    rbind(pval_simu_df , pval_simu_per_tirage_df) %>%
    relocate(couple) %>%
    mutate(dataset = dataset_name_2 )
  
  write_tsv(pval_simu_df,file=paste0(output_prefix,"_pval.tsv"))
  
  
  # reformat output table: one column per (metric, type) pair
  
  res_shuffling_fdl = res_shuffling %>% tibble() %>% dplyr::select(-c(dplyr::all_of(inds),design)) %>% unique() %>%
    pivot_wider(names_sep = ".",
                names_from = type,
                values_from = c(up,down,constraint,diversification,unclassified,Total,LFC.XM.P,LFC.XM.N))
  
  # some metrics 
  
  nb_couple = length(table(res_shuffling$couple))
  
  # Make the couple labels readable as facet titles: within each label,
  # every third ";" becomes a line break and the others become a space.
  i=1
  while (length(grep(";",res_shuffling_fdl$couple)) > 0 ) {
    if (i == 3) {
      res_shuffling_fdl$couple = str_replace(res_shuffling_fdl$couple, ";","\n")
      i=1
    } else {
      res_shuffling_fdl$couple = str_replace(res_shuffling_fdl$couple, ";"," ")
      i = i + 1
    }
  }
  
  
  # global plots (all couples on the same panel)
  
  title_plot=paste(nb_couple, " shuffled sp couples (~", as.character(mean(table(res_shuffling$couple)/2)), " each)")
  
  # Paired-Wilcoxon p-values of the "All_data" row, one text label per metric,
  # anchored in the top-left corner of the matching panel.
  col_wilc = grep("pvalues_wilc_p", colnames(pval_simu_df))
  annotation_pval = as.data.frame(t(subset(pval_simu_df, couple == "All_data")[,col_wilc])) 
  colnames(annotation_pval) = "pval"
  annotation_pval$set   = vapply(str_split(rownames(annotation_pval),"\\.", n = 2), function(x){x[2]}, character(1)) # return Total, Up , ...
  annotation_pval$x     = -Inf
  annotation_pval$y     = Inf
  annotation_pval$hjust = 0
  annotation_pval$vjust = 1
  annotation_pval$color = "black"
  annotation_pval$label = paste0("pval = ",round(annotation_pval$pval,2))
  
  
  ptot = ggplot() + theme_bw() +
    geom_boxplot(data=res_shuffling_fdl, aes(x=Total.true,y=Total.shuffling, color = couple)) +
    geom_abline(slope=1, intercept=0) +
    ggtitle(paste0(dataset_name, "\nTotal (n=",title_plot,")")) + theme(legend.position = "none") +
    xlim(range(res_shuffling_fdl$Total.shuffling, res_shuffling_fdl$Total.true)) +
    ylim(range(res_shuffling_fdl$Total.shuffling, res_shuffling_fdl$Total.true)) +
    geom_text(data = subset(annotation_pval, set == "Total") , aes(x=x,y=y,label=label,hjust=hjust,vjust=vjust))
  
  pup = ggplot() + theme_bw() +
    geom_boxplot(data=res_shuffling_fdl, aes(x=up.true,y=up.shuffling, color = couple)) +  geom_abline(slope=1, intercept=0) +
    ggtitle("up") + theme(legend.position = "none") +
    xlim(range(res_shuffling_fdl$up.shuffling, res_shuffling_fdl$up.true)) +
    ylim(range(res_shuffling_fdl$up.shuffling, res_shuffling_fdl$up.true)) +
    geom_text(data = subset(annotation_pval, set == "Up") , aes(x=x,y=y,label=label,hjust=hjust,vjust=vjust))
  
  pdown = ggplot() + theme_bw() +
    geom_boxplot(data=res_shuffling_fdl, aes(x=down.true,y=down.shuffling, color = couple)) +  geom_abline(slope=1, intercept=0) +
    ggtitle("down") + theme(legend.position = "none") +
    xlim(range(res_shuffling_fdl$down.shuffling, res_shuffling_fdl$down.true)) +
    ylim(range(res_shuffling_fdl$down.shuffling, res_shuffling_fdl$down.true)) +
    geom_text(data = subset(annotation_pval, set == "Down") , aes(x=x,y=y,label=label,hjust=hjust,vjust=vjust))
  
  p_cons = ggplot() + theme_bw() +
    geom_boxplot(data=res_shuffling_fdl, aes(x=constraint.true,y=constraint.shuffling, color = couple)) +  geom_abline(slope=1, intercept=0) +
    ggtitle("constraint") + theme(legend.position = "none") +
    xlim(range(res_shuffling_fdl$constraint.shuffling, res_shuffling_fdl$constraint.true)) +
    ylim(range(res_shuffling_fdl$constraint.shuffling, res_shuffling_fdl$constraint.true)) +
    geom_text(data = subset(annotation_pval, set == "Conserved") , aes(x=x,y=y,label=label,hjust=hjust,vjust=vjust))
  
  p_divers = ggplot() + theme_bw() +
    geom_boxplot(data=res_shuffling_fdl, aes(x=diversification.true,y=diversification.shuffling, color = couple)) +  geom_abline(slope=1, intercept=0) +
    ggtitle("diversification") + theme(legend.position = "none") +
    xlim(range(res_shuffling_fdl$diversification.true, res_shuffling_fdl$diversification.shuffling)) +
    ylim(range(res_shuffling_fdl$diversification.true, res_shuffling_fdl$diversification.shuffling)) +
    geom_text(data = subset(annotation_pval, set == "Diversification") , aes(x=x,y=y,label=label,hjust=hjust,vjust=vjust))
  
  p_uncl = ggplot(res_shuffling_fdl, aes(x=unclassified.true,y=unclassified.shuffling, color = couple)) + theme_bw() +
    geom_boxplot() +   geom_abline(slope=1, intercept=0) +
    ggtitle("unclassified") + theme(legend.position = "none") +
    xlim(range(res_shuffling_fdl$unclassified.true, res_shuffling_fdl$unclassified.shuffling)) +
    ylim(range(res_shuffling_fdl$unclassified.true, res_shuffling_fdl$unclassified.shuffling))
  
  p_LFC.XM.P = ggplot() + theme_bw() +
    geom_boxplot(data=res_shuffling_fdl, aes(x=LFC.XM.P.true,y=LFC.XM.P.shuffling, color = couple)) +  geom_abline(slope=1, intercept=0) +
    ggtitle("LFC.XM.P") + theme(legend.position = "none") +
    xlim(range(res_shuffling_fdl$LFC.XM.P.true, res_shuffling_fdl$LFC.XM.P.shuffling)) +
    ylim(range(res_shuffling_fdl$LFC.XM.P.true, res_shuffling_fdl$LFC.XM.P.shuffling)) +
    geom_text(data = subset(annotation_pval, set == "LFC.XM.P") , aes(x=x,y=y,label=label,hjust=hjust,vjust=vjust))
  
  p_LFC.XM.N = ggplot() + theme_bw() +
    geom_boxplot(data=res_shuffling_fdl, aes(x=LFC.XM.N.true,y=LFC.XM.N.shuffling, color = couple)) +  geom_abline(slope=1, intercept=0) +
    ggtitle("LFC.XM.N") + theme(legend.position = "none") +
    xlim(range(res_shuffling_fdl$LFC.XM.N.true, res_shuffling_fdl$LFC.XM.N.shuffling)) +
    ylim(range(res_shuffling_fdl$LFC.XM.N.true, res_shuffling_fdl$LFC.XM.N.shuffling)) +
    geom_text(data = subset(annotation_pval, set == "LFC.XM.N") , aes(x=x,y=y,label=label,hjust=hjust,vjust=vjust))
  
  # Figures are only drawn and saved when there are at most 12 couples.
  if (nb_couple <= 12) {
    if (!replace_signi_genes) {
      
      ptot_tr_common = ggplot(res_shuffling_fdl, aes(x=common_genes_sign  ,y=Total.true)) + theme_bw() +
        geom_boxplot(aes(color = couple)) + geom_abline(slope=1, intercept=0) +
        ggtitle("nb common significant genes / Total.true") + theme(legend.position = "none") 
      
      ptot_sh_common = ggplot(res_shuffling_fdl, aes(x=common_genes_sign  ,y=Total.shuffling)) + theme_bw() +
        geom_boxplot(aes(color = couple)) + geom_abline(slope=1, intercept=0) +
        ggtitle("nb common significant genes / Total.shuffling") + theme(legend.position = "none") 
      
      p_global <- plot_grid(plotlist = list(ptot,pup,pdown,p_cons,p_divers,p_uncl,p_LFC.XM.P,p_LFC.XM.N,ptot_sh_common,ptot_tr_common), ncol = 2)
    } else {
      p_global <- plot_grid(plotlist = list(ptot,pup,pdown,p_cons,p_divers,p_uncl,p_LFC.XM.P,p_LFC.XM.N), ncol = 2)
    }
    
    ggsave(filename=paste0(output_prefix,"_plot_global.pdf"), plot=p_global,
           units = "cm", width = 22 ,height = 28,
           limitsize = FALSE)
    print(p_global)
    
    
    # per-draw plots, one facet per couple
    
    title_plot=paste(as.character(table(res_shuffling$couple)/2), collapse ="/")
    
    ptot = ggplot(res_shuffling_fdl, aes(x=Total.true,y=Total.shuffling, group = tirage,color = couple)) + theme_bw() +
      geom_boxplot() + geom_point(alpha = 0.1) + geom_abline(slope=1, intercept=0) +
      facet_wrap(~couple) + ggtitle(paste0(dataset_name,"\nTot (n=",title_plot,")")) + theme(legend.position = "none")
    
    pup = ggplot(res_shuffling_fdl, aes(x=up.true,y=up.shuffling, group = tirage,color = couple)) + theme_bw() +
      geom_boxplot() + geom_point(alpha = 0.1) + geom_abline(slope=1, intercept=0) +
      facet_wrap(~couple) + ggtitle("up") + theme(legend.position = "none")
    
    pdown = ggplot(res_shuffling_fdl, aes(x=down.true,y=down.shuffling, group = tirage,color = couple)) + theme_bw() +
      geom_boxplot() + geom_point(alpha = 0.1) + geom_abline(slope=1, intercept=0) +
      facet_wrap(~couple) + ggtitle("down") + theme(legend.position = "none")
    
    p_cons = ggplot(res_shuffling_fdl, aes(x=constraint.true,y=constraint.shuffling, group = tirage,color = couple)) + theme_bw() +
      geom_boxplot() + geom_point(alpha = 0.1) + geom_abline(slope=1, intercept=0) +
      facet_wrap(~couple) + ggtitle("constraint") + theme(legend.position = "none") 
    
    p_divers = ggplot(res_shuffling_fdl, aes(x=diversification.true,y=diversification.shuffling, group = tirage,color = couple)) + theme_bw() +
      geom_boxplot() + geom_point(alpha = 0.1) + geom_abline(slope=1, intercept=0) +
      facet_wrap(~couple) + ggtitle("diversification") + theme(legend.position = "none") 
    
    p_uncl = ggplot(res_shuffling_fdl, aes(x=unclassified.true,y=unclassified.shuffling, group = tirage,color = couple)) + theme_bw() +
      geom_boxplot() + geom_point(alpha = 0.1) + geom_abline(slope=1, intercept=0) +
      facet_wrap(~couple) + ggtitle("unclassified") + theme(legend.position = "none")
    
    p_LFC.XM.P = ggplot(res_shuffling_fdl, aes(x=LFC.XM.P.true,y=LFC.XM.P.shuffling, group = tirage,color = couple)) + theme_bw() +
      geom_boxplot() + geom_point(alpha = 0.1) + geom_abline(slope=1, intercept=0) +
      facet_wrap(~couple) + ggtitle("LFC.XM.P") + theme(legend.position = "none") 
    
    p_LFC.XM.N = ggplot(res_shuffling_fdl, aes(x=LFC.XM.N.true,y=LFC.XM.N.shuffling, group = tirage,color = couple)) + theme_bw() +
      geom_boxplot() + geom_point(alpha = 0.1) + geom_abline(slope=1, intercept=0) +
      facet_wrap(~couple) + ggtitle("LFC.XM.N") + theme(legend.position = "none") 
    
    
    p_tirage <- plot_grid(plotlist = list(ptot,pup,pdown,p_cons,p_divers,p_uncl,p_LFC.XM.P,p_LFC.XM.N), ncol = 1)
    
    print(p_tirage)
    
    # Page size grows with the number of facets.
    nb_col = ifelse(nb_couple > 4, 2,1)
    ggsave(filename=paste0(output_prefix,"_plot_per_tirage.pdf"), plot=p_tirage,
           units = "cm", width = nb_col * 11 ,height = 24+nb_couple*8,      limitsize = FALSE)
  }
  
  return(pval_simu_df)
}


