library(tidyverse)
# Clustering functions ----------------------------------------------------


library(cluster)

# Convert a vector of counts to counts per million (CPM): every value is
# expressed as its share of the vector total, multiplied by one million.
#
# x: numeric vector of counts.
# Returns a numeric vector of the same length as `x`.
CPM <- function(x) {
  total <- sum(x)
  share <- x / total
  share * 1e6
}

# Hierarchical clustering of the rows of `x`, cut into `k` clusters.
#
# Arguments:
#   x             numeric matrix or data.frame: one row per item to cluster
#                 (genes in this script), one column per condition.
#   k             number of clusters, passed to cutree().
#   dist_method   distance measure, passed to dist().
#   hclust_method agglomeration method, passed to hclust().
#   scale         if TRUE, transform `x` before clustering (see `foldchange`);
#                 if FALSE, `x` is clustered as given.
#   foldchange    only used when `scale` is TRUE.
#                 FALSE: centre and scale every column with base::scale().
#                 TRUE: add a 0.001 pseudocount, divide every column by the
#                 middle column (column ceiling(ncol(x) / 2)) and take log2.
#
# Returns the (possibly transformed) `x` with an additional `cluster` column
# holding the cluster number of each row.
# Raises an error when `dist_method` or `hclust_method` is not a supported value.
clustering <- function(x,
                       k,
                       dist_method = "euclidean",
                       hclust_method = "ward.D2",
                       scale = FALSE,
                       foldchange = FALSE) {
  dist_methods <- c("euclidean", "maximum", "manhattan",
                    "canberra", "binary", "minkowski")
  hclust_methods <- c("ward.D", "ward.D2", "single", "complete",
                      "average", "mcquitty", "median", "centroid")

  # fail loudly on an unsupported method instead of returning NULL
  if (length(dist_method) != 1 || !dist_method %in% dist_methods) {
    stop("unknown dist_method; use one of: ",
         paste0("\"", dist_methods, "\"", collapse = ", "),
         call. = FALSE)
  }
  if (length(hclust_method) != 1 || !hclust_method %in% hclust_methods) {
    stop("unknown hclust_method; use one of: ",
         paste0("\"", hclust_methods, "\"", collapse = ", "),
         call. = FALSE)
  }

  # optional transformation of the values before clustering
  if (scale && !foldchange) {
    # base:: makes it explicit that the function is meant, not the `scale` flag
    x <- base::scale(x = x, center = TRUE, scale = TRUE)
  } else if (scale && foldchange) {
    # the middle column is the reference every column is compared to
    reference_col <- ceiling(ncol(x) / 2)
    x <- x + 0.001 # pseudocount: avoids dividing by zero and log2(0)
    x <- x / x[, reference_col]
    x <- log2(x)
  }

  # calculate distances and clustering
  tree <- hclust(dist(x, method = dist_method), method = hclust_method)

  # cut in k clusters: named vector (row name -> cluster number)
  # (the variable name becomes the name of the column added below)
  cluster <- cutree(tree, k = k)

  # add the cluster information to the (transformed) input
  cbind(x, cluster)
}

# Plot the expression profiles of the genes of one cluster.
#
# df:    data.frame with gene names as row names, one column per condition
#        and a `cluster` column (the output of clustering() converted with
#        as.data.frame()).
# clust: number of the cluster to draw.
#
# Returns a ggplot object: one translucent line per gene, the per-condition
# mean as a red line and mean +/- one standard deviation as a ribbon.
plot_clustering <- function(df, clust) {

  # genes assigned to the requested cluster, and how many there are
  members <- filter(df, cluster == clust)
  cluster_size <- nrow(members)

  # store the gene names (row names) in a column so they can be used for grouping
  members$gene <- rownames(members)

  # long format for ggplot: one row per gene and condition
  long <- pivot_longer(members,
                       cols = -c(gene, cluster),
                       names_to = "condition",
                       values_to = "norm_counts")

  # x axis follows the column order of the input data.frame
  long$condition <- factor(long$condition, levels = colnames(df))

  plot_title <- paste("cluster ", clust, "\n n = ", cluster_size, "genes")
  lower_sd <- function(x) mean(x) - sd(x)
  upper_sd <- function(x) mean(x) + sd(x)

  # individual gene profiles
  profiles <- ggplot(data = long,
                     aes(y = norm_counts, x = condition, group = gene)) +
    geom_line(alpha = 0.2)

  # cluster summary: group = 1 pools all genes of the cluster
  mean_line <- stat_summary(aes(y = norm_counts, group = 1),
                            fun = mean,
                            color = "red",
                            geom = "line")
  sd_ribbon <- stat_summary(aes(y = norm_counts, group = 1),
                            fun = mean,
                            geom = "ribbon",
                            alpha = .3,
                            fill = "#EB5286",
                            fun.min = lower_sd,
                            fun.max = upper_sd)

  profiles +
    mean_line +
    sd_ribbon +
    theme_bw() +
    ylab("normalized expression (arbitrary unit)") +
    ggtitle(plot_title)
}

# Plot one cluster with the "Trip" and "TripCHX" time courses overlaid on a
# common numeric time axis.
#
# df:    data.frame with gene names as row names, a `cluster` column and
#        condition columns named "Trip<hours>h" or "TripCHX<hours>h"
#        (e.g. "Trip3h", "TripCHX1h").
# clust: number of the cluster to draw.
#
# Returns a ggplot object: one translucent line per gene and series, plus the
# mean (line) and mean +/- one standard deviation (ribbon) of each series.
plot_dual_clustering <- function(df, clust) {

  # select the cluster to plot and record its number of genes
  data <- df %>% filter(cluster == clust)
  n_genes <- nrow(data)
  # add gene name as a column
  data$gene <- rownames(data)
  # give the TripCHX series a 0h point fixed at 0
  # NOTE(review): presumably valid because the values are log2 fold changes
  # relative to Trip0h, so both series start at 0 -- confirm against the caller
  data$TripCHX0h <- 0

  # long format for ggplot: one row per gene and condition column
  data <- data %>% pivot_longer(cols = -c(gene, cluster),
                                names_to = "condition",
                                values_to = "norm_counts")

  # split "<series><hours>h" into a numeric time and a series name; the
  # patterns are anchored so multi-digit hours (e.g. "Trip12h") also parse
  column_name <- data$condition
  data$time <- as.numeric(sub("^Trip(CHX)?", "", sub("h$", "", column_name)))
  data$condition <- sub("[0-9.]+h$", "", column_name)

  # explicit series -> colour mapping, shared by the lines and the ribbons
  series_colours <- c(Trip = "#69b3a2", TripCHX = "#404080")

  ggplot(data = data, aes(y = norm_counts, x = time, group = gene)) +
    # one line per gene AND series, so the two time courses of a gene are
    # not joined into a single path
    geom_line(aes(group = paste(gene, condition)), alpha = 0.1) +
    stat_summary(aes(y = norm_counts, group = condition, color = condition), # mean as a line
                 fun = mean,
                 geom = "line") +
    stat_summary(aes(y = norm_counts, group = condition, fill = condition), # sd as ribbon
                 fun = mean,
                 geom = "ribbon",
                 alpha = .3,
                 fun.min = function(x) mean(x) - sd(x),
                 fun.max = function(x) mean(x) + sd(x)) +
    scale_fill_manual(values = series_colours) +
    scale_color_manual(values = series_colours) +
    theme_bw() +
    ylab("scaled gene expression (arbitrary unit)") +
    ggtitle(paste("cluster ", clust, "\n n = ", n_genes, "genes"))
}


# load data ---------------------------------------------------------------
# NOTE(review): the .RData file is expected to provide `dball`, a long table
# with (at least) the columns ensemblID, librairies and normReadsCounts.
load("results/dbNormCountsAll_exon.RData")

# wide table: one row per gene, one column of normalized counts per library
norm_counts <- dball %>%
  pivot_wider(id_cols = ensemblID,
              names_from = librairies,
              values_from = normReadsCounts) %>%
  as.data.frame()
# row names are the gene IDs with any ".<digits>" part removed
rownames(norm_counts) <- gsub("\\.[[:digit:]]*", "", norm_counts$ensemblID)

# gene lists read from the per-condition "filtred_genes" files, one entry each
expressedGenes <- lapply(
  c(Lympho_Resting = "results/filtred_genes_Lympho_Resting.csv",
    Lympho_Activated = "results/filtred_genes_Lympho_Activated.csv",
    Macro_Resting = "results/filtred_genes_Macro_Resting.csv",
    Macro_Activated = "results/filtred_genes_Macro_Activated.csv"),
  function(path) unlist(read.csv(file = path))
)
# Resting -----------------------------------------------

# keep the genes listed for resting lymphocytes, then only the resting
# Triptolide (+/- CHX) exon libraries
norm_counts_resting <- norm_counts[rownames(norm_counts) %in% expressedGenes[["Lympho_Resting"]], ]
norm_counts_resting <- norm_counts_resting[
  , grep("Resting_.h_Triptolide_(CHX_)*exon", colnames(norm_counts_resting)),
  drop = FALSE]

# libraries of each time point / treatment; drop = FALSE keeps a data.frame
# even when a single library matches, so rowMeans() below still works
Trip0h <- norm_counts_resting[
  , grep("0h_Triptolide_exon", colnames(norm_counts_resting)), drop = FALSE]
Trip1h <- norm_counts_resting[
  , grep("1h_Triptolide_exon", colnames(norm_counts_resting)), drop = FALSE]
Trip3h <- norm_counts_resting[
  , grep("3h_Triptolide_exon", colnames(norm_counts_resting)), drop = FALSE]
TripCHX1h <- norm_counts_resting[
  , grep("1h_Triptolide_CHX_exon", colnames(norm_counts_resting)), drop = FALSE]
TripCHX3h <- norm_counts_resting[
  , grep("3h_Triptolide_CHX_exon", colnames(norm_counts_resting)), drop = FALSE]

# mean over the libraries of each condition.
# The column order matters: Trip0h sits in the middle because clustering()
# with foldchange = TRUE divides every column by the middle one.
data_resting <- data.frame(Trip3h = rowMeans(Trip3h),
                           Trip1h = rowMeans(Trip1h),
                           Trip0h = rowMeans(Trip0h),
                           TripCHX1h = rowMeans(TripCHX1h),
                           TripCHX3h = rowMeans(TripCHX3h))

# Activated ---------------------------------------------------------------
# keep the genes listed for activated lymphocytes, then only the activated
# Triptolide (+/- CHX) exon libraries
norm_counts_activated <- norm_counts[rownames(norm_counts) %in% expressedGenes[["Lympho_Activated"]], ]
# columns are matched on the table being subset (same columns as norm_counts)
norm_counts_activated <- norm_counts_activated[
  , grep("Activated_.h_Triptolide_(CHX_)*exon", colnames(norm_counts_activated)),
  drop = FALSE]

# libraries of each time point / treatment (these variables are reused from
# the resting section and overwritten here); drop = FALSE keeps a data.frame
# even when a single library matches, so rowMeans() below still works
Trip0h <- norm_counts_activated[
  , grep("0h_Triptolide_exon", colnames(norm_counts_activated)), drop = FALSE]
Trip1h <- norm_counts_activated[
  , grep("1h_Triptolide_exon", colnames(norm_counts_activated)), drop = FALSE]
Trip3h <- norm_counts_activated[
  , grep("3h_Triptolide_exon", colnames(norm_counts_activated)), drop = FALSE]
TripCHX1h <- norm_counts_activated[
  , grep("1h_Triptolide_CHX_exon", colnames(norm_counts_activated)), drop = FALSE]
TripCHX3h <- norm_counts_activated[
  , grep("3h_Triptolide_CHX_exon", colnames(norm_counts_activated)), drop = FALSE]

# mean over the libraries of each condition.
# The column order matters: Trip0h sits in the middle because clustering()
# with foldchange = TRUE divides every column by the middle one.
data_activated <- data.frame(Trip3h = rowMeans(Trip3h),
                             Trip1h = rowMeans(Trip1h),
                             Trip0h = rowMeans(Trip0h),
                             TripCHX1h = rowMeans(TripCHX1h),
                             TripCHX3h = rowMeans(TripCHX3h))
# clustering -------------------------------------------------------------
# number of clusters, shared by the cutree() step and the plotting loops
n_clusters <- 15

# resting: cluster the log2 fold changes relative to the middle column (Trip0h)
test_resting <- as.data.frame(clustering(x = data_resting,
                                         k = n_clusters,
                                         dist_method = "euclidean",
                                         hclust_method = "ward.D2",
                                         scale = TRUE,
                                         foldchange = TRUE))

# number of genes per cluster
table(test_resting$cluster)
# one PDF with two pages per cluster (single-axis and dual time-course plots)
pdf(file = "test_clustering_resting.pdf")
for (i in seq_len(n_clusters)) {
  print(plot_clustering(test_resting, i))
  print(plot_dual_clustering(test_resting, i))
}
dev.off()
# quick on-screen check of the first cluster
plot_clustering(test_resting, 1)
plot_dual_clustering(test_resting, 1)


# activated: same procedure
test_activated <- as.data.frame(clustering(x = data_activated,
                                           k = n_clusters,
                                           dist_method = "euclidean",
                                           hclust_method = "ward.D2",
                                           scale = TRUE,
                                           foldchange = TRUE))

table(test_activated$cluster)
pdf(file = "test_clustering_activated.pdf")
for (i in seq_len(n_clusters)) {
  # both plots use the activated clustering
  print(plot_clustering(test_activated, i))
  print(plot_dual_clustering(test_activated, i))
}
dev.off()
