## CSC functions

library(tidyverse)
library("Biostrings")
library(ggpubr)

options(mc.cores = 14)



# ENV VARIABLES -----------------------------------------------------------

genetic_code <- c("Phe", "Phe", "Leu", "Leu", "Ser", "Ser", "Ser", "Ser", "Tyr", "Tyr", "Stop", "Stop", "Cys", "Cys", "Stop", "Trp",
                  "Leu", "Leu", "Leu", "Leu", "Pro", "Pro", "Pro", "Pro", "His", "His", "Gln", "Gln", "Arg", "Arg", "Arg", "Arg",
                  "Ile", "Ile", "Ile", "Met", "Thr", "Thr", "Thr", "Thr", "Asn", "Asn", "Lys", "Lys", "Ser", "Ser", "Arg", "Arg",
                  "Val", "Val", "Val", "Val", "Ala", "Ala", "Ala", "Ala", "Asp", "Asp", "Glu", "Glu", "Gly", "Gly", "Gly", "Gly")
names(genetic_code) <- names(GENETIC_CODE)

codon <- c("ATT", "ATC", "ATA", "CTT", "CTC", "CTA", "CTG", "TTA", "TTG", "GTT", "GTC", "GTA", "GTG", "TTT", "TTC", "ATG", "TGT", "TGC",
           "GCT", "GCC", "GCA", "GCG", "GGT", "GGC", "GGA", "GGG", "CCT", "CCC", "CCA", "CCG", "ACT", "ACC", "ACA", "ACG", "TCT", "TCC", 
           "TCA", "TCG", "AGT", "AGC", "TAT", "TAC", "TGG", "CAA", "CAG", "AAT", "AAC", "CAT", "CAC", "GAA", "GAG", "GAT", "GAC", "AAA", 
           "AAG", "CGT", "CGC", "CGA", "CGG", "AGA", "AGG", "TAA", "TAG", "TGA")

amino_acid <- c("Ala", "Arg", "Asn", "Asp", "Cys", "Gln", "Glu", "Gly", "His", "Ile", 
                "Leu", "Lys", "Met", "Phe", "Pro", "Ser", "Thr", "Trp", "Tyr", "Val",
                "Stop")
names_codon_aa <- c("AAA (Lys)", "AAC (Asn)", "AAG (Lys)", "AAU (Asn)", "ACA (Thr)", "ACC (Thr)", 
                    "ACG (Thr)", "ACU (Thr)", "AGA (Arg)", "AGC (Ser)", "AGG (Arg)", "AGU (Ser)", 
                    "AUA (Ile)", "AUC (Ile)", "AUG (Met)", "AUU (Ile)", "CAA (Gln)", "CAC (His)", 
                    "CAG (Gln)", "CAU (His)", "CCA (Pro)", "CCC (Pro)", "CCG (Pro)", "CCU (Pro)", 
                    "CGA (Arg)", "CGC (Arg)", "CGG (Arg)", "CGU (Arg)", "CUA (Leu)", "CUC (Leu)", 
                    "CUG (Leu)", "CUU (Leu)", "GAA (Glu)", "GAC (Glu)", "GAG (Glu)", "GAU (Glu)", 
                    "GCA (Ala)", "GCC (Ala)", "GCG (Ala)", "GCU (Ala)", "GGA (Gly)", "GGC (Gly)", 
                    "GGG (Gly)", "GGU (Gly)", "GUA (Val)", "GUC (Val)", "GUG (Val)", "GUU (Val)", 
                    "UAA (Stop)", "UAC (Tyr)", "UAG (Stop)", "UAU (Tyr)", "UCA (Ser)", "UCC (Ser)", 
                    "UCG (Ser)", "UCU (Ser)", "UGA (Stop)", "UGC (Cys)", "UGG (Trp)", "UGU (Cys)", 
                    "UUA (Leu)", "UUC (Phe)", "UUG (Leu)", "UUU (Phe)")

# fasta_functions ---------------------------------------------------------

check_cds <- function(cds) {
  
  start <- substring(cds, 1, 3) == 'ATG' | substring(cds, 1, 3) == 'atg' | 
    substring(cds, 1, 3) == 'CTG' | substring(cds, 1, 3) == 'ctg'
  stop <- substring(cds, nchar(cds) - 2,nchar(cds)) == 'TAG' | substring(cds, nchar(cds) - 2,nchar(cds)) == 'tag' |
    substring(cds, nchar(cds) - 2,nchar(cds)) == 'TAA' | substring(cds, nchar(cds) - 2,nchar(cds)) == 'taa' |
    substring(cds, nchar(cds) - 2,nchar(cds)) == 'TGA' | substring(cds, nchar(cds) - 2,nchar(cds)) == 'tga' 
  length <- nchar(cds) %% 3 == 0
  
  return(which(length & start & stop))
}

# codon functions ---------------------------------------------------------

count_codon <- function(cds) {
  analysed <- cds[check_cds(cds)]
  codon_counts <- mclapply(analysed, FUN = function(x) table(substring(as.character(x), seq(4, nchar(x) - 3, 3), seq(4, nchar(x) - 3, 3) + 2)))
  
  return(codon_counts)
}

freq_codon <- function(codon_counts) {
  codon_freq <- mclapply(codon_counts, function(x) x / sum(x))
  return(codon_freq)
}

add_missing_aa <- function(aa_freq){
  func <- function(x) {
    names_to_add <- amino_acid[which(!amino_acid %in% names(x))]
    existing_names <- names(x)
    x <- c(x, rep(0, times = length(which(!amino_acid %in% names(x)))))
    names(x) <- c(existing_names, names_to_add)
    x <- x[order(names(x))]
    return(x)
  }
  
  aa_freq_20 <- mclapply(aa_freq,  func)
  return(aa_freq_20)
}

add_missing_codon <- function(codon_freq){
  func <- function(x) {
    names_to_add <- codon[which(!codon %in% names(x))]
    existing_names <- names(x)
    x <- c(x, rep(0, times = length(which(!codon %in% names(x)))))
    names(x) <- c(existing_names, names_to_add)
    x <- x[order(names(x))]
    return(x)
  }
  
  codon_freq_64 <- mclapply(codon_freq,  func)
  return(codon_freq_64)
  
}

list_to_df <- function(list_codon, codon = TRUE) {
  if (codon) {
    codon_freq_64 <- add_missing_codon(list_codon)
  } else {
    codon_freq_64 <- add_missing_aa(list_codon)
  }
  seq_name <- names(codon_freq_64)
  sequence <- paste(codon_freq_64)
  df_codon <- t(data.frame(codon_freq_64))
  return(df_codon)
}

clean_codon_freq <- function(df_codon_freq, index) {
  index_clean <- index[!is.na(index)]
  codon_freq_clean <- df_codon_freq[rownames(df_codon_freq) %in% names(index_clean),]
  index_clean <- index_clean[order(names(index_clean))]
  codon_freq_clean <- codon_freq_clean[order(rownames(codon_freq_clean)),]
  
  cleaned_obj <- list(codon_freq_clean, index_clean)
  names(cleaned_obj) <- c("codon_freq_clean", "index_clean")
  
  if (all(rownames(codon_freq_clean) == names(index_clean))) {
    return(cleaned_obj)
  } else {
    stop("Error in clean_codon_freq function")
  }
  
}

translate <- function(cds) {
  codon <- substring(as.character(cds), 
                     seq(4, nchar(cds) - 3, 3), 
                     seq(4, nchar(cds) - 3, 3) + 2
           )
  aa <- c()
  for (i in codon) {
    aa <- c(aa, genetic_code[i])
  }
  return(aa)
}

freq_AA_single <- function(cds) {
  freq <- table(translate(cds))
  freq <- freq / sum(freq)
  return(freq)
}

freq_AA <- function(cds, cds_check = T) {
  if (cds_check) {
    analysed <- cds[check_cds(cds)]
  } else {
    analysed <- cds
  }
  freq <- mclapply(analysed, freq_AA_single)
  return(freq)
}

# data prep functions -----------------------------------------------------

prepare_index <- function(index_name, filtred = TRUE) {
  db <- "~/RMI2/gitlab/tdd/data/databases/2020-03-26_07-39-50_Subset_Data_processed.csv"
  table <- as_tibble(read.csv(db))
  
  index <- table %>% select(gene_id, transcript_id, index_name)
  
  if (filtred) {
    # Determine the cell type
    if (length(grep("Lympho",index_name, value = TRUE)) == 1 ) {
      cell <- "Lympho"
    }  else if (length(grep("Macro",index_name, value = TRUE)) == 1 ) {
      cell <- "Macro"
    }
    # Determine in the state type
    if (length(grep("Activated", index_name, value = TRUE)) == 1 ) {
      state <- "Activated"
    }  else if (length(grep("Resting", index_name, value = TRUE)) == 1 ) {
      state <- "Resting"
    }
    # load expressed Genes
    expressedGenes <- list(Lympho_Resting = unlist(read.csv(file = "~/RMI2/gitlab/tdd/results/filtred_genes_Lympho_Resting.csv")),
                           Lympho_Activated = unlist(read.csv(file = "~/RMI2/gitlab/tdd/results/filtred_genes_Lympho_Activated.csv")),
                           Macro_Resting = unlist(read.csv(file = "~/RMI2/gitlab/tdd/results/filtred_genes_Macro_Resting.csv")),
                           Macro_Activated = unlist(read.csv(file = "~/RMI2/gitlab/tdd/results/filtred_genes_Macro_Activated.csv")))
    # 
    index <- index %>% filter(gene_id %in% expressedGenes[[paste0(cell, "_", state)]])
    index_transcripts <- index$transcript_id
    index <- unlist(index[, index_name])
    names(index) <- as.character(index_transcripts)
  } else {
    index_transcripts <- index$transcript_id
    index <- unlist(index[, index_name])
    names(index) <- as.character(index_transcripts)
  } 
  
  return(index)
}

# CSC functions -----------------------------------------------------------

get_rCSC <- function(df_codon_freq, index_name, filtred = TRUE) {
  index <- prepare_index(index_name = index_name, filtred = filtred)
  cleaned_obj <- clean_codon_freq(df_codon_freq, index)
  rCSC <- apply(cleaned_obj[["codon_freq_clean"]], 2, function(x) cor(x, cleaned_obj[["index_clean"]], method = "pearson"))
}

define_optmial_codon <- function(rCSC) {
  opt_codon <- rCSC
  opt_codon[opt_codon > 0] <- "opt"
  opt_codon[opt_codon < 0] <- "non-opt"
  opt_codon <- as.factor(opt_codon)
  
  return(opt_codon)
}

calc_opt_percent <- function(opt_codon, df_codon) {
  
  opt <- names(opt_codon[opt_codon == "opt" & !is.na(opt_codon)])
  opt_percent <- rowSums(df_codon[,colnames(df_codon)%in%opt]) / rowSums(df_codon) * 100
  return(opt_percent)
}

# GC3 functions -----------------------------------------------------------

define_GC3_codon <- function() {
  codon <- c("AUU", "AUC", "AUA", "CUU", "CUC", "CUA", "CUG", "UUA", "UUG", "GUU", "GUC", "GUA", "GUG", "UUU", "UUC", "AUG", "UGU", "UGC",
             "GCU", "GCC", "GCA", "GCG", "GGU", "GGC", "GGA", "GGG", "CCU", "CCC", "CCA", "CCG", "ACU", "ACC", "ACA", "ACG", "UCU", "UCC", 
             "UCA", "UCG", "AGU", "AGC", "UAU", "UAC", "UGG", "CAA", "CAG", "AAU", "AAC", "CAU", "CAC", "GAA", "GAG", "GAU", "GAC", "AAA", 
             "AAG", "CGU", "CGC", "CGA", "CGG", "AGA", "AGG", "UAA", "UAG", "UGA")
  
  names(codon) <- codon
  codon[grep("(G|C)$", codon)] <- "GC3"
  codon[grep("(A|U)$", codon)] <- "AU3"
  
  return(codon)
}

calc_GC3_percent <- function(df_codon_count) {
  GC_AT_codon <- define_GC3_codon()
  GC3 <- names(GC_AT_codon[GC_AT_codon == "GC3" & !is.na(GC_AT_codon)])
  GC3_percent <- rowSums(df_codon_count[,colnames(df_codon_count) %in% GC3]) / rowSums(df_codon_count) * 100
  return(GC3_percent)
}


# plot functions ----------------------------------------------------------

boxplot_opt_percent_index <- function(opt_percent, index) {
  index_clean <- index[!is.na(index)]
  index_clean <- index_clean[order(names(index_clean))]
  
  opt_percent_clean <- opt_percent[names(opt_percent) %in% names(index_clean)]
  opt_percent_clean <- opt_percent_clean[order(names(opt_percent_clean))]
  
  if (all(names(opt_percent_clean) == names(index_clean))) {
    theme_set(theme_bw())
    data <- as.data.frame(cbind(opt_percent_clean, index_clean))
    data$bin_percent <- NA
    data[data$opt_percent_clean >= 80,"bin_percent"] <- ">80"
    data[data$opt_percent_clean < 80,"bin_percent"] <- "70-79"
    data[data$opt_percent_clean < 70,"bin_percent"] <- "60-69"
    data[data$opt_percent_clean < 60,"bin_percent"] <- "50-59"
    data[data$opt_percent_clean < 50,"bin_percent"] <- "40-49"
    data[data$opt_percent_clean < 40,"bin_percent"] <- "30-39"
    data[data$opt_percent_clean < 30,"bin_percent"] <- "<29"
    data$bin_percent <- factor(data$bin_percent, levels = c("<29", "30-39", "40-49", "50-59", "60-69", "70-79", ">80"))
    
    dataSE <- summarySE(data, measurevar = "index_clean", groupvars = "bin_percent")
    dataSE$bin_percent <- factor(dataSE$bin_percent, levels = c("<29", "30-39", "40-49", "50-59", "60-69", "70-79", ">80"))
    dataSE$N <- paste("n =", dataSE$N)
    ymin <- min(dataSE$index_clean - dataSE$se)
    ymax <- max(dataSE$index_clean + dataSE$se)
    
    # my_comparisons <- list( c("<29", "30-39"), c("30-39", "40-49"), c("40-49", "50-59"), c("50-59","60-69"),c("70-79", ">80")  )
    violin_plot <- ggplot(data = data, aes(x = bin_percent, y = index_clean)) + 
                    geom_violin(aes(fill = bin_percent)) + ylim(-1,2) + geom_boxplot(width=0.2) +
                    stat_compare_means() 
                    # stat_compare_means(comparisons = my_comparisons, label = "p.signif", method = "t.test")

    
    mean_plot <- ggplot(data = dataSE, aes(x = bin_percent, y = index_clean)) + 
                    geom_point(aes(color = bin_percent)) + 
                    geom_errorbar(aes(ymin = index_clean - se, 
                                      ymax = index_clean + se, 
                                      color = bin_percent), 
                                  width = .1) +
                    geom_text(aes(label = N), y = ymax + 0.1*ymax) + 
                    ylim(ymin, ymax + 0.15*ymax)
     
    return(list(violin_plot, mean_plot))
    
  } else {
    stop("Error in boxplot_opt_percent_index function")
  }
}

boxplot_opt_percent_high_low <- function(opt_percent, index) {
  index_clean <- index[!is.na(index)]
  index_clean <- index_clean[order(names(index_clean))]
  
  opt_percent_clean <- opt_percent[names(opt_percent) %in% names(index_clean)]
  opt_percent_clean <- opt_percent_clean[order(names(opt_percent_clean))]
  
  if (all(names(opt_percent_clean) == names(index_clean))) {
    theme_set(theme_bw())
    data <- as.data.frame(cbind(opt_percent_clean, index_clean))
    data$bin_index <- NA
    data[data$index_clean >= 0.8,"bin_index"] <- ">0.8"
    data[data$index_clean < 0.8,"bin_index"] <- "0.6-0.8"
    data[data$index_clean < 0.6,"bin_index"] <- "0.4-0.6"
    data[data$index_clean < 0.4,"bin_index"] <- "0.2-0.4"
    data[data$index_clean < 0.2,"bin_index"] <- "<0.2"
    
    data$bin_index <- factor(data$bin_index, levels = c("<0.2", "0.2-0.4", "0.4-0.6", "0.6-0.8", ">0.8"))
    
    boxplot <- ggplot(data = data, aes(x = bin_index, y = opt_percent_clean)) + 
      geom_violin(aes(fill = bin_index)) + ylim(25,80) + geom_boxplot(width=0.2)
    model <- aov(opt_percent_clean ~ bin_index, data = data)
    TukeyHSD(model)
    
    return(boxplot)
    
  } else {
    stop("Error in boxplot_opt_percent_index function")
  }
}

barplot_rCSC <- function(rCSC, pval = NULL){
  theme_set(theme_bw())
  data <- as.data.frame(rCSC)
  data$codon <- rownames(data)
  data$codon <- factor(data$codon, levels = data[order(data$rCSC, decreasing = TRUE), "codon"])
  
  if (is.null(pval)) {
    ggplot(data = data, aes(x = codon, y = rCSC)) + 
      geom_bar(stat = "identity") +
      theme(axis.text.x = element_text(angle = 90)) 
  } else {
    pval[pval == "<1e-04"] <- 0
    data <- cbind(data,as.numeric(pval))
    data <- na.exclude(data)
    data$sign <- "n.s"
    data[data$`as.numeric(pval)` < 0.05,"sign"] <- "*"
    data[data$`as.numeric(pval)` < 0.01,"sign"] <- "**"
    data[data$`as.numeric(pval)` < 0.001,"sign"] <- "***"
    data[data$`as.numeric(pval)` < 0.0001,"sign"] <- "****"
    
    ggplot(data = data, aes(x = codon, y = rCSC, fill = "all")) + 
      geom_bar(stat = "identity", alpha = 0.5) +
      theme(axis.text.x = element_text(angle = 90)) + 
      geom_text(aes(label=sign), vjust=-0.5, color="black",
                position = position_identity(), size=3.5)
  }
  
}

barplot_sup_rCSC <- function(rCSC_resting, rCSC_activated, pval_resting = NULL, pval_activated = NULL){
  theme_set(theme_bw())
  data_res <- as.data.frame(rCSC_resting)
  data_res$codon <- rownames(data_res)
  data_act <- as.data.frame(rCSC_activated)
  data_act$codon <- rownames(data_act)
  
  data <- merge(data_res, data_act)
  data$codon <- factor(data$codon, levels = data[order(data$rCSC_resting, decreasing = TRUE), "codon"])
  colnames(data) <- c("codon", "resting", "activated")
  data <- pivot_longer(data = data, cols = c(resting, activated), names_to = "state")
  
  if (is.null(pval_resting) | is.null(pval_activated)) {
    ggplot(data = data, aes(x = codon, y = value, fill = state)) + 
      geom_bar(stat = "identity", position = position_dodge()) +
      theme(axis.text.x = element_text(angle = 90)) 
  } else {
    pval_resting[pval_resting == "<1e-04"] <- 0
    pval_activated[pval_activated == "<1e-04"] <- 0
    data_res <- data %>% filter(state == "resting")
    data_res <- cbind(data_res,as.numeric(pval_resting$FDR))
    colnames(data_res) <- c("codon", "state", "value", "FDR")
    data_act <- data %>% filter(state == "activated")
    data_act <- cbind(data_act,as.numeric(pval_activated$FDR))
    colnames(data_act) <- c("codon", "state", "value", "FDR")
    data <- rbind(data_res, data_act)
    data <- data %>% filter(! is.na(FDR) & ! is.na(value))
    # data$FDR <- round(data$FDR, digits = 3)
    data$sign <- NA
    data[data$FDR < 0.05,"sign"] <- "*"
    data[data$FDR < 0.01,"sign"] <- "**"
    data[data$FDR < 0.001,"sign"] <- "***"
    data[data$FDR < 0.0001,"sign"] <- "****"
    
    ggplot(data = data, aes(x = codon, y = value, fill = state)) + 
      geom_bar(stat = "identity", alpha = 0.5, position = position_identity() )+
      theme(axis.text.x = element_text(angle = 90)) + 
      geom_text(aes(label=sign), vjust=-0.5, color="black",
                position = position_identity(), size=3.5) + 
      scale_fill_manual("legend", values = c("resting" = "#69b3a2", "activated" = "#404080"))
  }
}

barplot_sup_perm <- function(rCSC, perm_rCSC, pval = NULL){
  theme_set(theme_bw())
  data_res <- as.data.frame(rCSC)
  data_res$codon <- rownames(data_res)
    
  data <- merge(data_res, perm_rCSC)
  data$codon <- factor(data$codon, levels = data[order(data$rCSC, decreasing = TRUE), "codon"])
  data <- pivot_longer(data = data, cols = c(rCSC, median), names_to = "variable")
  data[data$variable == "rCSC", "sem"] <- NA
  
  if (is.null(pval)) {
    ggplot(data = data, aes(x = codon, y = value, fill = state)) + 
      geom_bar(stat = "identity", position = position_dodge()) +
      theme(axis.text.x = element_text(angle = 90)) 
  } else {
    pval[pval == "<1e-04"] <- 0
    data_res <- data %>% filter(variable == "rCSC")
    data_res <- cbind(data_res,as.numeric(pval$FDR))
    colnames(data_res) <- c("codon", "sem", "variable", "value", "FDR")
    
    data <- data %>% filter(!variable == "rCSC")
    data$FDR <- NA

    data <- rbind(data_res, data %>% filter(!variable == "rCSC"))
    data <- data %>% filter(! (is.na(FDR) & variable == "rCSC"))
    data$sign <- NA
    data[data$FDR < 0.05 & !is.na(data$FDR),"sign"] <- "*"
    data[data$FDR < 0.01& !is.na(data$FDR),"sign"] <- "**"
    data[data$FDR < 0.001& !is.na(data$FDR),"sign"] <- "***"
    data[data$FDR < 0.0001& !is.na(data$FDR),"sign"] <- "****"
    data$variable <- gsub("rCSC", "real_sequence", data$variable)
    data$variable <- gsub("median", "permutated_sequence", data$variable)
    
    data <- data %>% filter(!(codon %in% c("UGA (Stop)", "UAG (Stop)", "UAA (Stop)")))
    
    ggplot(data = data, aes(x = codon, y = value, fill = variable)) + 
      geom_bar(stat = "identity", alpha = 0.5, position = position_identity() )+
      geom_errorbar(aes(ymin=value-sem, ymax=value+sem), width=.1) +
      theme(axis.text.x = element_text(angle = 90)) + 
      geom_text(aes(label=sign), vjust=-0.5, color="black",
                position = position_identity(), size=3.5) 
  }
  
}

binning_correlation <- function(var, index_name, filtred = TRUE) {
  index <- prepare_index(index_name = index_name, filtred = filtred)
  index <- index[is.finite(index)]
  index <- as.data.frame(index, row.names = names(index))
  index$gene_id <- rownames(index)
  
  var <- as.data.frame(var, row.names = names(var))
  var$gene_id <- rownames(var)
  
  data <- merge(index, var)
  
  bin <- round(nrow(data) / 300)
  tpm <- data.frame(matrix(nrow = nrow(data) %% bin, ncol = 2))
  data <- data[order(data[,"var"]), ]
  
  # loop for calculate bin values
  c = 1
  for (j in 1:nrow(data)) {
    if (j %% bin == 0) {
      m1 = mean(data[((j - bin) + 1):j,"var"])
      m2 = mean(data[((j - bin) + 1):j,"index"])
      
      tpm[c, 1] = m1
      tpm[c, 2] = m2
      c = c + 1
    }
  }
  
  # rename dataframe and plot 
  colnames(tpm) <- c("variable", "index")
  
  stats <- boxplot(tpm, plot = FALSE)[["stats"]]
  xmin <- stats[1,1] - stats[1,1] * 0.05
  xmax <- stats[5,1] + stats[5,1] * 0.05
  
  ymin <- stats[1,2] - stats[1,2] * 0.05
  ymax <- stats[5,2] + stats[5,2] * 0.05
  
  p1 <- ggplot(tpm, aes(x = variable, y = index)) + 
    ggtitle(index_name) + 
    xlab("variable") + 
    theme(panel.grid.major = element_blank(), panel.grid.minor = element_blank(),
          panel.background = element_blank(), axis.line = element_line(colour = "black")) +
    geom_point(alpha = 0.6) + ylim(ymin,ymax) + xlim(xmin,xmax) +
    geom_smooth(method = "loess", size = 1.5)
  
  return(p1)
  
}


# permutation functions ---------------------------------------------------

resample <- function(x, ...) x[sample.int(length(x), ...)]

random_seq <- function(one_seq){
  seq <- unlist(strsplit(as.character(one_seq), split = ""))
  new_seq <- paste(resample(seq, size = length(seq)),collapse = '')
}

sequence_permutation <- function(cds, check_cds = TRUE) {
  if (check_cds) {
    cds <- cds[check_cds(cds)]
  }
  permutated_cds <- mclapply(cds, FUN = random_seq)
  return(permutated_cds)
}

count_permutated_seq <- function(permutated_seq){
  codon_counts <- mclapply(permutated_seq, FUN = function(x) table(substring(as.character(x), seq(4, nchar(x) - 3, 3), seq(4, nchar(x) - 3, 3) + 2)))
  return(codon_counts)
}

rCSC_perm <- function(cds, index_name, n_perm = 10000, is_codon = T) {
  message(paste(Sys.time(),": Check_cds"))
  analysed <- cds[check_cds(cds)]
  message(paste(Sys.time(),": prepare_index"))
  index <- prepare_index(index_name)
  analysed <- analysed[names(analysed) %in% names(index)]
  
  message(paste(Sys.time(),": create bootstrap list"))
  bootstrap <- list()
  for (i in 1:n_perm) {
    bootstrap[[i]] <- analysed
  }
  
  message(paste(Sys.time(),": calculate CSC scores"))
  if (is_codon) {
    rCSC <- mclapply(bootstrap, function(x) get_rCSC(df_codon_freq = list_to_df(freq_codon(count_permutated_seq(sequence_permutation(x)))),
                                                     index_name = index_name,
                                                     filtred = TRUE)
    )
  } else {
    rCSC <- mclapply(bootstrap, function(x) get_rCSC(df_codon_freq = list_to_df(freq_AA(sequence_permutation(x, 
                                                                                                             check_cds = FALSE), 
                                                                                        cds_check = F),
                                                                                codon = FALSE),
                                                     index_name = index_name,
                                                     filtred = TRUE),
                     mc.cores = 4
    )
  }
  
  return(rCSC)
}


# stats functions ----------------------------------------------------------

normalize_denstiy <- function(dens_data) {
  total <- sum(dens_data[['y']])
  dens_data[['y_norm']] <- dens_data[['y']] / total
  return(dens_data)
}

pval_sumcum_uniq <- function(CSC_random, CSC_ref, n_perm, method = "diffAbs"){
  
  if (is.na(CSC_ref)) {
    return(NA)
  }
  
  if (method == "greaterAbs") {
    CSC_random  <- abs(CSC_random)
    CSC_random <- CSC_random[order(CSC_random)]
    CSC_dens <- normalize_denstiy(dens_data = density(CSC_random))
    df_z <- as.data.frame(cbind(CSC_dens[['x']], 1 - cumsum(CSC_dens[['y_norm']])))
    colnames(df_z) <- c("x", "y_norm")
    if (max(df_z$x) < abs(CSC_ref)) {
      pval <- paste0("<", 1/n_perm)
    } else {
    pval <- df_z[df_z$x > abs(CSC_ref),'y_norm'][1]
    }
    
  } else if (method == "diffAbs") {
    # CSC_random  <- abs(CSC_random)
    CSC_random <- CSC_random[order(CSC_random)]
    CSC_dens <- normalize_denstiy(dens_data = density(CSC_random))
    
    if (CSC_ref > mean(CSC_random)) {
      df_z <- as.data.frame(cbind(CSC_dens[['x']], 1 - cumsum(CSC_dens[['y_norm']])))
      colnames(df_z) <- c("x", "y_norm")
      if (max(df_z$x) < CSC_ref) {
        pval <- paste0("<", 1/n_perm)
      } else {
        pval <- df_z[df_z$x > CSC_ref,'y_norm'][1]
      }
      
    } else if (CSC_ref < mean(CSC_random)) {
      df_z <- as.data.frame(cbind(CSC_dens[['x']], cumsum(CSC_dens[['y_norm']])))
      colnames(df_z) <- c("x", "y_norm")
      if (max(df_z$x) < CSC_ref) {
        pval <- paste0("<", 1/n_perm)
      } else {
        pval <- df_z[df_z$x > CSC_ref,'y_norm'][1]
      }
    } else {
      stop("method must be \"greaterAbs\" or \"diffAbs\"")
    }
  }
  
  return(pval)
}

pval_sumcum_all <- function(rCSC, CSC_ref, n_perm, is_codon = T){
  if (is_codon){
    codon <- list("AAA", "AAC", "AAG", "AAT", "ACA", "ACC", "ACG", "ACT", "AGA", "AGC", "AGG", "AGT", "ATA", "ATC", "ATG", "ATT", 
                  "CAA", "CAC", "CAG", "CAT", "CCA", "CCC", "CCG", "CCT", "CGA", "CGC", "CGG", "CGT", "CTA", "CTC", "CTG", "CTT", 
                  "GAA", "GAC", "GAG", "GAT", "GCA", "GCC", "GCG", "GCT", "GGA", "GGC", "GGG", "GGT", "GTA", "GTC", "GTG", "GTT", 
                  "TAA", "TAC", "TAG", "TAT", "TCA", "TCC", "TCG", "TCT", "TGA", "TGC", "TGG", "TGT", "TTA", "TTC", "TTG", "TTT")
    names(codon) <- codon
  } else {
    codon <- list("Ala", "Arg", "Asn", "Asp", "Cys", "Gln", "Glu", "Gly", "His", "Ile", 
                  "Leu", "Lys", "Met", "Phe", "Pro", "Ser", "Thr", "Trp", "Tyr", "Val",
                  "Stop")
    names(codon) <- codon
  }
  
  
  y = unlist(rCSC)
  pval <- mclapply(codon, function(x) pval_sumcum_uniq(CSC_random = y[names(y) == x], CSC_ref = CSC_ref[x], n_perm = n_perm))
  return(unlist(pval))
}

perm_boot <- function(cds, n_perm) {
  analysed <- cds[check_cds(cds)]
  bootstrap <- list()
  for (i in 1:n_perm) {
    bootstrap[[i]] <- analysed
  }
  
  bootstrap <-  mclapply(bootstrap, function(x) sequence_permutation(x, check_cds = FALSE))
  return(bootstrap)
}

pval_perm <- function(cds, index_name, CSC_ref, n_perm = 10000, is_codon = T) {
  rCSC <- rCSC_perm(cds = cds, index_name = index_name, n_perm = n_perm, is_codon = is_codon)
  save(rCSC, file = paste0("results/rCSC_perm_",index_name,"_",n_perm,"perm.RData"))
  
  rCSC_df <- list_to_df(rCSC)
  median_rCSC <- apply(rCSC_df, MARGIN = 2, FUN = median)
  sem_rCSC <- apply(rCSC_df, MARGIN = 2, FUN = function(x) sd(x))
  rCSC_df <- data.frame(codon = names(sem_rCSC),
                     median = median_rCSC, 
                     sem = sem_rCSC)
  save(rCSC_df, file = paste0("results/rCSC_perm_df_",index_name,"_",n_perm,"perm.RData"))
  message(paste(Sys.time(),": calculating pval"))
  pval <- pval_sumcum_all(rCSC = rCSC, CSC_ref = CSC_ref, n_perm = n_perm, is_codon = is_codon)
  
  return(pval)
}

summarySE <- function(data=NULL, measurevar, groupvars=NULL, na.rm=FALSE,
                      conf.interval=.95, .drop=TRUE) {
  library(plyr)
  
  # New version of length which can handle NA's: if na.rm==T, don't count them
  length2 <- function (x, na.rm=FALSE) {
    if (na.rm) sum(!is.na(x))
    else       length(x)
  }
  
  # This does the summary. For each group's data frame, return a vector with
  # N, mean, and sd
  datac <- ddply(data, groupvars, .drop=.drop,
                 .fun = function(xx, col) {
                   c(N    = length2(xx[[col]], na.rm=na.rm),
                     mean = mean   (xx[[col]], na.rm=na.rm),
                     sd   = sd     (xx[[col]], na.rm=na.rm)
                   )
                 },
                 measurevar
  )
  
  # Rename the "mean" column    
  datac <- rename(datac, c("mean" = measurevar))
  
  datac$se <- datac$sd / sqrt(datac$N)  # Calculate standard error of the mean
  
  # Confidence interval multiplier for standard error
  # Calculate t-statistic for confidence interval: 
  # e.g., if conf.interval is .95, use .975 (above/below), and use df=N-1
  ciMult <- qt(conf.interval/2 + .5, datac$N-1)
  datac$ci <- datac$se * ciMult
  
  return(datac)
}

stats_rCSC <- function(rCSC) {
  up <- names(rCSC)[rCSC > 0]
  up <- gsub(" \\(...)", "", up)
  up <- up[!is.na(up)]
  GC3_up <- length(grep("(G|C)$", up))/length(up) * 100
  message(paste(length(grep("(G|C)$", up)), "codons GC3 over", length(up), "codons with rCSC > 0"))
  up <- paste(up, collapse = '')
  stat <- table(unlist(strsplit(up, split = '', fixed = TRUE)))
  if(is.na(stat["C"])) {
    stat["C"] <- 0
  }
  
  GC_up <- (stat["G"] + stat["C"] )/sum(stat) *100
  message(paste((stat["G"] + stat["C"] )/sum(stat) *100 , "% of GC nucleotide in codons with rCSC > 0"))
 
  
  down <- names(rCSC)[rCSC < 0]
  down <- gsub(" \\(...)", "", down)
  down <- down[!is.na(down)]
  GC3_down <- length(grep("(G|C)$", down))/length(down) * 100
  message(paste(length(grep("(G|C)$", down)), "codons GC3 over", length(down), "codons with rCSC < 0"))
  down <- paste(down, collapse = '')
  stat <- table(unlist(strsplit(down, split = '', fixed = TRUE)))
  GC_down <- (stat["G"] + stat["C"] )/sum(stat) *100
  message(paste((stat["G"] + stat["C"] )/sum(stat) * 100 , "% of GC nucleotide in codons with rCSC < 0"))
  
  res <- data.frame("variable"      = c("GC3_percent", "GC_percent", "GC3_percent", "GC_percent"),
                    "rCSC"          = c("positive", "positive", "negative", "negative"),
                    "value" = c(GC3_up, GC_up, GC3_down, GC_down ))
  return(res)
}
