library(tidyverse)
library(pre)
library(gam)
library(rpart)
library(pdp) # PDP, ICE
library(iml) # H-statistique
library(anchors)
library(shapper) # kernel shap
library(xgboost)
library(e1071)
library(car)
library(caret)
library(doParallel)
library(randomForest)

# Shifted square-root transform: adds 3/8 to every element of `x`, then takes
# the square root. Vectorised; returns a vector of the same length as `x`.
ascb <- function(x) {
  shifted <- x + 3 / 8
  sqrt(shifted)
}

# Estimate a "bcnPower" power transformation for `x` with car::powerTransform()
# and return `x` transformed by car::bcnPower() using the estimated `lambda`
# and `gamma`.
bcnPowerTransform <- function(x) {
  fit <- car::powerTransform(x, family = "bcnPower")
  car::bcnPower(x, lambda = fit$lambda, gamma = fit$gamma)
}

# load original data

# Attach the .RData file to the search path under the entry name
# "db_list_ensembl". The lookup below assumes the file provides an object that
# is itself called `db_list_ensembl` -- TODO confirm against the data file.
attach("data/2020_03_23_db_for_RandomForest_longFormat_v2_ensembl.RData",
       name = "db_list_ensembl")
tdd_index <- as_tibble(db_list_ensembl$db_tddindex)
# Detach using the same entry name that was given to attach(); `db_list` is not
# the name of anything attached here, so detaching it would fail and leave the
# data file on the search path.
detach("db_list_ensembl", character.only = TRUE)

# process data
summary(tdd_index)

# Drop incomplete rows, turn the experimental design columns into factors,
# replace infinite RiboDens values by 30 and add the total n4G count
# (5' UTR + CDS + 3' UTR).
# NOTE(review): the reason for choosing 30 as the replacement is not visible
# here -- confirm.
tdd_index <- tdd_index %>%
  drop_na() %>%
  mutate_at(
    vars(
      tdd_cell,
      tdd_time,
      tdd_transcription_drug,
      tdd_translation_drug,
      tdd_state
    ),
    factor
  ) %>%
  mutate(
    RiboDens = ifelse(is.infinite(RiboDens), 30, RiboDens),
    n4G = n4G_UTR5 + n4G_CDS + n4G_UTR3
  )

# diagnostic plot of data

# Fraction of rows that contain no missing value (printed as a one-cell table).
summarise(drop_na(tdd_index), count = n()) / summarise(tdd_index, count = n())

# Number of rows whose tdd_index is negative.
summarise(filter(tdd_index, tdd_index < 0), count = n())
 
# Histogram of every numeric column, one facet per column.
# pivot_longer() replaces the superseded gather() so that reshaping is done the
# same way as in the rest of this script.
tdd_index %>%
  keep(is.numeric) %>%
  pivot_longer(everything(), names_to = "key", values_to = "value") %>%
  ggplot(aes(value)) +
  facet_wrap(~ key, scales = "free") +
  geom_histogram()

# Scatter plot of tdd_index against every other numeric column, one facet per
# column.
tdd_index %>%
  keep(is.numeric) %>%
  pivot_longer(-tdd_index, names_to = "var", values_to = "value") %>%
  ggplot(aes(x = value, y = tdd_index)) +
  geom_point() +
  facet_wrap(~ var, scales = "free") +
  theme_bw()

# Base-graphics overview of the numeric columns.
plot(keep(tdd_index, is.numeric))

# Base-graphics overview of the factor columns.
plot(keep(tdd_index, is.factor))

# degradation_value as a function of tdd_index.
ggplot(tdd_index, aes(x = tdd_index, y = degradation_value)) +
  geom_point() +
  theme_bw()

# Hwang.transcript_half.life as a function of tdd_index.
ggplot(tdd_index, aes(x = tdd_index, y = Hwang.transcript_half.life)) +
  geom_point() +
  theme_bw()


# check numerical variable distribution

# One histogram per numeric predictor (tdd_index itself is excluded from the
# reshaped values). Every layer has to be chained with `+`: without it after
# geom_histogram(), theme_bw() is a separate statement and is never applied to
# the plot.
tdd_index %>%
  keep(is.numeric) %>%
  pivot_longer(-tdd_index) %>%
  ggplot(aes(x = value)) +
  facet_wrap(~name, scales = "free") +
  geom_histogram() +
  theme_bw()

# Histogram comparison of candidate transforms for a family of predictors.
#
# data       data frame containing `tdd_index` and the predictors.
# prefix     name prefix selecting the numeric predictors (via starts_with()).
# transforms named list of functions; each is applied to the selected columns
#            and the list name is used as the label of that transform.
# n_col      number of facet columns in the resulting plot.
#
# Returns a ggplot with one facet per (variable, transform) pair; the
# untransformed values are always included under the label "none".
plot_transform_candidates <- function(data, prefix, transforms, n_col) {
  # Response plus the numeric predictors whose name starts with `prefix`.
  selected <- data %>%
    keep(is.numeric) %>%
    dplyr::select(tdd_index, starts_with(prefix))

  # "none" first so that the row order matches raw values, then candidates.
  candidates <- c(list(none = identity), transforms)
  long_tables <- lapply(names(candidates), function(transform_name) {
    selected %>%
      dplyr::mutate_at(
        vars(starts_with(prefix)),
        candidates[[transform_name]]
      ) %>%
      pivot_longer(-tdd_index) %>%
      mutate(transform = transform_name)
  })

  bind_rows(long_tables) %>%
    ggplot(aes(x = value, fill = transform)) +
    facet_wrap(~name + transform, scales = "free", ncol = n_col) +
    geom_histogram() +
    theme_bw()
}

# search transform for variable starting with UTR

plot_transform_candidates(
  tdd_index,
  prefix = "UTR",
  transforms = list(log1p = log1p),
  n_col = 4
)

# search transform for variable starting with n4G

plot_transform_candidates(
  tdd_index,
  prefix = "n4G",
  transforms = list(log1p = log1p),
  n_col = 4
)

# search transform for variable starting with Stretch

plot_transform_candidates(
  tdd_index,
  prefix = "Stretch",
  transforms = list(
    log1p = log1p,
    bcnPowerTransform = bcnPowerTransform
  ),
  n_col = 6
)
 
################################################################################
  
# Distribution of tdd_index within each level of every factor column
# (box plot and violin overlaid, one facet per factor).
tdd_index %>%
  dplyr::select(-transcript_id) %>%
  # Keep the response with its real values next to the factor columns.
  # Converting it to a factor and back with as.numeric() would replace every
  # value by its integer level code, so the y axis would show ranks.
  dplyr::select(tdd_index, where(is.factor)) %>%
  pivot_longer(-tdd_index) %>%
  ggplot(aes(
    x = value,
    y = tdd_index,
    color = value
  )) +
  facet_wrap(~name, scales = "free") +
  geom_boxplot(fill = NA) +
  geom_violin(fill = NA, alpha = 0.5) +
  theme_bw()

################################################################################
################################################################################

# Checkpoint: write the processed table to disk, then read it back so that the
# script can also be resumed from this point.
save(tdd_index, file = "results/tdd_index.Rdata")

load("results/tdd_index.Rdata")
    
# find near zero variability factor
# Per-column metrics from nearZeroVar(), keeping only the rows flagged
# `zeroVar`.
# NOTE(review): the heading mentions *near* zero variance but the filter uses
# the `zeroVar` column rather than `nzv` -- confirm which one is intended.
nzv <- filter(nearZeroVar(tdd_index, saveMetrics = TRUE), zeroVar)
nzv

# re-encoding without intercept
# Column count of the table (without transcript_id) minus the column count of
# its dummyVars() encoding with tdd_index as the response. The subtraction is
# "original minus encoded", so a negative value means columns were added.
local({
  model_data <- dplyr::select(tdd_index, -transcript_id)
  encoder <- dummyVars(tdd_index ~ ., data = model_data)
  ncol(model_data) - ncol(predict(encoder, newdata = tdd_index))
})
# we create 35 columns

# find correlated factors
# Heatmap of the pairwise Spearman correlations between the numeric columns.
cor(as.matrix(keep(tdd_index, is.numeric)), method = "spearman") %>%
  tibble::as_tibble(.name_repair = "universal", rownames = NA) %>%
  # carry the row names over as a regular column before reshaping
  mutate(factors_x = rownames(.)) %>%
  pivot_longer(
    -factors_x,
    names_to = "factors_y",
    values_to = "correlation"
  ) %>%
  # factors keep the variables in their order of appearance on both axes
  mutate_at(vars(factors_x, factors_y), as_factor) %>%
  ggplot(aes(x = factors_x, y = factors_y, fill = correlation)) +
  geom_tile()

# find covarying factors
# Heatmap of the pairwise Spearman covariances between all numeric columns.
# Factor columns are replaced by their integer level codes first so that they
# take part in the computation. The pipeline is run once; an identical second
# copy only redrew the same plot.
tdd_index %>%
  mutate_if(is.factor, as.numeric) %>%
  keep(is.numeric) %>%
  as.matrix() %>%
  cov(method = "spearman") %>%
  tibble::as_tibble(.name_repair = "universal", rownames = NA) %>%
  # carry the row names over as a regular column before reshaping
  mutate(factors_x = rownames(.)) %>%
  pivot_longer(
    -factors_x,
    names_to = "factors_y",
    values_to = "covariation"
  ) %>%
  mutate(
    factors_x = as_factor(factors_x),
    factors_y = as_factor(factors_y)
  ) %>%
  ggplot() +
  geom_tile(aes(x = factors_x, y = factors_y, fill = covariation))

# univariate screening: regress tdd_index on each variable separately
# Factor columns are first replaced by their integer level codes; every
# remaining numeric column is then used, one at a time, as the single
# predictor of a no-intercept linear model of tdd_index. The plot shows one
# point per variable: its coefficient, coloured by the F statistic.
tdd_index %>%
  mutate_if(is.factor, as.numeric) %>% 
  keep(is.numeric) %>% 
  pivot_longer(-tdd_index) %>%
  group_by(name) %>% 
  # one lm() per variable, stored in the list-column `reg`
  do(
    reg = lm(tdd_index ~ -1 + value, data = .),
  ) %>% 
  # coefficient of the fit, plus the p-value and F statistic read from the
  # first row of its anova() table (`pval` is computed but not plotted)
  mutate(coef = reg$coefficients,
         pval = anova(reg)$`Pr(>F)`[1],
         Fval = anova(reg)$`F value`[1]) %>% 
  ungroup() %>% 
  # order the variables by coefficient for the plot axis
  mutate(name = fct_reorder(name, coef)) %>% 
  ggplot() +
  geom_point(aes(x = name, y = coef, color = Fval)) +
  coord_flip() +
  theme_bw()

################################################################################
##################### svm / random forest / boosted tree #######################
################################################################################

# Run the model-specific scripts one after the other, in this order.
invisible(lapply(
  c(
    "src/tdd_analysis_svm.R",         # svm
    "src/tdd_analysis_rf.R",          # random forest
    "src/tdd_analysis_boostedtree.R"  # boosted tree
  ),
  source
))

