########################################
######    TDD in gene subset      ######
######         2020.01.20         ######
########################################

# Strategy: boxplot the TDD index (IndexTDD) of different gene subsets (e.g. 5'TOP genes, ARE genes, etc.)

# 0. Set env --------------------------------------------------------------


# NOTE(review): hard-coded, user-specific working directory. Every relative
# path used below ("data/...", "results/...") is resolved against it.
setwd("~/RMI2/Projet_TDD/20200114_Normalisation")

library(ggplot2)
theme_set(theme_bw())  # black-and-white ggplot2 theme for every plot below
library(reshape2)  # colsplit()
library(DESeq2)
library(stringr)  # str_extract()
library(dplyr)
library(gridExtra)  # grid.arrange()
library("biomaRt")  # useMart(), useDataset(), getBM()

# Restore previously saved objects into the workspace. The boxplot sections
# below read `IndexTDD`, which is presumably provided by this file (TODO confirm).
load(file = "results/03_IndexTDDallConditions.RData")

# 1. Get gene lists ------------------------------------------------------------
# Each gene list is the first column of its CSV file, read as character.
# Lists deliberately left out of the analysis:
#   - "5'TOP_genes_+-RP.csv"      (redundant with TOP_sabatini)
#   - "ARE_genes_data_051115.csv" (too many genes)
#   - "genesavecTDD.csv"          (too many genes)
Up_UPF2ko_BMM <- as.character(
  read.csv("data/Upregulated_UPF2ko_mouse_BMM.csv")[, 1]
)
Up_UPF2ko_thymo <- as.character(
  read.csv("data/Upregulated_UPF2ko_mouse_thymocytes.csv")[, 1]
)
ATTA <- as.character(
  read.csv("data/ATTTA_containing_genes.csv")[, 1]
)
TOP_sabatini <- as.character(
  read.csv("data/sabatini_top_rnas.csv")[, 1]
)
TOP_sabatini_nonRP <- as.character(
  read.csv("data/sabatini_top_rnas_nonRP.csv")[, 1]
)
uORF <- as.character(
  read.csv("data/s_uORF_gene_list.csv")[, 1]
)

# uORF table: tab-separated file with a header row.
uORF_solene <- read.delim(
  file = "data/uORF_mouse_solene.txt",
  sep = "\t",
  header = TRUE
)

# Force the Kozak column to numeric (entries that cannot be parsed become NA).
uORF_solene$Kozak <- as.numeric(as.character(uORF_solene$Kozak))

# Keep rows with Kozak above 3.8 whose C_start is not "cano". Rows where the
# condition is NA come back as all-NA rows and are removed by the
# num.transcrit filter right after.
uORF_solene <- uORF_solene[
  uORF_solene$Kozak > 3.8 & uORF_solene$C_start != "cano",
]
uORF_solene <- uORF_solene[!is.na(uORF_solene$num.transcrit), ]

# The CDS subset must be derived from the filtered table before that table is
# collapsed to a vector of gene names.
uORF_solene_CDS <- uORF_solene[uORF_solene$Place == "CDS", ]
uORF_solene_CDS <- colsplit(
  uORF_solene_CDS$num.transcrit,
  " ",
  c("transcriptID", "genename")
)
uORF_solene_CDS <- as.vector(unique(uORF_solene_CDS$genename))

# num.transcrit is split on the first space into transcript ID and gene name;
# only the unique gene names are kept.
uORF_solene <- colsplit(
  uORF_solene$num.transcrit,
  " ",
  c("transcriptID", "genename")
)
uORF_solene <- as.vector(unique(uORF_solene$genename))

# Entries (column X) whose 3'UTR length exceeds 4000.
# NOTE(review): the ".txtw" extension looks like a typo for ".txt" -- confirm
# the file name on disk. UTR3 is not used in the visible part of the script.
UTR3 <- read.csv("data/Mouse_mm10_all_UTR3_length_GC.txtw")
UTR3 <- as.vector(unique(UTR3[UTR3$length > 4000, ]$X))

# gene_id column of the tab-separated intron/3'UTR table, after dropping every
# row that contains an NA.
Intron3UTR_50bp <- na.exclude(
  read.csv("data/transcript_IntronUTR3.txt", sep = "\t")
)$gene_id

# Named list of all gene lists to be converted to Ensembl IDs.
# Elements are named directly in the list() call so names and contents cannot
# drift apart: the previous separate names() vector had 6 entries for 8
# elements, which left uORF_solene and uORF_solene_CDS with NA names.
# The TOP, ARE and TDD lists are intentionally not included.
features <- list(
  Up_UPF2ko_BMM = Up_UPF2ko_BMM,
  Up_UPF2ko_thymo = Up_UPF2ko_thymo,
  ATTA = ATTA,
  TOP_sabatini = TOP_sabatini,
  TOP_sabatini_nonRP = TOP_sabatini_nonRP,
  uORF = uORF,
  uORF_solene = uORF_solene,
  uORF_solene_CDS = uORF_solene_CDS
)


# 2. get EnsemblID --------------------------------------------------------
# BioMart handle on the "ensembl" mart with the mouse gene dataset selected;
# used by list.biomart() below.
mart <- useDataset(
  dataset = "mmusculus_gene_ensembl",
  mart = useMart(biomart = "ensembl")
)

# Query BioMart for the Ensembl gene IDs matching a set of MGI symbols.
#
# data: values passed to the "mgi_symbol" filter (a vector of gene symbols).
# Returns whatever getBM() returns for the attributes "ensembl_gene_id" and
# "mgi_symbol". Uses the global `mart` object.
list.biomart <- function(data) {
  getBM(
    mart = mart,
    values = data,
    filters = "mgi_symbol",
    attributes = c("ensembl_gene_id", "mgi_symbol")
  )
}

# Convert every gene list to Ensembl IDs: one getBM() result per element of
# `features`.
resultat <- lapply(features, list.biomart)
# Results are addressed by list name below (resultat$<name>$ensembl_gene_id).
# Take the names from `features` instead of a hard-coded vector: the previous
# vector held 6 names for 8 results, so it had to be kept in sync by hand and
# padded the remaining entries with NA.
names(resultat) <- names(features)



# boxplots ----------------------------------------------------------------


## Lympho R: resting lymphocytes
lymphoR <- IndexTDD[["LymphoR"]]
# Keep only the gene ID: "ENSMUSG" followed by the next 11 characters.
lymphoR[, "ensemblID"] <- str_extract(lymphoR$ensemblID, "ENSMUSG...........")

# For each gene list, take the matching rows and tag them with a `feature`
# column; that column is the x axis of the boxplot.
# The TOP, ARE and TDD lists are currently excluded.
Up_UPF2ko_BMM <- cbind(
  lymphoR[lymphoR$ensemblID %in% resultat$Up_UPF2ko_BMM$ensembl_gene_id, ],
  feature = "Up_UPF2ko_BMM"
)

Up_UPF2ko_thymo <- cbind(
  lymphoR[lymphoR$ensemblID %in% resultat$Up_UPF2ko_thymo$ensembl_gene_id, ],
  feature = "Up_UPF2ko_thymo"
)

# NOTE(review): ATTA is matched against the raw list in `features`, not
# against the biomaRt result -- presumably the ATTA file already holds Ensembl
# gene IDs; confirm.
ATTA <- cbind(
  lymphoR[lymphoR$ensemblID %in% features$ATTA, ],
  feature = "ATTA"
)

TOP_sabatini <- cbind(
  lymphoR[lymphoR$ensemblID %in% resultat$TOP_sabatini$ensembl_gene_id, ],
  feature = "TOP_sabatini"
)

TOP_sabatini_nonRP <- cbind(
  lymphoR[
    lymphoR$ensemblID %in% resultat$TOP_sabatini_nonRP$ensembl_gene_id,
  ],
  feature = "TOP_sabatini_nonRP"
)

# NOTE(review): uORF is built here but is not part of the rbind() below, so it
# does not appear in the plot.
uORF <- cbind(
  lymphoR[lymphoR$ensemblID %in% resultat$uORF$ensembl_gene_id, ],
  feature = "uORF"
)

# The full table is tagged last, once all subsets have been taken from it.
feature <- "all"
lymphoR <- cbind(lymphoR, feature)

# Stack the full table and the plotted subsets.
data <- rbind(
  lymphoR,
  ATTA,
  Up_UPF2ko_BMM,
  TOP_sabatini,
  TOP_sabatini_nonRP,
  Up_UPF2ko_thymo
)

# NOTE(review): ylim() discards values outside [-0.5, 1.5] before the boxplot
# statistics are computed; coord_cartesian() would only zoom. Confirm this is
# intended.
p1 <- ggplot(
  data = data,
  mapping = aes(
    x = feature,
    y = asIndexTDD_t3TripTripCHX.0hTrip,
    colour = feature
  )
) +
  geom_boxplot(show.legend = FALSE) +
  labs(
    y = "absolute IndexTDD",
    title = "absolute TDD index in resting lympho"
  ) +
  ylim(-0.5, 1.5)

## Lympho A: activated lymphocytes
lymphoA <- IndexTDD[["LymphoA"]]
# Keep only the gene ID: "ENSMUSG" followed by the next 11 characters.
lymphoA[, "ensemblID"] <- str_extract(lymphoA$ensemblID, "ENSMUSG...........")

# For each gene list, take the matching rows and tag them with a `feature`
# column; that column is the x axis of the boxplot.
# The TOP, ARE and TDD lists are currently excluded.
Up_UPF2ko_BMM <- cbind(
  lymphoA[lymphoA$ensemblID %in% resultat$Up_UPF2ko_BMM$ensembl_gene_id, ],
  feature = "Up_UPF2ko_BMM"
)

Up_UPF2ko_thymo <- cbind(
  lymphoA[lymphoA$ensemblID %in% resultat$Up_UPF2ko_thymo$ensembl_gene_id, ],
  feature = "Up_UPF2ko_thymo"
)

# NOTE(review): ATTA is matched against the raw list in `features`, not
# against the biomaRt result -- presumably the ATTA file already holds Ensembl
# gene IDs; confirm.
ATTA <- cbind(
  lymphoA[lymphoA$ensemblID %in% features$ATTA, ],
  feature = "ATTA"
)

TOP_sabatini <- cbind(
  lymphoA[lymphoA$ensemblID %in% resultat$TOP_sabatini$ensembl_gene_id, ],
  feature = "TOP_sabatini"
)

TOP_sabatini_nonRP <- cbind(
  lymphoA[
    lymphoA$ensemblID %in% resultat$TOP_sabatini_nonRP$ensembl_gene_id,
  ],
  feature = "TOP_sabatini_nonRP"
)

# NOTE(review): uORF is built here but is not part of the rbind() below, so it
# does not appear in the plot.
uORF <- cbind(
  lymphoA[lymphoA$ensemblID %in% resultat$uORF$ensembl_gene_id, ],
  feature = "uORF"
)

# The full table is tagged last, once all subsets have been taken from it.
feature <- "all"
lymphoA <- cbind(lymphoA, feature)

# Stack the full table and the plotted subsets.
data <- rbind(
  lymphoA,
  ATTA,
  Up_UPF2ko_BMM,
  TOP_sabatini,
  TOP_sabatini_nonRP,
  Up_UPF2ko_thymo
)

# NOTE(review): ylim() discards values outside [-0.5, 1.5] before the boxplot
# statistics are computed; coord_cartesian() would only zoom. Confirm this is
# intended.
p2 <- ggplot(
  data = data,
  mapping = aes(
    x = feature,
    y = asIndexTDD_t3TripTripCHX.0hTrip,
    colour = feature
  )
) +
  geom_boxplot(show.legend = FALSE) +
  labs(
    y = "absolute IndexTDD",
    title = "absolute TDD index in activated lympho"
  ) +
  ylim(-0.5, 1.5)

# Write both boxplots (p1 above p2) to a single PDF page.
# dev.off() runs in `finally` so the PDF device is closed even if drawing
# fails; otherwise a failed grid.arrange() would leave the device open and
# later plots would be written to this file.
pdf(file = "results/06_boxplot_subset_genes.pdf")
invisible(tryCatch(
  grid.arrange(p1, p2, nrow = 2),
  finally = dev.off()
))

