############################
#### Make the Database  ####
############################

# Run from the project directory: the data/ and results/ paths used further
# down are all relative to it.
setwd("~/RMI2/Projet_TDD/20200114_Normalisation/")

# Attach the packages used by this script.
# `TRUE` is spelled out because `T` is an ordinary variable that can be
# reassigned; invisible() keeps the list returned by lapply() from being
# printed to the console.
pks <- list("ggplot2", "tidyverse", "DESeq2", "reshape2", "biomaRt")
invisible(lapply(pks, library, character.only = TRUE, quietly = FALSE))
theme_set(theme_bw())

# Load input data ----

# NOTE(review): `IndexTDD`, used further down, is not created anywhere in this
# script, so it is presumably restored by this load() -- confirm.
load(file = "results/03_IndexTDDallConditions.RData")

# Comma-separated tables. read.csv() is read.delim() with sep = ",": both read
# a header line and share the same quoting / fill defaults.
transcrits <- read.csv("data/Mouse_mm10_all_Transcript_length_GC.txtw")
cds <- read.csv("data/Mouse_mm10_all_CDS_length_GC.txtw")
utr3 <- read.csv("data/Mouse_mm10_all_UTR3_length_GC.txtw")
utr5 <- read.csv("data/Mouse_mm10_all_UTR5_length_GC.txtw")
intronUtr3 <- read.csv("data/n_3UI+_mm10_transcripts.csv")
counts <- read.csv("data/HTSeq_count_stats_all_libraries.csv")

# Tab-separated tables (read.delim() defaults: header = TRUE, sep = "\t").
countsRP <- read.delim("data/RiboProf_counts_stats.df")
uORF <- read.delim("data/uORF_mouse_solene.txt")

## Set the column names

# Annotation tables: ID, gene name, then the length and GC content of the
# region the table describes.
colnames(transcrits) <- c("EnsemblID", "GeneName",
                          "Length.transcrit", "GC%.transcrit")
colnames(cds) <- c("EnsemblID", "GeneName", "Length.cds", "GC%.cds")
colnames(utr3) <- c("EnsemblID", "GeneName", "Length.utr3", "GC%.utr3")
colnames(utr5) <- c("EnsemblID", "GeneName", "Length.utr5", "GC%.utr5")
colnames(intronUtr3) <- c("Ref_seq", "GeneName")

# Count tables: the first column is the ID, every other column keeps its name
# with an assay prefix. The columns to rename are taken from the table itself
# rather than from a hard-coded width (previously 2:960 and 2:73), so a table
# with a different number of libraries no longer ends up with NA names or an
# "undefined columns" error.
colnames(counts)[1] <- "EnsemblID"
colnames(counts)[-1] <- paste("RNAseqCounts", colnames(counts)[-1], sep = "_")

colnames(countsRP)[1] <- "EnsemblID"
colnames(countsRP)[-1] <- paste("RPCounts", colnames(countsRP)[-1], sep = "_")

# uORF table: only the first column is renamed, the others keep their names
# (previously re-assigned through a hard-coded 2:10 range).
colnames(uORF)[1] <- "TranscriptID"

## Merge the per-region tables on the transcript ID. Only columns 1, 3 and 4
## (ID, length, GC) of cds / utr3 / utr5 are used: column 2 (GeneName) is
## already provided by `transcrits`.
table <- Reduce(
  function(merged, region) {
    merge(merged, region[, c(1, 3, 4)], by = "EnsemblID", all = TRUE)
  },
  list(cds, utr3, utr5),
  transcrits
)

## Adding intronUtr3: rows whose GeneName appears in `intronUtr3` get
## IntronUTR3 = 1, all the other rows get 0.
IntronUTR3 <- 1
intronUtr3 <- cbind(intronUtr3[, 1:2], IntronUTR3)
table <- merge(table, intronUtr3, by = "GeneName", all = TRUE)
table <- table[, names(table) != "Ref_seq"] # Ref_seq is not kept
table$IntronUTR3[is.na(table$IntronUTR3)] <- 0 # NA -> 0, in this column only
table <- table[!duplicated(table), ] # remove duplicated rows

## Adding uORF
## Builds `uORF3`: one row per transcript ID found in `uORF`, with the number
## of retained uORFs ("n-uORF") and the sum of their Kozak scores
## ("sumKozakuOFR"; the spelling of that column name is kept as it was).
uORF <- as.data.frame(uORF)

# Retained uORFs: AUG start codon, located in the 5'UTR.
# %in% is used instead of == so that an NA in either column means "not
# retained" instead of producing an all-NA row.
uORF2 <- uORF[uORF$C_start %in% "AUG" & uORF$Place %in% "UTR5", ]

# "near 5'" is not a number: score it as 0 so the Kozak scores can be summed.
uORF2[uORF2$Kozak %in% "near 5'", "Kozak"] <- 0

# Every transcript of the unfiltered table gets a row, including those without
# any retained uORF (count 0, sum 0).
transcript_ids <- unique(uORF[, "TranscriptID"])
n_transcripts <- length(transcript_ids)

# Assign each retained uORF to its transcript by exact ID match. The previous
# grep() call used the ID as a regular expression, so "." matched any
# character and an ID that was a substring of another one was counted twice;
# it also rescanned the whole table once per transcript.
owner <- match(uORF2[, "TranscriptID"], transcript_ids)
kozak_scores <- as.numeric(as.character(uORF2[, "Kozak"]))
scores_by_transcript <- split(
  kozak_scores,
  factor(owner, levels = seq_len(n_transcripts))
)

uORF3 <- data.frame(
  TranscriptID = transcript_ids,
  n_uorf = as.numeric(tabulate(owner, nbins = n_transcripts)),
  sum_kozak = vapply(scores_by_transcript, sum, numeric(1),
                     USE.NAMES = FALSE),
  stringsAsFactors = FALSE
)
colnames(uORF3) <- c("TranscriptID", "n-uORF", "sumKozakuOFR")

# Drop the version suffix (the part after the dot) from the transcript ID.
uORF3[, "TranscriptID"] <- str_extract(
  as.character(uORF3[, "TranscriptID"]),
  "ENSMUST..........."
)

# The IDs do not match between the tables: `counts` is keyed by gene ID
# (ENSMUSGxxxx.xx) whereas `table` is keyed by transcript ID (ENSMUSTxxxx.xx).
# The ".xx" suffix is removed first, then biomaRt is used to attach the gene
# ID of each transcript to `table`.
table[, "EnsemblID"] <- str_extract(table[, "EnsemblID"],
                                    "ENSMUST...........")

## Transcript IDs to look up
TranscriptsID <- table[, "EnsemblID"]

## Database (Ensembl mart and dataset) in which the lookup is performed
mart <- useDataset(
  dataset = "mmusculus_gene_ensembl",
  mart = useMart(biomart = "ensembl")
)

## One row per (gene ID, transcript ID) pair returned for the requested
## transcripts
resultat <- getBM(
  attributes = c("ensembl_gene_id", "ensembl_transcript_id"),
  filters = "ensembl_transcript_id",
  values = TranscriptsID,
  mart = mart
)
colnames(resultat) <- c("EnsemblID", "TranscriptID")

## In `table` the ID column now holds transcript IDs: name it accordingly so
## that "EnsemblID" can designate the gene ID coming from `resultat`.
table <- setNames(
  table,
  c("GeneName", "TranscriptID",
    "Length.transcrit", "GC%.transcrit",
    "Length.cds", "GC%.cds",
    "Length.utr3", "GC%.utr3",
    "Length.utr5", "GC%.utr5",
    "IntronUTR3")
)

## Attach the gene IDs, then the per-transcript uORF summary; TranscriptID is
## the only column shared by each pair of tables.
table <- merge(resultat, table, by = "TranscriptID", all = TRUE)
table <- merge(table, uORF3, by = "TranscriptID", all = TRUE)


## Adding TDD_index
## For each of the four entries of `IndexTDD` (LymphoR, LymphoA, MacroR,
## MacroA) the selected index columns are averaged per gene and merged into
## `table` with a condition suffix (_LR, _LA, _MR, _MA).

# Average every column of `selected` per `ensemblID`, append `suffix` to the
# averaged column names, rename the key to "EnsemblID" and reduce it to the
# part matched by the ENSMUSG pattern. Returns the resulting tibble.
average_tdd_indexes <- function(selected, suffix) {
  averaged <- summarise_all(group_by(selected, ensemblID), mean)
  colnames(averaged) <- c("EnsemblID",
                          paste0(colnames(averaged)[-1], suffix))
  gene_ids <- as.character(unlist(averaged[, "EnsemblID"]))
  averaged[, "EnsemblID"] <- str_extract(gene_ids, "ENSMUSG...........")
  averaged
}

### Lympho Resting
TDDindexes_LR <- as_tibble(IndexTDD[["LymphoR"]])
IndexTDD_tmp <- average_tdd_indexes(
  dplyr::select(
    TDDindexes_LR,
    ensemblID,
    asIndexTDD_t3TripTripCHX.untreated:relIndexTDD_t1TripTripHAR.0hTrip
  ),
  "_LR"
)
table <- merge(table, IndexTDD_tmp, by = "EnsemblID", all = TRUE)

### Lympho Activated
TDDindexes_LA <- as_tibble(IndexTDD[["LymphoA"]])
IndexTDD_tmp <- average_tdd_indexes(
  dplyr::select(
    TDDindexes_LA,
    ensemblID,
    asIndexTDD_t3TripTripCHX.untreated:relIndexTDD_t1DRBDRBHAR.0hTrip
  ),
  "_LA"
)
table <- merge(table, IndexTDD_tmp, by = "EnsemblID", all = TRUE)

### Macro Resting
TDDindexes_MR <- as_tibble(IndexTDD[["MacroR"]])
IndexTDD_tmp <- average_tdd_indexes(
  dplyr::select(
    TDDindexes_MR,
    ensemblID,
    asIndexTDD_t3TripTripCHX.untreated:relIndexTDD_t3TripTripHAR.0hTrip
  ),
  "_MR"
)
table <- merge(table, IndexTDD_tmp, by = "EnsemblID", all = TRUE)

### Macro Activated
TDDindexes_MA <- as_tibble(IndexTDD[["MacroA"]])
IndexTDD_tmp <- average_tdd_indexes(
  dplyr::select(
    TDDindexes_MA,
    ensemblID,
    asIndexTDD_t3TripTripCHX.untreated:relIndexTDD_t3TripTripHAR.0hTrip
  ),
  "_MA"
)
table <- merge(table, IndexTDD_tmp, by = "EnsemblID", all = TRUE)

##  Adding the degradation columns
##  Deg<h>hTrip_<condition> = (t0Trip - t<h>Trip) / t0Trip, computed on the
##  per-gene means, i.e. a fraction of the t0Trip value rather than a
##  percentage.

# Average t0Trip and the requested t<h>Trip columns of `tdd` per `ensemblID`,
# then return a tibble with the gene ID (part matched by the ENSMUSG pattern)
# and one "Deg<h>hTrip<suffix>" column per element of `hours`, in that order.
degraded_fraction_by_gene <- function(tdd, hours, suffix) {
  timepoints <- paste0("t", hours, "Trip")
  averaged <- summarise_all(
    group_by(tdd[, c("ensemblID", "t0Trip", timepoints)], ensemblID),
    mean
  )
  fractions <- lapply(timepoints, function(timepoint) {
    (averaged[["t0Trip"]] - averaged[[timepoint]]) / averaged[["t0Trip"]]
  })
  names(fractions) <- paste0("Deg", hours, "hTrip", suffix)
  gene_ids <- str_extract(as.character(averaged[["ensemblID"]]),
                          "ENSMUSG...........")
  as_tibble(c(list(EnsemblID = gene_ids), fractions))
}

### Lympho Resting (3h and 1h)
Deg_tmp <- degraded_fraction_by_gene(TDDindexes_LR, c(3, 1), "_LR")
table <- merge(table, Deg_tmp, by = "EnsemblID", all = TRUE)

### Lympho Activated (3h and 1h)
Deg_tmp <- degraded_fraction_by_gene(TDDindexes_LA, c(3, 1), "_LA")
table <- merge(table, Deg_tmp, by = "EnsemblID", all = TRUE)

### Macro Resting (3h only)
Deg_tmp <- degraded_fraction_by_gene(TDDindexes_MR, 3, "_MR")
table <- merge(table, Deg_tmp, by = "EnsemblID", all = TRUE)

### Macro Activated (3h only)
Deg_tmp <- degraded_fraction_by_gene(TDDindexes_MA, 3, "_MA")
table <- merge(table, Deg_tmp, by = "EnsemblID", all = TRUE)

## Cleaning the table: drop the rows that have no gene ID
table <- table[!is.na(table$EnsemblID), ]

##########################################################################################################
# Add RiboDensity  ### STOP HERE
## Step 1 : Pick the columns -> the ID column plus the columns whose name ends
## in "CDS"; on the RNAseq side only the "3h_untreated" libraries are kept so
## that the conditions match between RNAseq and RP.
RiboDensity_RP <- countsRP[, grep("(EnsemblID)|(CDS)$", colnames(countsRP))]
RiboDensity_RNAseq <- counts[, grep("(EnsemblID)|(3h_untreated.*CDS$)",
                                    colnames(counts))]

## Step 2 : Normalise every library to 1 million reads

# Rescale each column after the first so that its rows beyond `n_skipped` sum
# to one million. The first `n_skipped` rows are neither counted in the total
# nor modified (NOTE(review): presumably they are summary/statistics rows of
# the count files -- confirm). Returns the rescaled table.
#
# Each column is rescaled in a single vectorised assignment: the previous
# nested loops wrote the data frame one cell at a time, and their 2:length()
# and 6:length() ranges ran backwards on tables with fewer than 2 columns or
# 6 rows.
scale_to_million_reads <- function(count_table, n_skipped = 5L) {
  gene_rows <- seq_len(nrow(count_table))
  gene_rows <- gene_rows[gene_rows > n_skipped]
  for (column in seq_len(ncol(count_table))[-1]) {
    reads <- as.numeric(count_table[gene_rows, column])
    count_table[gene_rows, column] <- reads * (1000000 / sum(reads))
  }
  count_table
}

RiboDensity_RP <- scale_to_million_reads(RiboDensity_RP)
RiboDensity_RNAseq <- scale_to_million_reads(RiboDensity_RNAseq)

## Step 3 : Ratio of RP / RNAseq

# Pair the two normalised tables on their shared column, leaving out the first
# five rows of each (the rows that were excluded from the normalisation).
RiboDensity <- merge(RiboDensity_RP[-(1:5), ], RiboDensity_RNAseq[-(1:5), ])

# One RP / RNAseq ratio per replicate, kept numeric: three replicates for each
# of the four conditions, in the order resting lympho, activated lympho,
# LPSno macro, LPS macro.
ribo_ratios <- cbind(
  RiboDensity_Resting_lympho_1 = RiboDensity$RPCounts_Tmac.2_rT1_CDS /
    RiboDensity$RNAseqCounts_R2_3_Resting_3h_untreated_CDS,
  RiboDensity_Resting_lympho_2 = RiboDensity$RPCounts_Tmac.4_rT2_CDS /
    RiboDensity$RNAseqCounts_R3_3_Resting_3h_untreated_CDS,
  RiboDensity_Resting_lympho_3 = RiboDensity$RPCounts_Tmac.6_rT3_CDS /
    RiboDensity$RNAseqCounts_R4_3_Resting_3h_untreated_CDS,
  RiboDensity_Activated_lympho_1 = RiboDensity$RPCounts_Tmac.1_aT1_CDS /
    RiboDensity$RNAseqCounts_A2_3_Activated_3h_untreated_CDS,
  RiboDensity_Activated_lympho_2 = RiboDensity$RPCounts_Tmac.3_aT2_CDS /
    RiboDensity$RNAseqCounts_A3_3_Activated_3h_untreated_CDS,
  RiboDensity_Activated_lympho_3 = RiboDensity$RPCounts_Tmac.5_aT3_CDS /
    RiboDensity$RNAseqCounts_A4_3_Activated_3h_untreated_CDS,
  RiboDensity_LPSno_macro_1 = RiboDensity$RPCounts_Tmac.7_rM1_CDS /
    RiboDensity$RNAseqCounts_LPSno_macro_3h_untreated_i3_3_CDS,
  RiboDensity_LPSno_macro_2 = RiboDensity$RPCounts_Tmac.9_rM2_CDS /
    RiboDensity$RNAseqCounts_LPSno_macro_3h_untreated_i4_3_CDS,
  RiboDensity_LPSno_macro_3 = RiboDensity$RPCounts_Tmac.11_rM3_CDS /
    RiboDensity$RNAseqCounts_LPSno_macro_3h_untreated_i5_3_CDS,
  RiboDensity_LPS_macro_1 = RiboDensity$RPCounts_Tmac.8_aM1_CDS /
    RiboDensity$RNAseqCounts_LPS_macro_3h_untreated_m3_3_CDS,
  RiboDensity_LPS_macro_2 = RiboDensity$RPCounts_Tmac.10_aM2_CDS /
    RiboDensity$RNAseqCounts_LPS_macro_3h_untreated_m4_3_CDS,
  RiboDensity_LPS_macro_3 = RiboDensity$RPCounts_Tmac.12_aM3_CDS /
    RiboDensity$RNAseqCounts_LPS_macro_3h_untreated_m5_3_CDS
)
# A library column missing from `RiboDensity` would silently drop its ratio
# and shift the replicate groups used below, so fail loudly instead.
stopifnot(ncol(ribo_ratios) == 12L)

# Same object as before: a character matrix holding the ID followed by the
# twelve ratios.
RiboDensity2 <- cbind(EnsemblID = as.character(RiboDensity$EnsemblID),
                      ribo_ratios)

# Mean ratio per condition, with the gene ID reduced to the part matched by
# the ENSMUSG pattern. The means are computed with rowMeans() on the numeric
# ratios instead of looping over the rows of the character matrix: this
# removes the row-by-row data frame writes, the 1:nrow() range that ran
# backwards on an empty table, and the conversion of every mean to text and
# back.
replicate_mean <- function(replicate_cols) {
  rowMeans(ribo_ratios[, replicate_cols, drop = FALSE])
}
RiboDensity3 <- data.frame(
  EnsemblID = str_extract(as.character(RiboDensity$EnsemblID),
                          "ENSMUSG..........."),
  RiboDens_LymphoR = replicate_mean(1:3),
  RiboDens_LymphoA = replicate_mean(4:6),
  RiboDens_MacroR = replicate_mean(7:9),
  RiboDens_MacroA = replicate_mean(10:12),
  stringsAsFactors = FALSE
)

# EnsemblID is the only column shared by the two tables.
table <- merge(table, RiboDensity3, by = "EnsemblID", all = TRUE)
as_tibble(table)

# Purge the db: keep only the rows that have at least one non-missing value
# among the sixteen resting-lymphocyte (_LR) TDD index columns.
lr_index_cols <- c(
  "asIndexTDD_t3TripTripCHX.untreated_LR",
  "asIndexTDD_t1TripTripCHX.untreated_LR",
  "asIndexTDD_t3TripTripHAR.untreated_LR",
  "asIndexTDD_t1TripTripHAR.untreated_LR",
  "relIndexTDD_t3TripTripCHX.untreated_LR",
  "relIndexTDD_t1TripTripCHX.untreated_LR",
  "relIndexTDD_t3TripTripHAR.untreated_LR",
  "relIndexTDD_t1TripTripHAR.untreated_LR",
  "asIndexTDD_t3TripTripCHX.0hTrip_LR",
  "asIndexTDD_t1TripTripCHX.0hTrip_LR",
  "asIndexTDD_t3TripTripHAR.0hTrip_LR",
  "asIndexTDD_t1TripTripHAR.0hTrip_LR",
  "relIndexTDD_t3TripTripCHX.0hTrip_LR",
  "relIndexTDD_t1TripTripCHX.0hTrip_LR",
  "relIndexTDD_t3TripTripHAR.0hTrip_LR",
  "relIndexTDD_t1TripTripHAR.0hTrip_LR"
)
has_lr_index <- rowSums(!is.na(table[, lr_index_cols])) > 0
table <- table[has_lr_index, ]

# Activated minus resting difference of the 3h CHX index (0hTrip variant),
# for lymphocytes and for macrophages.
table$DELTAasTDDindex_Lympho <- with(
  table,
  asIndexTDD_t3TripTripCHX.0hTrip_LA - asIndexTDD_t3TripTripCHX.0hTrip_LR
)
table$DELTAasTDDindex_Macro <- with(
  table,
  asIndexTDD_t3TripTripCHX.0hTrip_MA - asIndexTDD_t3TripTripCHX.0hTrip_MR
)

# Per condition: 3h degradation fraction minus the 3h CHX index.
for (condition in c("LR", "LA", "MR", "MA")) {
  table[[paste0("asNonTDDindex_", condition)]] <-
    table[[paste0("Deg3hTrip_", condition)]] -
    table[[paste0("asIndexTDD_t3TripTripCHX.0hTrip_", condition)]]
}



# Save results: `table` is stored as an .RData file and as a comma-separated
# text file (unquoted, header line, no row names).
save(table, file = "results/database_20200113.RData")
write.table(
  x = table,
  file = "results/database_20200113.csv",
  sep = ",",
  quote = FALSE,
  col.names = TRUE,
  row.names = FALSE
)
