# processing database for Random Forest and binning
library(tidyverse)

db <- "~/RMI2/gitlab/tdd/data/databases/current_subset.csv"

# Load the database as a tibble and cut both identifiers at their first dot,
# keeping only the text in front of it.
table <- read.csv(db) %>%
  as_tibble() %>%
  mutate(
    gene_id = gsub("(^[^.]*)(.*$)", "\\1", gene_id),
    transcript_id = gsub("(^[^.]*)(.*$)", "\\1", transcript_id)
  )

# Express every deltaG per unit of length. Each region is divided by its own
# length; X31CDS.End_DG is divided by CDS_length - 30 (presumably because that
# window starts at CDS position 31 -- confirm). A zero denominator yields
# Inf/NaN rather than an error.
table <- table %>%
  mutate(
    CDS_DG = CDS_DG / CDS_length,
    UTR5_DG = UTR5_DG / UTR5_length,
    UTR3_DG = UTR3_DG / UTR3_length,
    X31CDS.End_DG = X31CDS.End_DG / (CDS_length - 30)
  )

# GC % of each region is the sum of its G % and C % columns.
table <- table %>%
  mutate(
    UTR5_GC = UTR5_G + UTR5_C,
    UTR3_GC = UTR3_G + UTR3_C,
    CDS_GC = CDS_G + CDS_C
  )

# Replace NA with 0 in the columns where a missing value is treated as zero
# (m6A-seq peak count and score, and the n4G_* columns).
table <- table %>%
  mutate(
    n_m6ASeq_Peaks = replace(n_m6ASeq_Peaks, is.na(n_m6ASeq_Peaks), 0),
    total_m6ASeq_score = replace(total_m6ASeq_score,
                                 is.na(total_m6ASeq_score), 0),
    n4G_CDS = replace(n4G_CDS, is.na(n4G_CDS), 0),
    n4G_UTR5 = replace(n4G_UTR5, is.na(n4G_UTR5), 0),
    n4G_UTR3 = replace(n4G_UTR3, is.na(n4G_UTR3), 0)
  )

# Average the polyA tail length over the WT replicate columns of each
# condition. Missing replicates are ignored (na.rm = TRUE); a row with no
# measured replicate at all ends up as NaN.
table$Hwang.Lympho_Resting.polyA_length <- table %>%
  select("Hwang.Lympho_Resting.polyA_length_WT1",
         "Hwang.Lympho_Resting.polyA_length_WT2",
         "Hwang.Lympho_Resting.polyA_length_WT3") %>%
  rowMeans(na.rm = TRUE)

table$Hwang.Lympho_Activated.polyA_length <- table %>%
  select("Hwang.Lympho_Activated.polyA_length_WT1",
         "Hwang.Lympho_Activated.polyA_length_WT2") %>%
  rowMeans(na.rm = TRUE)

# add uORF ----------------------------------------------------------------


# Read the uORF annotation (tab-separated text file with a header line).
uORF <- read.delim("~/RMI2/gitlab/tdd/data/uORF_mouse_solene.txt",
                   header = TRUE,
                   sep = "\t")
# Only the first column is renamed; all other columns keep the names given in
# the file header (no assumption on the total number of columns).
colnames(uORF)[1] <- "TranscriptID"
uORF <- as.data.frame(uORF)

# Keep the uORFs that start with AUG and are located in the 5'UTR.
uORF2 <- uORF %>%
  filter(C_start == "AUG" & Place == "UTR5")

# "near 5'" is recoded as 0 so that Kozak scores can be summed. The recoding
# goes through character so it also works when Kozak was read as a factor,
# and %in% keeps NA entries out of the index.
kozak_score <- as.character(uORF2$Kozak)
kozak_score[kozak_score %in% "near 5'"] <- "0"
uORF2$Kozak <- as.numeric(kozak_score)

# One output row per transcript ID found in the unfiltered annotation, so a
# transcript without any AUG 5'UTR uORF gets a count of 0 and a sum of 0.
# IDs are grouped by exact equality; the previous per-ID grep() treated each
# ID as a regular expression (the "." of a versioned ID matched any character)
# and rescanned the whole table for every transcript.
all_ids <- unique(as.character(uORF[, "TranscriptID"]))
all_ids <- all_ids[!is.na(all_ids)]
kozak_by_id <- split(
  uORF2$Kozak,
  factor(as.character(uORF2[, "TranscriptID"]), levels = all_ids)
)

# NOTE(review): the column name "sumKozakuOFR" looks like a typo of "uORF" but
# is kept unchanged because other code may refer to it -- confirm before
# renaming.
uORF3 <- data.frame(
  # keep "ENSMUST" plus the 11 following characters, i.e. drop the version
  # number after the dot
  transcript_id = str_extract(all_ids, "ENSMUST..........."),
  number_uORF = as.numeric(lengths(kozak_by_id)),
  sumKozakuOFR = unname(vapply(kozak_by_id, sum, numeric(1))),
  stringsAsFactors = FALSE
)

# Left join: every row of table is kept. Transcripts absent from the uORF
# annotation (and sums that could not be computed) are set to 0.
table <- merge(table, uORF3, by = "transcript_id", all.x = TRUE)
table$number_uORF[is.na(table$number_uORF)] <- 0
table$sumKozakuOFR[is.na(table$sumKozakuOFR)] <- 0

# merge() returns a plain data.frame; convert back to a tibble.
table <- as_tibble(table)

# load GC3 percent --------------------------------------------------------
# Load helper functions; the relative path is resolved against the current
# working directory.
# NOTE(review): count_codon, list_to_df and calc_GC3_percent are not defined in
# this script -- presumably they come from this sourced file. readDNAStringSet
# is not attached here either (looks like Biostrings) -- confirm where it is
# loaded.
source(file = "src/CSC_functions.R")
# Read the sequence file (presumably mm10 CDS sequences in FASTA format --
# confirm) and count codons per sequence.
fastaFile <- readDNAStringSet("~/RMI2/gitlab/tdd/data/cds_seq_mm10.txt")
df_codon_count <- list_to_df(list_codon = count_codon(fastaFile))

GC3_percent <- calc_GC3_percent(df_codon_count = df_codon_count)
# Move the row names of the result into a "transcript_id" column so it can be
# used as join key.
# NOTE(review): assumes the row names are transcript IDs in the same format as
# table$transcript_id -- confirm. If calc_GC3_percent returns a plain vector,
# the value column may take its name from the variable passed to
# as.data.frame(), so do not rename GC3_percent without checking.
GC3_percent <- as_tibble(as.data.frame(GC3_percent), rownames = "transcript_id")

# Left join: every row of table is kept, transcripts without a GC3 value get
# NA. merge() returns a plain data.frame, so table is no longer a tibble here.
table <- merge(x = table, y = GC3_percent, by = "transcript_id", all.x = TRUE)

# Add basal expression in RPKM --------------------------------------------
# Convert the read counts of one library to RPKM
# (reads per kilobase of feature per million counted reads).
#
# readcount: numeric vector of read counts, one value per feature.
# len:       feature lengths in bases, same order as readcount. The default,
#            basal_counts$Length.gene, is only evaluated when the function is
#            called, so basal_counts must exist at that point unless len is
#            supplied explicitly.
# Returns a numeric vector of the same length as readcount. The library size
# is sum(readcount) with NA counts ignored; an NA count stays NA in the
# result.
calc_rpkm <- function(readcount, len = basal_counts$Length.gene) {
  per_kilobase <- 10^3 / len
  # TRUE instead of T: T is an ordinary variable and can be reassigned.
  per_million <- 10^6 / sum(readcount, na.rm = TRUE)
  readcount * per_kilobase * per_million
}

# Raw read counts, one column per library (path is relative to the working
# directory).
raw_counts <- read.csv("data/HTSeq_count_stats_all_libraries.csv")

# Keep the gene ID plus the exon-count columns of the 0h Triptolide and
# Activated 3h untreated libraries, and drop rows whose ID starts with "__"
# (presumably HTSeq summary counters -- confirm).
basal_counts <- raw_counts %>%
  select(Gene_id,
         matches("(0h_Triptolide.*exon$)|(Activated_3h_untreated.*exon$)")) %>%
  filter(!str_detect(Gene_id, "^__"))
basal_counts[is.na(basal_counts)] <- 0
# Remove every ".<digits>" run from the gene IDs (version suffix).
basal_counts$Gene_id <- gsub("\\.[[:digit:]]*", "", basal_counts$Gene_id)

# Library columns are tracked by name instead of by position (previously
# 2:16 / 1:15), so the code no longer silently depends on exactly 15
# libraries matching the pattern above.
count_cols <- setdiff(colnames(basal_counts), "Gene_id")

# Attach the region lengths; genes absent from table are dropped here.
# NOTE(review): if table holds several rows per gene_id, this merge repeats
# the gene's counts, which inflates the per-library totals used for RPKM and
# multiplies rows again in the final merge -- confirm table has one row per
# gene.
basal_counts <- merge(basal_counts,
                      table %>%
                        select(gene_id, Length.utr5, Length.cds, Length.utr3),
                      by.x = "Gene_id",
                      by.y = "gene_id")
basal_counts$Length.gene <- rowSums(
  basal_counts[, c("Length.utr5", "Length.cds", "Length.utr3")]
)
# Per-library totals over the retained genes (not referenced again in the
# visible part of this script).
total_counts <- colSums(basal_counts[, count_cols])

# RPKM per library. The library size used by calc_rpkm is the sum over the
# genes retained above, not over all rows of the raw count file.
RPKM_counts <- as.data.frame(apply(X = basal_counts[, count_cols],
                                   MARGIN = 2,
                                   FUN = calc_rpkm,
                                   len = basal_counts$Length.gene))

## filtering gene with only NA in all libraries
# (computed before the ID column is added, so only RPKM columns are checked)
has_value <- rowSums(!is.na(RPKM_counts)) > 0
RPKM_counts$ensemblID <- basal_counts$Gene_id
RPKM_counts <- RPKM_counts[has_value, ]

# Average the three replicates of each condition and keep only the ID and
# the per-condition means under their final column names.
# NOTE(review): the "..._6h_RPKM" column is built from the *_3h_untreated
# libraries -- confirm which label is intended.
RPKM_counts <- RPKM_counts %>%
  mutate(
    LPS_macro = (LPS_macro_0h_Triptolide_m3_4_exon +
                   LPS_macro_0h_Triptolide_m4_4_exon +
                   LPS_macro_0h_Triptolide_m5_4_exon) / 3,
    LPSno_macro = (LPSno_macro_0h_Triptolide_i3_4_exon +
                     LPSno_macro_0h_Triptolide_i4_4_exon +
                     LPSno_macro_0h_Triptolide_i5_4_exon) / 3,
    Act_lympho = (A4_7_Activated_0h_Triptolide_exon +
                    A3_7_Activated_0h_Triptolide_exon +
                    A2_7_Activated_0h_Triptolide_exon) / 3,
    Res_lympho = (R3_4_Resting_0h_Triptolide_exon +
                    R2_4_Resting_0h_Triptolide_exon +
                    R4_4_Resting_0h_Triptolide_exon) / 3,
    Act_lympho_6h = (A2_3_Activated_3h_untreated_exon +
                       A3_3_Activated_3h_untreated_exon +
                       A4_3_Activated_3h_untreated_exon) / 3
  ) %>%
  select(
    gene_id = ensemblID,
    basal_expr_Lympho_Resting_RPKM = Res_lympho,
    basal_expr_Lympho_Activated_RPKM = Act_lympho,
    basal_expr_Macro_Resting_RPKM = LPSno_macro,
    basal_expr_Macro_Activated_RPKM = LPS_macro,
    basal_expr_Lympho_Activated_6h_RPKM = Act_lympho_6h
  )

# Inner join: rows of table without RPKM values are dropped (unlike the left
# joins used for the uORF and GC3 columns).
table <- merge(table, RPKM_counts, by = "gene_id")

# Save the database -------------------------------------------------------
# Write the processed table to disk as CSV. Row names are written as well,
# since write.csv's default row.names = TRUE is left untouched.
write.csv(
  x = table,
  file = "~/RMI2/gitlab/tdd/data/databases/2020-03-26_07-39-50_Subset_Data_processed.csv"
)
