##########################################################################
### Data Preprocessing of epinano results for eligos2 identified sites ###
##########################################################################

######################
### Load Packages ####
######################

pkgs <- c("backports","tidyverse","here","skimr","dplyr", "ggplot2", "ggsci","ggforce",
          "janitor","readxl","xlsx", "MetBrewer","ggrepel", "usethis", "ggpubr")

lapply(pkgs, library, character.only = TRUE)


# Task 1.1: Store the directories that hold the per-position information
#
# The paths are stored directly. setwd() returns the *previous* working
# directory, so assigning its result would store the wrong path in both
# variables (and change the working directory as a side effect).

dir_default <- here("data","epinano_human_eligos2_sites","hac")

dir_ivt <- here("data","epinano_human_eligos2_sites","ivt_hac")

##############################################################################
# Task 1.2: save all file names in specified directory as a character vector #
##############################################################################

# File names only (no directory part); they are combined with the directory
# again when the files are read.
file_list_default <- list.files(path = dir_default)

file_list_ivt <- list.files(path = dir_ivt)

################################################################
# Task 1.4: Function for reading in all files from a directory #
################################################################

# Read every file of a directory, add a column `sample` that contains the name
# of the file each row came from (file name without ".csv") and stack all
# tables into one data frame.
# Comment: Could be sped up with fread and data.tables
#
# file_list: character vector with the names of the files inside `work_dir`
# work_dir:  directory that contains the files
# Returns:   one data frame with the rows of all files plus the `sample` column;
#            an empty data frame when `file_list` is empty

read_dir <- function(file_list, work_dir){
  
  # Read via the full path instead of setwd(), so the working directory of the
  # calling session is left untouched
  tables <- lapply(file_list, function(file_name){
    # Column types are guessed per file here; note from the original author:
    # specifying col_types is essential to see spike_ins
    temp_data <- read_csv(file.path(work_dir, file_name))
    # Remember which file each row of data came from
    temp_data$sample <- gsub("\\.csv", "", file_name)
    temp_data
  })
  
  # Bind all tables in a single rbind() call instead of growing the result
  # inside a loop. The leading empty data frame keeps the result a data frame
  # even if no file was read.
  dataset <- do.call(rbind, c(list(data.frame()), unname(tables)))
  
  return(dataset)
}


# Stack all per-position tables of the default run into one data frame
default_raw <- read_dir(file_list_default, dir_default)

# Same for the IVT run
ivt_raw <- read_dir(file_list_ivt, dir_ivt)

############################
#### 2: Data Wrangling #####
############################
  

#######################################################################################
# Task 2.1: Function to generate new columns, calculate parameters and rename samples #
#######################################################################################

# Clean a raw per-position table and derive the error metrics.
#
# df:      data frame as produced by read_dir(); the code below uses the columns
#          sample, mis, ins, del, `#Ref`, base, pos and strand
# Returns: an ungrouped table without the "ko" rows in which `sample` is split
#          into model / genotype / strand_info and the columns c_mis, sumerr,
#          delta_sumerr and id are added
process_data <- function(df){
  
  df_clean <- df %>% 
    
    # Split file name into individual columns
    separate(col = sample, into = c("model","genotype_strand"), sep = "-") %>% 
    separate(col = genotype_strand, into = c("genotype","strand_info"), sep = "\\.") %>% 
    
    # Calculate corrected mismatch frequency so that sum of mis-,ins-,del- and match frequency = 1
    mutate(c_mis = mis * (1-(del + ins))) %>% 
    
    # Calculate SumErr
    mutate(sumerr = rowSums(cbind(as.numeric(c_mis), as.numeric(ins), as.numeric(del)))) %>% 
    
    # Calculate delta SumErr per position (one group per reference/base/position/strand)
    group_by(`#Ref`,base,pos,strand) %>% 
    filter(!pos == "133292612") %>% # positions w/o match in one of the genotypes-> non-eligos2 sites missed when filtering
    filter(!(pos == "30723445" & `#Ref` == "16")) %>%
    # Drop every remaining site without a "ko" row: the lookup below would
    # return a zero-length vector for such a group and abort the mutate()
    filter(any(genotype == "ko")) %>% 
    mutate(delta_sumerr = sumerr - sumerr[genotype == "ko"]) %>% 
    
    # Finally remove the 'negative control' from final table
    filter(!genotype == "ko") %>% 
    
    # Grouping was only needed for the delta; do not leak it to the caller
    ungroup() %>% 
    
    # Generate an ID column to overlap on
    unite("id",`#Ref`,pos,strand, remove = FALSE)
  
  
  return(df_clean)
  
}

# Derive the error metrics for the default run
default_clean <- default_raw %>% process_data()

# Derive the error metrics for the IVT run
ivt_clean <- ivt_raw %>% process_data()

################################################################################################
# Task 2.2: Annotate table with eligos2 hits information, motifs, and GLORI m6A quantification #
################################################################################################

# load the kmer + model hit info
# The file is read without a header line; its three columns are named
# id, 5mer and model_hit.

motifs <- read_table(here("data","eligos_2_human","eligos_2_kmer_info.txt"), col_names = FALSE) %>% 
  rename(id = X1,`5mer` = X2,model_hit = X3)


# load the GLORI hits used for Venn
# The file is read without a header line, so columns are named X1, X2, ...

glori <- read_tsv(here("known_sites","human_m6A", "filtered","GLORI_min_cov20_3models_protein_coding_only.bed"), col_names = FALSE)

glori <- glori %>% 
  # Generate column for overlaps
  unite("id",X1,X3,X6, sep = "_") %>%
  # calculate mean across replicates (columns X9 to X10), one row at a time
  rowwise() %>% 
  mutate(mean_glori_score = mean(c_across(X9:X10))) %>% 
  # Undo rowwise() so the table is a plain, ungrouped data frame again
  ungroup() %>% 
  select(id, mean_glori_score)


######################################
## Task 2.3: Generate a mastertable ##
######################################

# Annotate a cleaned table with the motif / model-hit information and the
# GLORI score, keeping only the rows that belong to the requested model.
#
# df:      table as returned by process_data() (needs the columns id and model)
# model:   single string; rows are kept when both the `model` column and the
#          `model_hit` column equal this value
# Returns: the filtered table with the columns of `motifs`, the factor
#          is_DRACH (levels 1, 0) and mean_glori_score added
# Uses the tables `motifs` and `glori` from the calling environment.
annotate_data <- function(df,model){
  
  df_final <- df %>% 
    # Overlap with motif information and filter for correct model
    left_join(motifs, by = join_by(id == id)) %>% 
    # `df` has a column called `model` as well, so a bare `model` refers to
    # that column inside filter(); .env$model is the function argument
    filter(model == .env$model & model_hit == .env$model) %>% 
    
    # add column that states whether kmer is DRACH or not
    mutate(is_DRACH = factor(ifelse(str_detect(`5mer`, "^[AGT][GA][A][C][ACT]$"), 1, 0), levels = c(1,0))) %>% 
    
    # add glori information
    left_join(glori, by = join_by(id == id))
  
  return(df_final)
  
}

# Annotate the default run
default_clean_annot <- default_clean %>% annotate_data(model = "default")

# Annotate the IVT run
ivt_clean_annot <- ivt_clean %>% annotate_data(model = "ivt")

##################################################
# Task 2.4: Write Table to processed data folder #
##################################################


# Default: tab-separated export of the annotated table

write_tsv(
  x = default_clean_annot,
  file = here("data","epinano_human_eligos2_sites","processed","default_processed.txt")
)


# IVT: tab-separated export of the annotated table

write_tsv(
  x = ivt_clean_annot,
  file = here("data","epinano_human_eligos2_sites","processed","ivt_processed.txt")
)










