library(scales)
library(factoextra)
library(tibble)
library(ggplot2)
library(readr)
library(tidyr)
library(dplyr)
library(glue)

# Session-wide ggplot2 theme: theme_classic() with the panel, plot and legend
# backgrounds made transparent (no fill, no outline).
theme_set(
  theme_classic() +
    theme(
      panel.background = element_rect(fill = "transparent", colour = NA),
      plot.background = element_rect(fill = "transparent", colour = NA),
      legend.background = element_rect(fill = "transparent", colour = NA)
    )
)

# Work from the project's R script directory under the user's home directory;
# every relative path used below is resolved from here.
setwd(
  paste0(
    Sys.getenv("HOME"),
    "/code/malawi_transposon/script/R_SCRIPT/"
  )
)

# Load the project helper modules, in this order. source() evaluates each file
# in the global environment by default. Names used below but not defined in
# this script (num_of_genomes, color_schema_species, create_pa_matrix_ranked)
# are presumably provided by these files.
invisible(
  lapply(
    c(
      "modules/frequently_used_items.R",
      "modules/create_presence_absence_matrix.R"
    ),
    source
  )
)

################################################################################

# Read the preprocessed summary table. The column-type string is a fixed
# 20-column prefix followed by one six-column group ("ciicic") per genome;
# "", "NA" and "." are all read as missing values.
df <- read_csv(
  "./local/malawi_haplochromines-summary-preprocessed.csv",
  col_types = paste0(
    "cciiiiicciiicilliidc",
    strrep("ciicic", num_of_genomes)
  ),
  na = c("", "NA", ".")
)

# Keep rows with n_paths_bio of 2 or 3 whose lenbio_min and lenbio_max differ,
# restrict them to the "*_len" columns, drop rows with any missing value, and
# hand the result to the ranked presence-absence matrix builder.
df_pa_matrix <- df %>%
  filter(n_paths_bio %in% c(2, 3), lenbio_min != lenbio_max) %>%
  select(ends_with("_len")) %>%
  drop_na() %>%
  create_pa_matrix_ranked()

# Label the matrix columns with the species abbreviations.
# NOTE(review): assumes the matrix columns come back in the same order as
# color_schema_species$species_abrv -- confirm.
colnames(df_pa_matrix) <- color_schema_species$species_abrv

# Save the presence-absence matrix as CSV; a separate script builds the
# phylogenetic tree from it.
write_csv(df_pa_matrix, "./local/phylogenetic/pa_matrix_phylogenetic_ranked.csv")

################################################################################
# count shared events

# Groups of samples whose shared events are counted below. Each element is a
# character vector of df_pa_matrix column names (the loops index the matrix
# with it), so every entry has to match one of the abbreviations assigned from
# color_schema_species$species_abrv.
compared_groups <- list(
  c("mayZeb", "troMau"),
  c("astCal", "mayZeb", "troMau"),
  c("copChr", "otoArg"),
  c("copChr", "otoArg", "aulStu"),
  c("astCal", "mayZeb", "troMau", "copChr", "otoArg", "aulStu"),
  c("rhaChi", "rhaChi2"),
  c("astCal", "mayZeb", "troMau", "copChr", "otoArg", "aulStu", "rhaChi", "rhaChi2")
)

# stricter comparison: all samples in a group must carry the same variant
#
# For every group this prints
#   * the number (and percentage of all compared sites) of rows in which every
#     sample has the same value as the group's first sample, and
#   * that count split by the shared value: 0 ("shortest"), 1 ("insertion"),
#     2 ("nested"), followed by 1 and 2 combined.
# n_compared is also used by the lenient comparison further down.
n_compared <- nrow(df_pa_matrix)
for (samples_to_compare in compared_groups) {
  group_calls <- df_pa_matrix[samples_to_compare]
  group_size <- length(samples_to_compare)
  # Extract the first sample with `[[` so the reference is always a plain
  # vector. Going through as.vector() on a one-column data frame/tibble does
  # not give a plain vector, and what it does give depends on the R version.
  reference_calls <- df_pa_matrix[[samples_to_compare[1]]]
  print(
    glue(
      paste0(samples_to_compare, collapse = ","),
      ": {n_sites} ({round(n_sites/n_compared*100, 1)})",
      # A row counts when all group_size columns equal the reference. The
      # reference column always equals itself, so a row sum of 0 cannot occur;
      # %in% keeps rows containing NA out of the count instead of returning NA.
      n_sites = sum(rowSums(group_calls == reference_calls) %in% group_size)
    ),
    glue(
      "shortest: {n0} ({round(n0/n_compared*100, 1)}), insertion: {n1} ({round(n1/n_compared*100, 1)}), nested: {n2} ({round(n2/n_compared*100, 1)})",
      "\nshortest: {n0} ({round(n0/n_compared*100, 1)}), combined : {n1+n2} ({round((n1+n2)/n_compared*100, 1)})",
      n0 = sum(rowSums(group_calls == 0) == group_size),
      n1 = sum(rowSums(group_calls == 1) == group_size),
      n2 = sum(rowSums(group_calls == 2) == group_size)
    ),
    ""
  )
}

# more lenient comparison: a nested insertion (2) is treated like an
# insertion (1), so only "all 0" versus "all >= 1" is distinguished.
# Percentages are relative to n_compared, the total number of compared sites.
for (samples_to_compare in compared_groups) {
  group_label <- paste0(samples_to_compare, collapse = ",")
  group_calls <- df_pa_matrix[samples_to_compare]
  n_samples <- length(samples_to_compare)

  # rows where every sample in the group is 0 / every sample is at least 1
  n_all_shortest <- sum(rowSums(group_calls == 0) == n_samples)
  n_all_inserted <- sum(rowSums(group_calls >= 1) == n_samples)

  print(
    glue(
      group_label,
      " - shortest: {n0} ({round(n0/n_compared*100, 1)}), insertion: {n1} ({round(n1/n_compared*100, 1)})",
      n0 = n_all_shortest,
      n1 = n_all_inserted
    )
  )
}

################################################################################
# count differences between each pair of species

# Count the rows (sites) at which the given samples do not all carry the same
# value.
#
# x         Character vector of column names (samples) to compare.
# pa_matrix Data frame with one row per site and one column per sample;
#           defaults to the script-level df_pa_matrix, so existing one-argument
#           calls behave as before.
#
# Returns the number of rows in which at least one sample differs from the
# first sample in x. Rows whose comparison yields NA are counted as different.
count_different_svs <- function(x, pa_matrix = df_pa_matrix) {
  # `[[` yields the first sample as a plain vector. as.vector() on a
  # one-column data frame does not, and its result depends on the R version.
  reference_calls <- pa_matrix[[x[1]]]
  # The reference column always equals itself, so a row sum of 0 cannot occur;
  # only rows where all length(x) columns match count as identical.
  n_identical <- sum(rowSums(pa_matrix[x] == reference_calls) %in% length(x))
  nrow(pa_matrix) - n_identical
}

# List every species pair as "<speciesA>-<speciesB>:<count>". Kept as a bare
# top-level expression so the result auto-prints when the script is run.
combn(
  color_schema_species$species_abrv,
  2,
  function(x) paste0(paste(x, collapse = "-"), ":", count_different_svs(x))
)

# Mean and median of the pairwise counts with the 28th combination removed.
# NOTE(review): the original comment reads "get an average without
# Rhamphochromis", but [-28] removes a single pair by position -- presumably
# the rhaChi-rhaChi2 pair if species_abrv lists those two last; confirm.
pairwise_differences <- combn(color_schema_species$species_abrv, 2, count_different_svs)
mean(pairwise_differences[-28])
median(pairwise_differences[-28])