require(ggplot2)
require(readr)
require(cowplot)
require(dplyr)
require(stringr)   

# Load the classification scores of the k-mer ranking comparison and echo them.
scores <-
  read_csv("../results/cscores-10kSpecies_-_KRANK-rankingkmers_comparison.csv")
scores

# Fix the rank order (broadest to most specific) instead of the alphabetical
# default, so that scales and legends list the ranks in this order.
rank_order <- c(
  "kingdom", "phylum", "class", "order", "family", "genus", "species"
)
scores$Taxonomic_rank <- factor(scores$Taxonomic_rank, levels = rank_order)

# Keep only rows whose distance to the closest reference is below 0.35.
scores <- scores %>% filter(Distance_to_closest < 0.35)

# Map the verbose method descriptions found in the CSV to short display labels.
# Any Method value that is not listed here is left untouched.
method_labels <- c(
  "random k-mer ranking ~ k=32 w=35 h=12 b=16 l=2 mer-count const. (0.00)" =
    "random",
  "negative species count ranking ~ k=32 w=35 h=12 b=16 l=2 mer-count const. (0.00)" =
    "species discrim.",
  "negative child taxon count ranking ~ k=32 w=35 h=12 b=16 l=2 mer-count const. (0.00)" =
    "children discrim.",
  "positive species count ranking ~ k=32 w=35 h=12 b=16 l=2 mer-count const. (0.00)" =
    "species common",
  "positive child taxon count ranking ~ k=32 w=35 h=12 b=16 l=2 mer-count const. (0.00)" =
    "children common",
  "weighted sum of species counts w.r.t. k-mer coverages ~ k=32 w=35 h=12 b=16 l=2 mer-count const. (0.00)" =
    "taxon covering"
)
has_label <- scores$Method %in% names(method_labels)
scores$Method[has_label] <- unname(method_labels[scores$Method[has_label]])
# Panel 1: species-based rankings compared with the random baseline.
# Precision, Recall and F1 are averaged per (Method, Taxonomic_rank) pair.
scores_p1 <- scores %>%
  filter(Method %in% c("random", "species common", "species discrim.")) %>%
  # mutate(Distance_to_closest = cut(Distance_to_closest, include.lowest = TRUE, breaks = c(0, 0.001, 0.025, 0.05, 0.1, 0.2, 0.35))) %>%
  group_by(Method, Taxonomic_rank) %>%
  summarise(
    Precision = mean(Precision),
    Recall = mean(Recall),
    F1 = mean(F1)
  ) %>%
  # summarise() only drops the last grouping level; remove the rest explicitly.
  ungroup()
# Constant column whose only purpose is to provide the facet strip title.
scores_p1$expt <- "Species-based ranking (R)"
p1 <- ggplot(scores_p1) +
  aes(
    x = reorder(Method, F1),
    y = F1,
    color = Taxonomic_rank,
    shape = Method
  ) +
  geom_point(size = 5, alpha = 0.85) +
  facet_wrap(c("expt")) +
  # Colour encodes the taxonomic rank, so the legend title must say so.
  labs(shape = "Ranking", colour = "Taxonomic rank", x = "Ranking", y = "F1") +
  # One connecting line per rank across the methods on the x axis.
  geom_line(aes(group = Taxonomic_rank), color = "grey20") +
  scale_colour_brewer(palette = "Paired") +
  theme_cowplot(font_size = 14) +
  # Named values pin each method to the glyph used in the shared legend
  # instead of relying on the alphabetical order of the methods.
  scale_shape_manual(
    values = c("random" = 16, "species common" = 15, "species discrim." = 4)
  ) +
  # Wrap long method names so the tick labels do not overlap.
  scale_x_discrete(labels = function(x) str_wrap(x, width = 10)) +
  scale_y_continuous(breaks = scales::pretty_breaks(n = 5))
p1
# Panel 2: children-based rankings compared with the random baseline.
# Precision, Recall and F1 are averaged per (Method, Taxonomic_rank) pair.
scores_p2 <- scores %>%
  filter(Method %in% c("random", "children common", "children discrim.")) %>%
  # mutate(Distance_to_closest = cut(Distance_to_closest, include.lowest = TRUE, breaks = c(0, 0.001, 0.025, 0.05, 0.1, 0.2, 0.35))) %>%
  group_by(Method, Taxonomic_rank) %>%
  summarise(
    Precision = mean(Precision),
    Recall = mean(Recall),
    F1 = mean(F1)
  ) %>%
  # summarise() only drops the last grouping level; remove the rest explicitly.
  ungroup()
# Constant column whose only purpose is to provide the facet strip title.
scores_p2$expt <- "Children-based ranking (R')"
p2 <- ggplot(scores_p2) +
  aes(
    x = reorder(Method, F1),
    y = F1,
    color = Taxonomic_rank,
    shape = Method
  ) +
  geom_point(size = 5, alpha = 0.85) +
  facet_wrap(c("expt")) +
  # Colour encodes the taxonomic rank, so the legend title must say so.
  labs(shape = "Ranking", colour = "Taxonomic rank", x = "Ranking", y = "F1") +
  # One connecting line per rank across the methods on the x axis.
  geom_line(aes(group = Taxonomic_rank), color = "grey20") +
  scale_colour_brewer(palette = "Paired") +
  theme_cowplot(font_size = 14) +
  # Named values are required here: an unnamed c(8, 18, 16) is matched to the
  # alphabetically sorted methods, which gave "children common" glyph 8 and
  # "children discrim." glyph 18 -- the opposite of the shared legend
  # (children common = 18, children discrim. = 8).
  scale_shape_manual(
    values = c("children common" = 18, "children discrim." = 8, "random" = 16)
  ) +
  # Wrap long method names so the tick labels do not overlap.
  scale_x_discrete(labels = function(x) str_wrap(x, width = 10)) +
  scale_y_continuous(breaks = scales::pretty_breaks(n = 5))
p2
# Panel 3: the weighted-sum ("taxon covering") ranking compared with the
# random baseline.
# Precision, Recall and F1 are averaged per (Method, Taxonomic_rank) pair.
scores_p3 <- scores %>%
  filter(Method %in% c("random", "taxon covering")) %>%
  # mutate(Distance_to_closest = cut(Distance_to_closest, include.lowest = TRUE, breaks = c(0, 0.001, 0.025, 0.05, 0.1, 0.2, 0.35))) %>%
  group_by(Method, Taxonomic_rank) %>%
  summarise(
    Precision = mean(Precision),
    Recall = mean(Recall),
    F1 = mean(F1)
  ) %>%
  # summarise() only drops the last grouping level; remove the rest explicitly.
  ungroup()
# Constant column whose only purpose is to provide the facet strip title.
scores_p3$expt <- "Weighted sum (R*)"
p3 <- ggplot(scores_p3) +
  aes(
    x = reorder(Method, F1),
    y = F1,
    color = Taxonomic_rank,
    shape = Method
  ) +
  geom_point(size = 5, alpha = 0.85) +
  facet_wrap(c("expt")) +
  # Colour encodes the taxonomic rank, so the legend title must say so.
  labs(shape = "Ranking", colour = "Taxonomic rank", x = "Ranking", y = "F1") +
  # One connecting line per rank across the methods on the x axis.
  geom_line(aes(group = Taxonomic_rank), color = "grey20") +
  scale_colour_brewer(palette = "Paired") +
  theme_cowplot(font_size = 14) +
  # Named values pin each method to the glyph used in the shared legend
  # instead of relying on the alphabetical order of the methods.
  scale_shape_manual(values = c("random" = 16, "taxon covering" = 17)) +
  # Wrap long method names so the tick labels do not overlap.
  scale_x_discrete(labels = function(x) str_wrap(x, width = 10)) +
  scale_y_continuous(breaks = scales::pretty_breaks(n = 5))
p3

# Put the three panels side by side in a single row. Per-panel legends are
# hidden and the x-axis title is shrunk to size 0 in every panel.
prow <- plot_grid(
  p1 + theme(legend.position = "none", axis.title.x = element_text(size = 0)),
  p2 + theme(legend.position = "none", axis.title.x = element_text(size = 0)),
  p3 + theme(legend.position = "none", axis.title.x = element_text(size = 0)),
  ncol = 3,
  # The grid has three columns and one row, so the three relative sizes
  # describe column widths (they were previously passed as rel_heights).
  rel_widths = c(1, 1, 1)
)
prow

# Build a throw-away plot that contains all six ranking methods and extract
# its shape legend, which is then shared by the three panels.
# The colour guide is switched off below, so only the shape legend is kept.
# NOTE(review): depending on the installed ggplot2/cowplot versions,
# get_legend() may not pick up a legend placed at the bottom -- confirm that
# the legend shows up in the saved figure.
legend <- get_legend(
  ggplot(
    scores %>%
      mutate(
        Distance_to_closest = cut(
          Distance_to_closest,
          include.lowest = TRUE,
          breaks = c(0, 0.001, 0.025, 0.05, 0.1, 0.2, 0.35)
        )
      ) %>%
      group_by(Method, Distance_to_closest) %>%
      summarise(
        Precision = mean(Precision),
        Recall = mean(Recall),
        F1 = mean(F1)
      )
  ) +
    aes(
      x = reorder(Method, F1),
      y = F1,
      shape = Method,
      color = Distance_to_closest
    ) +
    labs(
      shape = "Ranking",
      colour = "Distance to the closest",
      x = "Ranking",
      y = "F1"
    ) +
    geom_point(size = 5, alpha = 0.85) +
    scale_colour_brewer(palette = "Paired") +
    theme_cowplot(font_size = 14) +
    # Method-to-glyph assignment that the individual panels must agree with.
    scale_shape_manual(
      values = c(
        "taxon covering" = 17,
        "children common" = 18,
        "random" = 16,
        "species common" = 15,
        "species discrim." = 4,
        "children discrim." = 8
      )
    ) +
    theme(
      legend.box.margin = margin(-6, 0, 0, 0),
      legend.position = "bottom",
      legend.justification = "center",
      legend.direction = "horizontal",
      legend.box = "vertical"
    ) +
    guides(color = "none")
)

# Stack the panel row on top of the shared legend, display the result and
# save exactly that object rather than whatever plot happens to be "last".
final_plot <- plot_grid(prow, legend, nrow = 2, rel_heights = c(3, 0.5))
final_plot
ggsave2(
  "../figures/kmer_ranking_comparison-10kSpecies.pdf",
  plot = final_plot,
  width = 9,
  height = 5
)
