if(!require(optparse)) install.packages("optparse")
if(!require(dplyr)) install.packages("dplyr")
if(!require(ggplot2)) install.packages("ggplot2")
if(!require(scales)) install.packages("scales")
if(!require(ggsci)) install.packages("ggsci")
if(!require(gridExtra)) install.packages("gridExtra")
if(!require(ggpubr)) install.packages("ggpubr")
if(!require(DescTools)) install.packages("DescTools")
if(!require(ggh4x)) install.packages("ggh4x")
if(!require(lemon)) install.packages("lemon")
if(!require(tidyr)) install.packages("tidyr")
if(!require(ggpp)) install.packages("ggpp")

library(optparse)
library(dplyr)
library(ggplot2)
library(scales)
library(ggsci)
library(grid)
library(gridExtra)
library(DescTools)
library(ggpubr)
library(ggh4x)
library(lemon)
library(tidyr)
library(tibble)
library(ggpp)

# Taxonomic ranks used as the factor level order of the fill legend in the
# read-utilization plot.
all_tax_levels <- c(
  "Class",
  "Order",
  "Family",
  "Genus",
  "Species"
)

# Ranks that are kept (and faceted, in this order) in the metric plots.
tax_levels <- c(
  "Species",
  "Genus",
  "Family"
)

# Dataset labels, in the facet order of the read-utilization plot.
all_ds <- c(
  "Sim-100",
  "HiFi_D6331",
  "ZymoR103",
  "ZymoQ20"
)

# Dataset labels used to filter the metric plots and to drive the
# chi-squared loops.
real_ds <- c(
  "ZymoR103",
  "ZymoQ20",
  "HiFi_D6331"
)

# Metric names retained when filtering the long-format metrics table.
metrics <- c(
  "Precision",
  "Recall",
  "F1-Score",
  "F0.5-Score"
)

#' Write precision-vs-recall scatter plots for the benchmark datasets.
#'
#' Reads a tab-separated metrics file, normalises the dataset and level
#' labels and writes two PDFs, each faceted by the levels in `tax_levels`:
#' one for the datasets listed in `real_ds` and one for the dataset
#' "Illumina_D6300-groundTruth".
#'
#' @param input_file Path to a tab-separated file with a header line. The
#'   columns `Dataset`, `Level`, `Tool`, `Precision` and `Recall` are used.
#' @param lr_output_file Path of the PDF written for the `real_ds` datasets.
#' @param sr_output_file Path of the PDF written for the Illumina dataset.
#' @return `NULL`, invisibly. Called for its side effect of writing two PDFs.
create_real_ds_plot <- function(input_file, lr_output_file, sr_output_file)
{
  ds <- read.csv(input_file, header = TRUE, sep = "\t", quote = "\"",
                 dec = ".", fill = TRUE, comment.char = "")

  ds <- ds %>%
    mutate(
      Dataset = case_when(
        Dataset == "refseq-abfv-sim100" ~ "Sim-100",
        Dataset == "ZymoR103-groundTruth" ~ "ZymoR103",
        Dataset == "ZymoQ20-groundTruth" ~ "ZymoQ20",
        Dataset == "HiFi_D6331-groundTruth" ~ "HiFi_D6331",
        TRUE ~ Dataset
      ),
      # Unrecognised levels keep their own label. The fallback used to be the
      # dataset name, which silently overwrote the level.
      Level = case_when(
        Level == "species" ~ "Species",
        Level == "genus" ~ "Genus",
        Level == "family" ~ "Family",
        Level == "order" ~ "Order",
        Level == "class" ~ "Class",
        TRUE ~ Level
      )
    )

  # Builds one precision/recall scatter plot with one panel per level.
  # `shape_mapping` selects the column that drives the point shape and
  # `x_min` is the lower limit (and first break) of the recall axis.
  build_precision_recall_plot <- function(plot_data, shape_mapping,
                                          shape_values, x_min) {
    ggplot(plot_data, aes(x = Recall, y = Precision, colour = Tool)) +
      geom_point(mapping = shape_mapping, size = 4) +
      scale_shape_manual(values = shape_values) +
      scale_color_npg() +
      scale_y_continuous(breaks = seq(0.7, 1.0, 0.1), limits = c(0.7, 1.0)) +
      scale_x_continuous(breaks = seq(x_min, 1.0, 0.1),
                         limits = c(x_min, 1.0)) +
      labs(title = "", subtitle = "", x = "Recall", y = "Precision") +
      theme_minimal() +
      theme(
        legend.title = element_blank(),
        legend.position = "bottom",
        legend.text = element_text(size = 14),
        plot.title.position = "plot",
        plot.title = element_text(size = 20, face = "bold"),
        plot.subtitle = element_text(size = 18, face = "bold", hjust = 0.5),
        axis.text = element_text(size = 12),
        axis.title = element_text(size = 14),
        strip.text = element_text(size = 16, face = "bold")
      ) +
      facet_nested(~factor(Level, levels = tax_levels))
  }

  # Saves a plot as an A4-landscape PDF. ggsave() opens and closes its own
  # graphics device, so no dev.off() must follow: with no other device open
  # dev.off() fails with "cannot shut down device 1 (the null device)".
  save_a4_pdf <- function(plot, path) {
    ggsave(filename = path,
           plot = plot,
           device = cairo_pdf,
           width = 297,
           height = 210,
           units = "mm")
  }

  lr_data <- ds %>%
    filter(Dataset %in% real_ds) %>%
    filter(Level %in% tax_levels)

  precision_recall_lr <- build_precision_recall_plot(
    lr_data,
    shape_mapping = aes(shape = Dataset),
    shape_values = c(15, 16, 17),
    x_min = 0.2
  )
  save_a4_pdf(precision_recall_lr, lr_output_file)

  # The Illumina label is not renamed above, so filter on the raw name.
  sr_data <- ds %>%
    filter(Dataset == "Illumina_D6300-groundTruth") %>%
    filter(Level %in% tax_levels)

  precision_recall_sr <- build_precision_recall_plot(
    sr_data,
    shape_mapping = aes(shape = Tool),
    shape_values = c(15, 16, 17, 18, 19),
    x_min = 0.1
  )
  save_a4_pdf(precision_recall_sr, sr_output_file)

  invisible(NULL)
}


#' Write the species-level average metric plots.
#'
#' Averages `Value` per tool and metric over all rows with
#' `Level == "Species"` and writes two PDFs: average recall against average
#' precision, and the average F1 and F0.5 scores side by side.
#'
#' @param df Long-format data frame with the columns `Tool`, `Level`,
#'   `Metric` and `Value`. The metrics `Precision`, `Recall`, `F1-Score` and
#'   `F0.5-Score` must be present for the plots to be built.
#' @param recall_precision_output_file Path of the precision/recall PDF.
#' @param f_score_output_file Path of the F-score PDF.
#' @return `NULL`, invisibly. Called for its side effect of writing two PDFs.
create_avg_plots <- function(df, recall_precision_output_file, f_score_output_file)
{
  # Use the data frame passed by the caller. The argument used to be
  # overwritten with the global `real_data`, which made it meaningless.
  species_avg <- df %>%
    filter(Level == "Species") %>%
    group_by(Tool, Metric) %>%
    summarise(avg = mean(Value), .groups = "drop") %>%
    pivot_wider(names_from = Metric, values_from = avg)

  # Saves a plot as an A4-landscape PDF. ggsave() manages its own graphics
  # device, so no dev.off() follows; an extra dev.off() errors when no other
  # device is open.
  save_a4_pdf <- function(plot, path) {
    ggsave(filename = path,
           plot = plot,
           device = cairo_pdf,
           width = 297,
           height = 210,
           units = "mm")
  }

  # Builds the per-tool dot plot for one averaged F-score column.
  build_f_score_plot <- function(score_column, panel_title, panel_subtitle) {
    ggplot(species_avg, aes(x = Tool, y = .data[[score_column]])) +
      geom_point(aes(colour = Tool), size = 4) +
      scale_color_npg() +
      scale_y_continuous(breaks = seq(0.0, 1.0, 0.2), limits = c(0, 1)) +
      labs(title = panel_title, subtitle = panel_subtitle,
           x = "", y = score_column) +
      theme_minimal() +
      theme(
        legend.title = element_blank(),
        legend.position = "none",
        plot.title.position = "plot",
        plot.title = element_text(size = 20, face = "bold"),
        plot.subtitle = element_text(size = 18, face = "bold", hjust = 0.5),
        axis.text.x = element_text(angle = 45, hjust = 1, size = 14),
        axis.text.y = element_text(size = 12),
        axis.title = element_text(size = 14)
      )
  }

  prec_recall_plot <-
    ggplot(species_avg, aes(x = Precision, y = Recall)) +
    geom_point(aes(colour = Tool), size = 4) +
    scale_color_npg() +
    scale_y_continuous(breaks = seq(0.0, 1.0, 0.2), limits = c(0, 1)) +
    scale_x_continuous(breaks = seq(0.0, 1.0, 0.2), limits = c(0, 1)) +
    # Dashed diagonal marks precision == recall.
    geom_abline(intercept = 0, slope = 1, linetype = "dashed") +
    labs(title = "(A)",
         subtitle = "Average Precision vs. Average Recall \n Species Level") +
    theme_minimal() +
    theme(
      legend.title = element_blank(),
      legend.position = "bottom",
      legend.text = element_text(size = 18),
      axis.text = element_text(size = 14),
      axis.title = element_text(size = 14),
      plot.title.position = "plot",
      plot.title = element_text(size = 20, face = "bold"),
      plot.subtitle = element_text(size = 18, face = "bold", hjust = 0.5)
    )

  save_a4_pdf(prec_recall_plot, recall_precision_output_file)

  f1_plot <- build_f_score_plot(
    "F1-Score", "(B)", "Average F1-Score \n Species Level"
  )
  f0.5_plot <- build_f_score_plot(
    "F0.5-Score", "(C)", "Average F0.5-Score \n Species Level"
  )

  f_score_plot <- ggarrange(f1_plot, f0.5_plot, nrow = 1, ncol = 2)
  save_a4_pdf(f_score_plot, f_score_output_file)

  invisible(NULL)
}

#' Build the scatter plot of predicted vs. theoretical taxonomic abundance.
#'
#' Reads a tab-separated abundance table, pairs every tool's abundance with
#' the "Theoretical" abundance of the same dataset and species, and plots
#' the pairs together with a small table of per-tool L1 distances.
#'
#' @param tax_ab_file Path to a tab-separated file with a header line. The
#'   columns `Dataset`, `Species`, `Tool` and `Abundance` are used;
#'   `Abundance` is divided by 100 before plotting.
#' @return The ggplot object (panel "A").
taxonomic_abundance_plot <- function(tax_ab_file)
{
  compared_tools <- c("Taxor", "Ganon", "KMCP", "Sourmash")

  raw_table <- read.csv(tax_ab_file, header = TRUE, sep = "\t", quote = "\"",
                        dec = ".", fill = TRUE, comment.char = "")

  abundances <- raw_table %>%
    mutate(
      Dataset = case_when(
        Dataset == "ZymoR103-groundTruth" ~ "ZymoR103",
        Dataset == "ZymoQ20-groundTruth" ~ "ZymoQ20",
        Dataset == "HiFi_D6331-groundTruth" ~ "HiFi_D6331",
        Dataset == "Illumina_D6300-groundTruth" ~ "Illumina_D6300",
        TRUE ~ Dataset
      ),
      Tool = case_when(
        Tool == "theoretical" ~ "Theoretical",
        TRUE ~ Tool
      ),
      Abundance = Abundance / 100
    )

  expected <- abundances %>%
    filter(Tool == "Theoretical") %>%
    select(-Tool)

  # One joined table per tool: Abundance.x is the tool's value, Abundance.y
  # the theoretical value of the same dataset and species.
  paired_by_tool <- lapply(compared_tools, function(tool_name) {
    abundances %>%
      filter(Tool == tool_name) %>%
      select(-Tool) %>%
      inner_join(expected, by = c("Dataset" = "Dataset", "Species" = "Species")) %>%
      mutate(Tool = tool_name)
  })

  vs <- do.call(rbind, paired_by_tool)

  # Summed absolute difference plus (3 - total predicted abundance),
  # rounded to two digits.
  # NOTE(review): the constant 3 presumably stands for three datasets whose
  # abundances each sum to 1 -- confirm.
  l1_values <- vapply(paired_by_tool, function(paired) {
    abs_diff <- abs(paired$Abundance.x - paired$Abundance.y)
    round(sum(abs_diff) + (3 - sum(paired$Abundance.x)), digits = 2)
  }, numeric(1))

  l1_table <- data.frame(Tool = compared_tools, L1 = l1_values)
  l1_tb <- tibble(x = 0.0, y = 0.3, tb = list(l1_table))

  axis_breaks <- seq(0.0, 0.3, 0.05)
  axis_limits <- c(0, 0.3)

  ggplot(vs, aes(x = Abundance.x, y = Abundance.y, colour = Tool,
                 group = interaction(Dataset, Species))) +
    geom_point(size = 4, aes(shape = Tool)) +
    scale_shape_manual(values = c(0, 1, 2, 4)) +
    scale_color_manual(values = pal_simpsons("springfield")(5)[c(1, 2, 4, 5)]) +
    scale_y_continuous(breaks = axis_breaks, limits = axis_limits) +
    scale_x_continuous(breaks = axis_breaks, limits = axis_limits) +
    labs(title = "A",
         subtitle = "Taxonomic abundance on species level \n  Theoretical vs. Predicted",
         x = "Predicted", y = "Theoretical") +
    theme_minimal() +
    theme(
      legend.title = element_blank(),
      legend.position = "bottom",
      legend.text = element_text(size = 14),
      plot.title.position = "plot",
      plot.title = element_text(size = 20, face = "bold"),
      plot.subtitle = element_text(size = 18, face = "bold", hjust = 0.5),
      axis.text = element_text(size = 12),
      axis.title = element_text(size = 14)
    ) +
    # Dashed diagonal marks predicted == theoretical.
    geom_abline(intercept = 0, slope = 1, linetype = "dashed") +
    geom_table(data = l1_tb, aes(x = 0.0, y = 0.3, label = tb),
               table.theme = ttheme_gtlight, size = 5.0)
}

#' Build the scatter plot of predicted vs. theoretical sequence abundance.
#'
#' Reads a tab-separated abundance table, pairs every tool's abundance with
#' the "Theoretical" abundance of the same dataset and species, and plots
#' the pairs together with a small table of per-tool L1 distances.
#'
#' @param seq_ab_file Path to a tab-separated file with a header line. The
#'   columns `Dataset`, `Species`, `Tool` and `Abundance` are used;
#'   `Abundance` is divided by 100 and rounded to three digits.
#' @return The ggplot object (panel "B").
sequence_abundance_plot <- function(seq_ab_file)
{
  compared_tools <- c("Taxor", "MetaMaps", "Kraken2", "Centrifuge")

  raw_table <- read.csv(seq_ab_file, header = TRUE, sep = "\t", quote = "\"",
                        dec = ".", fill = TRUE, comment.char = "")

  abundances <- raw_table %>%
    mutate(
      Dataset = case_when(
        Dataset == "ZymoR103-groundTruth" ~ "ZymoR103",
        Dataset == "ZymoQ20-groundTruth" ~ "ZymoQ20",
        Dataset == "HiFi_D6331-groundTruth" ~ "HiFi_D6331",
        Dataset == "Illumina_D6300-groundTruth" ~ "Illumina_D6300",
        TRUE ~ Dataset
      ),
      Tool = case_when(
        Tool == "theoretical" ~ "Theoretical",
        TRUE ~ Tool
      ),
      Abundance = round(Abundance / 100, digits = 3)
    )

  expected <- abundances %>%
    filter(Tool == "Theoretical") %>%
    select(-Tool)

  # One joined table per tool: Abundance.x is the tool's value, Abundance.y
  # the theoretical value of the same dataset and species.
  paired_by_tool <- lapply(compared_tools, function(tool_name) {
    abundances %>%
      filter(Tool == tool_name) %>%
      select(-Tool) %>%
      inner_join(expected, by = c("Dataset" = "Dataset", "Species" = "Species")) %>%
      mutate(Tool = tool_name)
  })

  vs <- do.call(rbind, paired_by_tool)

  # Summed absolute difference plus (3 - total predicted abundance),
  # rounded to two digits.
  # NOTE(review): the constant 3 presumably stands for three datasets whose
  # abundances each sum to 1 -- confirm.
  l1_values <- vapply(paired_by_tool, function(paired) {
    abs_diff <- abs(paired$Abundance.x - paired$Abundance.y)
    round(sum(abs_diff) + (3 - sum(paired$Abundance.x)), digits = 2)
  }, numeric(1))

  l1_table <- data.frame(Tool = compared_tools, L1 = l1_values)
  l1_tb <- tibble(x = 0.0, y = 0.3, tb = list(l1_table))

  axis_breaks <- seq(0.0, 0.3, 0.05)
  axis_limits <- c(0, 0.3)

  ggplot(vs, aes(x = Abundance.x, y = Abundance.y, colour = Tool,
                 group = interaction(Dataset, Species))) +
    geom_point(size = 4, aes(shape = Tool)) +
    scale_shape_manual(values = c(0, 1, 2, 4)) +
    scale_color_manual(values = pal_simpsons("springfield")(5)[c(1, 2, 4, 5)]) +
    scale_y_continuous(breaks = axis_breaks, limits = axis_limits) +
    scale_x_continuous(breaks = axis_breaks, limits = axis_limits) +
    labs(title = "B",
         subtitle = "Sequence abundance on species level \nTheoretical vs. Predicted",
         x = "Predicted", y = "Theoretical") +
    theme_minimal() +
    theme(
      legend.title = element_blank(),
      legend.position = "bottom",
      legend.text = element_text(size = 14),
      plot.title.position = "plot",
      plot.title = element_text(size = 20, face = "bold"),
      plot.subtitle = element_text(size = 18, face = "bold", hjust = 0.5),
      axis.text = element_text(size = 12),
      axis.title = element_text(size = 14)
    ) +
    # Dashed diagonal marks predicted == theoretical.
    geom_abline(intercept = 0, slope = 1, linetype = "dashed") +
    geom_table(data = l1_tb, aes(x = 0.0, y = 0.3, label = tb),
               table.theme = ttheme_gtlight, size = 5.0)
}

#' Write the combined taxonomic/sequence abundance figure.
#'
#' Builds the taxonomic abundance plot and the sequence abundance plot,
#' arranges them side by side and saves the result as an A4-landscape PDF.
#'
#' @param taxonomic_abundance_file Path passed to `taxonomic_abundance_plot()`.
#' @param seq_abundance_file Path passed to `sequence_abundance_plot()`.
#' @param ab_plot_output_file Path of the PDF to write.
#' @return `NULL`, invisibly. Called for its side effect of writing a PDF.
create_abundance_plots <- function(taxonomic_abundance_file, seq_abundance_file, ab_plot_output_file)
{
  # Use the path passed by the caller. The argument used to be overwritten
  # with the global `sequence_abundance_file`, which made it meaningless and
  # the function unusable without that global.
  seq_ab_plot <- sequence_abundance_plot(seq_abundance_file)
  tax_ab_plot <- taxonomic_abundance_plot(taxonomic_abundance_file)

  ab_plot <- ggarrange(tax_ab_plot, seq_ab_plot, nrow = 1, ncol = 2)

  # ggsave() opens and closes its own graphics device, so no dev.off()
  # follows; an extra dev.off() errors when no other device is open.
  ggsave(filename = ab_plot_output_file,
         plot = ab_plot,
         device = cairo_pdf,
         width = 297,
         height = 210,
         units = "mm")

  invisible(NULL)
}

#' Chi-squared comparison of predicted and theoretical taxonomic abundances.
#'
#' For every dataset in `real_ds` and every tool (Taxor, Ganon, KMCP) the
#' statistic sum((predicted - theoretical)^2 / theoretical) is computed over
#' the species shared with the theoretical profile plus one "others" row
#' holding the remaining abundance (1 - sum) of each column.
#'
#' @param tax_ab_file Path to a tab-separated file with a header line. The
#'   columns `Dataset`, `Species`, `Tool` and `Abundance` are used;
#'   `Abundance` is divided by 100 and rounded to three digits.
#' @return A data frame with one row per dataset and tool and the columns
#'   `Dataset`, `Tool`, `chi_squared_value`, `df`, `crit_value` (0.05 upper
#'   quantile) and `p_value`.
tax_abundance_chi_square <- function(tax_ab_file)
{
  tools <- c("Taxor", "Ganon", "KMCP")

  tax_abundance_ds <- read.csv(tax_ab_file, header = TRUE, sep = "\t",
                               quote = "\"", dec = ".", fill = TRUE,
                               comment.char = "")
  ds <- tax_abundance_ds %>%
    mutate(
      Dataset = case_when(
        Dataset == "ZymoR103-groundTruth" ~ "ZymoR103",
        Dataset == "ZymoQ20-groundTruth" ~ "ZymoQ20",
        Dataset == "HiFi_D6331-groundTruth" ~ "HiFi_D6331",
        TRUE ~ Dataset
      ),
      Tool = case_when(
        Tool == "theoretical" ~ "Theoretical",
        TRUE ~ Tool
      ),
      Abundance = round(Abundance / 100, digits = 3)
    )

  # Wide table: one abundance column per tool next to `Theoretical`,
  # restricted to species present in every profile (inner joins).
  df <- ds %>%
    filter(Tool == "Theoretical") %>%
    select(-Tool) %>%
    rename(Theoretical = Abundance)

  for (tool in tools)
  {
    df <- ds %>%
      filter(Tool == tool) %>%
      select(-Tool) %>%
      rename(!!tool := Abundance) %>%
      inner_join(df, by = c("Dataset", "Species"))
  }

  test_results <- data.frame(Dataset = character(),
                             Tool = character(),
                             chi_squared_value = double(),
                             df = integer(),
                             crit_value = double(),
                             p_value = double())

  for (ds_name in real_ds)
  {
    print(ds_name)
    this_ds <- df %>%
      filter(Dataset == ds_name)

    # Remaining abundance not covered by the listed species.
    row_to_add <- this_ds %>%
      summarise(across(all_of(c(tools, "Theoretical")), ~ 1 - sum(.x))) %>%
      mutate(Dataset = ds_name, Species = "others")

    # Floor the expected values to avoid dividing by zero. A floor is used
    # instead of `== 0` because `1 - sum(...)` can come out as a tiny
    # non-zero (even negative) value due to floating-point error.
    this_ds <- this_ds %>%
      bind_rows(row_to_add) %>%
      mutate(Theoretical = pmax(Theoretical, 0.00001))

    for (tool in tools)
    {
      observed <- this_ds[[tool]]
      expected <- this_ds$Theoretical

      x2 <- sum((observed - expected)^2 / expected)
      degfree <- nrow(this_ds) - 1
      p <- pchisq(q = x2, df = degfree, lower.tail = FALSE)
      q <- qchisq(p = .05, df = degfree, lower.tail = FALSE)

      test_results <- test_results %>%
        add_row(Dataset = ds_name, Tool = tool, df = degfree,
                chi_squared_value = x2, crit_value = q, p_value = p)
    }
  }

  test_results
}

#' Chi-squared comparison of predicted and theoretical sequence abundances.
#'
#' For every dataset in `real_ds` and every tool (Taxor, MetaMaps, Kraken2,
#' Centrifuge) the statistic sum((predicted - theoretical)^2 / theoretical)
#' is computed over the species shared with the theoretical profile plus one
#' "others" row holding the remaining abundance (1 - sum) of each column.
#'
#' @param seq_ab_file Path to a tab-separated file with a header line. The
#'   columns `Dataset`, `Species`, `Tool` and `Abundance` are used;
#'   `Abundance` is divided by 100 and rounded to three digits.
#' @return A data frame with one row per dataset and tool and the columns
#'   `Dataset`, `Tool`, `chi_squared_value`, `df`, `crit_value` (0.05 upper
#'   quantile) and `p_value`.
seq_abundance_chi_square <- function(seq_ab_file)
{
  tools <- c("Taxor", "MetaMaps", "Kraken2", "Centrifuge")

  # Read the file passed by the caller. The argument used to be overwritten
  # with the global `sequence_abundance_file`.
  seq_abundance_ds <- read.csv(seq_ab_file, header = TRUE, sep = "\t",
                               quote = "\"", dec = ".", fill = TRUE,
                               comment.char = "")
  ds <- seq_abundance_ds %>%
    mutate(
      Dataset = case_when(
        Dataset == "ZymoR103-groundTruth" ~ "ZymoR103",
        Dataset == "ZymoQ20-groundTruth" ~ "ZymoQ20",
        Dataset == "HiFi_D6331-groundTruth" ~ "HiFi_D6331",
        TRUE ~ Dataset
      ),
      Tool = case_when(
        Tool == "theoretical" ~ "Theoretical",
        TRUE ~ Tool
      ),
      Abundance = round(Abundance / 100, digits = 3)
    )

  # Wide table: one abundance column per tool next to `Theoretical`,
  # restricted to species present in every profile (inner joins).
  df <- ds %>%
    filter(Tool == "Theoretical") %>%
    select(-Tool) %>%
    rename(Theoretical = Abundance)

  for (tool in tools)
  {
    df <- ds %>%
      filter(Tool == tool) %>%
      select(-Tool) %>%
      rename(!!tool := Abundance) %>%
      inner_join(df, by = c("Dataset", "Species"))
  }

  test_results <- data.frame(Dataset = character(),
                             Tool = character(),
                             chi_squared_value = double(),
                             df = integer(),
                             crit_value = double(),
                             p_value = double())

  for (ds_name in real_ds)
  {
    print(ds_name)
    this_ds <- df %>%
      filter(Dataset == ds_name)

    # Remaining abundance not covered by the listed species.
    row_to_add <- this_ds %>%
      summarise(across(all_of(c(tools, "Theoretical")), ~ 1 - sum(.x))) %>%
      mutate(Dataset = ds_name, Species = "others")

    # Floor the expected values to avoid dividing by zero. A floor is used
    # instead of `== 0` because `1 - sum(...)` can come out as a tiny
    # non-zero (even negative) value due to floating-point error.
    this_ds <- this_ds %>%
      bind_rows(row_to_add) %>%
      mutate(Theoretical = pmax(Theoretical, 0.00001))

    for (tool in tools)
    {
      observed <- this_ds[[tool]]
      expected <- this_ds$Theoretical

      x2 <- sum((observed - expected)^2 / expected)
      degfree <- nrow(this_ds) - 1
      p <- pchisq(q = x2, df = degfree, lower.tail = FALSE)
      q <- qchisq(p = .05, df = degfree, lower.tail = FALSE)

      test_results <- test_results %>%
        add_row(Dataset = ds_name, Tool = tool, df = degfree,
                chi_squared_value = x2, crit_value = q, p_value = p)
    }
  }

  test_results
}

# Placeholder for writing a chi-squared result table. The parameters name the
# two abundance files and an output file, but none of them is used yet.
create_chi_squared_table <- function(taxonomic_abundance_file, sequence_abundance_file, output_file)
{
  # TODO: not implemented; the body is empty, so the function returns NULL.
  
}

#' Write the stacked bar chart of read utilization per tool.
#'
#' Keeps the rows with `Metric == "Utilization"`, converts `Value` to a
#' percentage and draws one stacked bar per tool (stacked by taxonomic
#' level), with one panel per dataset in `all_ds`.
#'
#' @param df Long-format data frame with the columns `Dataset`, `Tool`,
#'   `Level`, `Metric` and `Value`.
#' @param output_file Path of the PDF to write.
#' @return `NULL`, invisibly. Called for its side effect of writing a PDF.
create_read_utilization_plot <- function(df, output_file)
{
  utilization <- df %>%
    filter(Metric == "Utilization") %>%
    mutate(perc = Value * 100)

  util_plot <-
    ggplot(utilization,
           aes(x = Tool, y = perc,
               fill = factor(Level, levels = all_tax_levels))) +
    geom_bar(stat = "identity") +
    scale_fill_manual(values = pal_npg("nrc")(5)[c(2, 5, 3, 1, 4)]) +
    scale_y_continuous(breaks = seq(0, 100, 20), limits = c(0, 100)) +
    labs(x = "", y = "Percent of reads") +
    theme_minimal() +
    theme(
      legend.title = element_blank(),
      legend.position = "right",
      axis.text.x = element_text(angle = 45, hjust = 1),
      strip.text = element_text(size = 12, face = "bold"),
      plot.title.position = "plot",
      plot.subtitle = element_text(size = 12, face = "bold", hjust = 0.5)
    ) +
    facet_wrap(~factor(Dataset, levels = all_ds), ncol = 2, nrow = 2)

  # ggsave() opens and closes its own graphics device, so no dev.off()
  # follows; an extra dev.off() errors when no other device is open.
  ggsave(filename = output_file,
         plot = util_plot,
         device = cairo_pdf,
         width = 297,
         height = 210,
         units = "mm")

  invisible(NULL)
}



# Command-line interface: the output directory plus either a metrics file or
# the pair of abundance files (checked further below).
option_list <- list(
  make_option(
    c("-o", "--output_dir"),
    type = "character",
    default = NULL,
    help = "directory for saving created figures"
  ),
  make_option(
    c("-m", "--metrics_file"),
    type = "character",
    default = NULL,
    help = "file with tab-separated-values of precision, recall and f-scores"
  ),
  make_option(
    c("-t", "--tax_abundance"),
    type = "character",
    default = NULL,
    help = "file containing taxonomic abundances"
  ),
  make_option(
    c("-s", "--seq_abundance"),
    type = "character",
    default = NULL,
    help = "file containing sequence abundances"
  )
)

opt_parser <- OptionParser(option_list = option_list)
opt <- parse_args(opt_parser)



# The output directory is always required.
if (is.null(opt$output_dir)) {
  print_help(opt_parser)
  stop("Output directory is missing", call.=FALSE)
}

# At least one complete input set must be present: the metrics file, or both
# abundance files together.
metrics_given <- !is.null(opt$metrics_file)
abundances_given <- !is.null(opt$tax_abundance) && !is.null(opt$seq_abundance)

if (!metrics_given && !abundances_given) {
  print_help(opt_parser)
  stop("Either the metrics file or the two abundance files need to be provided", call.=FALSE)
}


# Metric figures: read utilization, precision/recall scatter plots and the
# species-level averages. Runs only when --metrics_file was given.
if (!is.null(opt$metrics_file))
{
  # Read the file given on the command line. A hard-coded developer path used
  # to override the option here.
  input_file <- opt$metrics_file

  ds <- read.csv(input_file, header = TRUE, sep = "\t", quote = "\"",
                 dec = ".", fill = TRUE, comment.char = "")

  ds <- ds %>%
    mutate(
      Dataset = case_when(
        Dataset == "refseq-abfv-sim100" ~ "Sim-100",
        Dataset == "ZymoR103-groundTruth" ~ "ZymoR103",
        Dataset == "ZymoQ20-groundTruth" ~ "ZymoQ20",
        Dataset == "HiFi_D6331-groundTruth" ~ "HiFi_D6331",
        Dataset == "Illumina_D6300-groundTruth" ~ "Illumina_D6300",
        TRUE ~ Dataset
      ),
      # Unrecognised levels keep their own label. The fallback used to be the
      # dataset name, which silently overwrote the level.
      Level = case_when(
        Level == "species" ~ "Species",
        Level == "genus" ~ "Genus",
        Level == "family" ~ "Family",
        Level == "order" ~ "Order",
        Level == "class" ~ "Class",
        TRUE ~ Level
      )
    )

  create_read_utilization_plot(ds, file.path(opt$output_dir, "read_utilization.pdf"))

  real_data <- ds %>%
    filter(Dataset %in% real_ds) %>%
    filter(Level %in% tax_levels) %>%
    filter(Metric %in% metrics)

  # create_real_ds_plot() reads its input file itself and takes two output
  # paths. It used to be called with the data frame and one path only, which
  # fails in read.csv().
  # NOTE(review): the function plots `Precision` and `Recall` columns while
  # the code above expects `Metric`/`Value` columns -- confirm that the
  # metrics file provides both layouts or pass the matching file here.
  create_real_ds_plot(opt$metrics_file,
                      file.path(opt$output_dir, "all_metrics_plot.pdf"),
                      file.path(opt$output_dir, "all_metrics_plot_short_reads.pdf"))

  recall_precision_output_file <- file.path(opt$output_dir, "avg_recall_precision_plot.pdf")
  f_score_output_file <- file.path(opt$output_dir, "avg_f_scores_plot.pdf")

  create_avg_plots(real_data, recall_precision_output_file, f_score_output_file)
}

# Abundance figure: runs only when both abundance files were given.
if (!(is.null(opt$tax_abundance) || is.null(opt$seq_abundance)))
{
  # Use the files given on the command line. Hard-coded developer paths used
  # to overwrite both values right after they were read from the options.
  taxonomic_abundance_file <- opt$tax_abundance
  sequence_abundance_file <- opt$seq_abundance

  create_abundance_plots(taxonomic_abundance_file,
                         sequence_abundance_file,
                         file.path(opt$output_dir, "abundance_plot.pdf"))
}
