library('ggplot2')
library('tidyr')
# Resolve relative paths against the experiment directory, both for knitr
# chunks (root.dir) and for the current session (setwd). The read below uses a
# relative file name, so it depends on this working directory.
knitr::opts_knit$set(root.dir = '~/git/dashing2-experiments/jirange')
setwd('~/git/dashing2-experiments/jirange')
# Tab-separated table with a header row. comment.char='' disables comment
# handling so '#' characters in the file are read as data.
df <- read.table('table.5k.10.out', sep='\t', header=TRUE, comment.char='')
# Reshape to long form: one row per (input row, estimator), with the estimator
# column name in `type` and its Jaccard estimate in `jest`.
df <- df %>% pivot_longer(cols=c(Mash, Dashing1,
                                 SS8Bytes, SS2Bytes, SS1Byte, SSNibble,
                                 MH8Bytes, MH4Bytes, MH2Bytes, MH1Byte, MHNibble),
                          names_to = "type", values_to = "jest")
# Start from the raw sketch size; it is scaled below by a per-estimator factor
# (presumably the register width in bits -- confirm against the sketch tools).
df$totbits <- df$Sketchsize
# Signed estimation error, and the true Jaccard rounded to one decimal place
# (used as the facet variable in the plots).
df$diff <- df$jest - df$TrueJI
df$jidec <- round(df$TrueJI, digits=1)
# Row masks grouping estimators by scale factor.
r64 <- df$type %in% c('SS8Bytes', 'MH8Bytes', 'Mash')
r32 <- df$type %in% c('MH4Bytes')
r16 <- df$type %in% c('SS2Bytes', 'MH2Bytes')
r8  <- df$type %in% c('SS1Byte', 'MH1Byte', 'Dashing1')
# MHNibble was previously in no mask, leaving its totbits equal to the raw
# Sketchsize; it is scaled by 4 like SSNibble so both nibble variants are
# compared at the same total size.
r4  <- df$type %in% c('SSNibble', 'MHNibble')
df$totbits[r64] <- df$totbits[r64] * 64
df$totbits[r32] <- df$totbits[r32] * 32
df$totbits[r16] <- df$totbits[r16] * 16
df$totbits[r8] <- df$totbits[r8] * 8
df$totbits[r4] <- df$totbits[r4] * 4
# Keep only rows whose total size lies in [16384, 65536] bits; these are the
# sizes plotted below.
dffilt <- df[df$totbits >= 16384 & df$totbits <= 65536,]
#' Plot signed estimation error (estimate minus true Jaccard) per estimator.
#'
#' Reads the global data frame `dffilt`, keeps the rows whose `totbits` equals
#' `bits`, and draws one panel per rounded true Jaccard value (`jidec`).
#' Estimators are distinguished by colour only; x-axis text, ticks and title
#' are suppressed.
#'
#' @param bits Total sketch size in bits to select from `dffilt`.
#' @param box If TRUE draw box plots; otherwise draw jitter-dodged,
#'   semi-transparent points.
#' @return A ggplot object.
error_plot <- function(bits=16384, box=FALSE) {
  ggp <- ggplot(dffilt %>% dplyr::filter(totbits == bits), aes(x=type, y=diff, color=type))
  if (box) {
    ggp <- ggp + geom_boxplot()
  } else {
    ggp <- ggp + geom_point(position=position_jitterdodge(), alpha=0.3)
  }
  ggp +
      theme_bw() +
      facet_wrap(~jidec) +
      labs(title=paste('Sketch total of', as.character(bits), 'bits'),
           y='Est - True JI') +
      theme(axis.title.x=element_blank(),
            axis.text.x=element_blank(),
            axis.ticks.x=element_blank())
}
# Error plots for each total sketch size: points first, then box plots.
error_plot(bits=16384, box=FALSE)

error_plot(bits=16384, box=TRUE)

error_plot(bits=32768, box=FALSE)

error_plot(bits=32768, box=TRUE)

error_plot(bits=65536, box=FALSE)

error_plot(bits=65536, box=TRUE)

# Sum of squared estimation error per (k, total bits, rounded true JI,
# estimator). `.groups = "drop"` returns an ungrouped result and silences
# dplyr's "summarise() has grouped output" message.
dfsse <- dffilt %>%
  dplyr::group_by(k, totbits, jidec, type) %>%
  dplyr::summarise(sse = sum((jest - TrueJI)^2), .groups = "drop")
#' Plot the sum of squared estimation error per estimator.
#'
#' Reads the global summary table `dfsse` and draws one panel per rounded true
#' Jaccard value (`jidec`). Estimators are distinguished by colour only;
#' x-axis text, ticks and title are suppressed.
#'
#' @param bits Total sketch size in bits to select from `dfsse`; ignored when
#'   `no_filt` is TRUE.
#' @param max_y Upper limit of the y axis. Because the limit is set with
#'   `ylim()`, points above it are dropped from the plot with a warning.
#' @param no_filt If TRUE, plot every row of `dfsse` instead of one size.
#' @return A ggplot object.
sse_plot <- function(bits=16384, max_y=0.1, no_filt=FALSE) {
  if (no_filt) {
    dftmp <- dfsse
    # No single size applies here, so the title must not claim one.
    plot_title <- 'All sketch sizes'
  } else {
    dftmp <- dfsse %>% dplyr::filter(totbits == bits)
    plot_title <- paste('Sketch total of', as.character(bits/8), 'bytes')
  }
  ggplot(dftmp,
        aes(x=type, y=sse, color=type)) + facet_wrap(~jidec) + geom_point() +
        theme_bw() +
        # The plotted value is the summed squared error, not the signed error.
        labs(title=plot_title,
             y='Sum of squared error (Est - True JI)') +
        theme(axis.title.x=element_blank(),
              axis.text.x=element_blank(),
              axis.ticks.x=element_blank()) +
        ylim(c(0, max_y))
}
# SSE plots per total sketch size, then all sizes together. The warnings
# recorded below come from points lying above the chosen y limit.
sse_plot(bits=16384)
## Warning: Removed 29 rows containing missing values (geom_point).

sse_plot(bits=32768, max_y=0.075)
## Warning: Removed 30 rows containing missing values (geom_point).

sse_plot(bits=65536, max_y=0.05)
## Warning: Removed 28 rows containing missing values (geom_point).

sse_plot(no_filt=TRUE, max_y=0.03)
## Warning: Removed 120 rows containing missing values (geom_point).