
#######################################################
##  Read me      ######################################
## This script is for analysing the phenotype data
## from the DHFR tag experiment of Durand et al 2019.
## It was used for data analysis concerning figure 6.

## The directory in which you have found this file
## contains everything you should need to rerun the
## analysis as it was done in Durand et al 2019.
## The gitter-data directory is empty, but will be
## filled up with the .dat files from the gitter
## batch function within this script.

## The names of the directories with the image files
## are in the following pattern: carbon source (2% glucose) - 
## methotrexate (mtx) or dmso (con) - plating round.
#######################################################

#######################################################
##  Dependencies      #################################

library(gitter) ## colony size image analysis
library(ggplot2) # plotting
library(data.table) ## spreadsheet handling
library(dplyr) ## spreadsheet manipulation
library(magrittr) ## piping
library(stringr) ## string manipulations

## In the plots, exchange my theme with your theme
## of choice
source(paste0('~/Dropbox/myResearch/general-scripts/',
              'plotting/fnThemeOnestop.R')) ## My ggplot theme

#######################################################

#######################################################
##  Variables       ###################################

## Position of the time point (index into the list of time points
## of a run) that is extracted for the single-timepoint analyses
timepoint <- 30

## Colours handed to scale_color_manual() in the scatter plots
color.SpA <- 'green3'
color.SpB <- 'red2'
color.SpC <- 'blue4'

#######################################################

#######################################################
##  Functions      ####################################

fnRunGitterBatch <- function(path.to.images) {
  ## Runs the gitter.batch function from gitter on the images
  ## within the directory that you specify. The output (.dat files
  ## and gridded images) goes to '../gitter-data/<run>' relative to
  ## the image directory, where <run> is the name of the image
  ## directory. The output directory is created if needed.
  ##
  ## Args:
  ##   path.to.images: path to a directory holding the images of
  ##     one plate. May be relative to the current working
  ##     directory.
  ##
  ## Returns:
  ##   Whatever gitter.batch returns.

  ## Resolve the path before changing directory, otherwise a
  ## relative path would no longer point to the images afterwards
  path.to.images <- normalizePath(path.to.images, winslash = '/',
                                  mustWork = TRUE)

  ## gitter is run from within the image directory. Restore the
  ## previous working directory on exit so that relative paths
  ## used by the caller keep working.
  previous.wd <- setwd(path.to.images)
  on.exit(setwd(previous.wd), add = TRUE)

  run <- basename(path.to.images)
  output.dir <- paste0(path.to.images, '/../gitter-data/', run)
  dir.create(output.dir, recursive = TRUE, showWarnings = FALSE)

  ## Since gitter doesn't do very well at locating
  ## the colonies at the early time points, I am using
  ## the fact that you can add a reference image to
  ## help it locate colonies. For runs that have more
  ## than one time point, it will take the last
  ## time point and use that as the reference image.
  ## This requires you to have the image files ordered
  ## from first time point to last time point.
  image.names <- list.files(path.to.images)
  if (length(image.names) == 0) {
    stop('No image files found in ', path.to.images, call. = FALSE)
  }
  reference.image <- tail(image.names, 1)
  path.to.reference.image <- paste(path.to.images, reference.image,
                                   sep = '/')

  gitter.batch(image.files = path.to.images,
               ref.image.file = path.to.reference.image,
               verbose = 'p',
               grid.save = output.dir,
               dat.save = output.dir,
               plate.format = c(32, 48))
}

fnReadGitterData <- function(path.to.gitter.data, layout.plate) {
  ## This function takes the path to the folder with
  ## the gitter data and reads in all the .dat files
  ## within that directory. Each directory should
  ## contain the dat from the images of one plate
  ##
  ## Args:
  ##   path.to.gitter.data: directory holding the .dat files of
  ##     one plate (one file per time point).
  ##   layout.plate: data frame with the plate layout; it is merged
  ##     with every .dat table on the columns they share (meant to
  ##     be row and col).
  ##
  ## Returns:
  ##   A list with one merged data frame per .dat file, in the
  ##   order returned by list.files.
  ##
  ## Stops with an error unless more than one .dat file is found.

  ## The pattern is a regular expression: escape the dot and anchor
  ## it so that only names ending in '.dat' are picked up
  gitter.data.files <- list.files(path = path.to.gitter.data,
                                  pattern = '\\.dat$', full.names = TRUE)

  ## Fail before doing any reading or merging
  if (length(gitter.data.files) < 2) {
    stop('Expected more than one .dat file in ', path.to.gitter.data,
         ', found ', length(gitter.data.files), call. = FALSE)
  }

  gitter.data <- lapply(gitter.data.files, read.table, sep = '\t',
                        col.names = c('row', 'col', 'size',
                                      'circularity', 'flags'))
  ## Put the plate layout in here as well, merge on
  ## col and row
  gitter.data.layout <- lapply(gitter.data,
                               function(x) merge(x, layout.plate)
  )

  return(gitter.data.layout)
}

fnLayoutCleanUp <- function(gitter.data.layout) {
  ## Splits the strain construct label into separate columns:
  ## strain, frame, id and orf (plus pos when the label comes from
  ## the orf column).
  ##
  ## Apply with lapply when you have a list of tables, i.e. several
  ## time points for one plate.
  ##
  ## The construct label is taken from the orf column when there is
  ## one (raw gitter data merged with the layout). Otherwise it is
  ## taken from the id column (summary data after the modelling).

  label.in.orf <- 'orf' %in% names(gitter.data.layout)
  construct <- if (label.in.orf) {
    gitter.data.layout$orf
  } else {
    gitter.data.layout$id
  }

  ## Strain is everything before the first underscore,
  ## border = border
  gitter.data.layout$strain <- gsub('_.*', '', construct)

  ## Frame is the first 'in' or 'out' in the label. Labels without
  ## either (the border) come back as NA and are relabelled.
  construct.frame <- str_extract(string = construct, pattern = 'in|out')
  construct.frame[is.na(construct.frame)] <- 'Border'
  gitter.data.layout$frame <- construct.frame

  if (label.in.orf) {
    ## Keep the full label as id and add the plate position
    gitter.data.layout$id <- construct
    gitter.data.layout$pos <- paste(gitter.data.layout$row,
                                    gitter.data.layout$col, sep = '.')
  }

  ## Just keep the orf number (everything after the last
  ## underscore) in the orf column, border = border
  gitter.data.layout$orf <- gsub('.*_', '', construct)

  return(gitter.data.layout)
}

fnPlotGrowthCurve <- function(strain.id, run) {
  ## Plotting growth curves (colony size against time point) for
  ## one or several strain constructs of one run.
  ##
  ## Args:
  ##   strain.id: character vector of construct ids, matched
  ##     against the id column.
  ##   run: name of the run, used to look up the data in the
  ##     global list phenotype.data.time.series.
  ##
  ## Returns:
  ##   A ggplot object, coloured by plate position. With more than
  ##   one id the plot is facetted by id and the legend is dropped.
  run.data <- phenotype.data.time.series[[run]]
  if (is.null(run.data)) {
    stop('No run named "', run, '" in phenotype.data.time.series',
         call. = FALSE)
  }

  strain.data <- run.data[id %in% strain.id]

  ## A title has to be a single string, so several ids are
  ## collapsed into one
  growth.curve.plot <- ggplot(strain.data, aes(timepoint, size, color = pos)) +
    geom_point() + theme_oneStop() + scale_color_oneStop() +
    labs(title = paste(strain.id, collapse = ', ')) +
    theme(aspect.ratio = 1)

  if (length(strain.id) > 1) {
    growth.curve.plot <- growth.curve.plot + facet_wrap(~id) +
      theme(legend.position = 'none')
  }

  return(growth.curve.plot)
}

fnSingelTimepointSummary <- function(data) {
  ## Summarises colony size per strain construct, i.e. per
  ## combination of id, orf, strain and frame.
  ##
  ## Returns (invisibly) a data.table with one row per construct
  ## and the columns id, orf, strain, frame, median, mean, sd, iqr,
  ## cov (sd / mean) and replicates (number of measurements).

  ## All statistics use the same grouping, so the rows of the
  ## individual aggregate results line up
  fnSizeBy <- function(stat, ...) {
    aggregate(size ~ id + orf + strain + frame, data = data,
              FUN = stat, ...)
  }

  by.median <- fnSizeBy(median, na.rm = TRUE)
  by.mean <- fnSizeBy(mean, na.rm = TRUE)
  by.sd <- fnSizeBy(sd, na.rm = TRUE)
  by.n <- fnSizeBy(length)
  by.iqr <- fnSizeBy(IQR)

  invisible(data.table(id = by.median$id,
                       orf = by.median$orf,
                       strain = by.median$strain,
                       frame = by.median$frame,
                       median = by.median$size,
                       mean = by.mean$size,
                       sd = by.sd$size,
                       iqr = by.iqr$size,
                       cov = by.sd$size / by.mean$size,
                       replicates = by.n$size))
}

fnMergeConAndMtx <- function(data, environment) {
  ## Stacks all tables of a named list whose name matches the
  ## regular expression given in environment, e.g. to put the con
  ## and mtx condition together in one data frame. Use on the
  ## summary data.
  name.matches <- grepl(environment, names(data))

  do.call(rbind, data[name.matches])
}

fnScatterFormat <- function(x) {
  ## Reshapes the concatenated summary into a wide format for
  ## scatter plots: the rows with frame 'in' are joined to the rows
  ## with frame 'out' on orf, strain and run. Shared column names
  ## get the merge suffixes, so .x is in frame and .y is out of
  ## frame.
  by.frame <- split(x, x$frame)

  in.frame <- by.frame[['in']]
  out.of.frame <- by.frame[['out']]

  return(merge(in.frame, out.of.frame, by = c('orf', 'strain', 'run')))
}

fnRemoveOutliers <- function(data) {
  ## Run filtering on the single timepoints data: per id, keep
  ## only the measurements whose size lies strictly within
  ## 1.5 IQRs of the median size of that id.
  ##
  ## Args:
  ##   data: table with at least the columns id and size; it is
  ##     converted with as.data.table.
  ##
  ## Returns:
  ##   A data.table with the retained rows, stacked id by id, plus
  ##   the helper columns iqr, iqr.1.5, iqr.upper.threshold and
  ##   iqr.lower.threshold. Rows with a missing size are dropped.
  ##
  ## I checked and this threshold doesn't
  ## remove anything. I'm not doing this step
  ## since no outliers were found
  data <- as.data.table(data)

  data.split <- split(data, data$id)

  fnIqrThreshold <- function(x) {
    ## Missing sizes are ignored when computing the thresholds;
    ## IQR would otherwise stop with an error
    size.median <- median(x$size, na.rm = TRUE)

    x$iqr <- IQR(x$size, na.rm = TRUE)
    x$iqr.1.5 <- x$iqr * 1.5
    x$iqr.upper.threshold <- size.median + x$iqr.1.5
    x$iqr.lower.threshold <- size.median - x$iqr.1.5

    return(x)
  }

  data.split.threshold <- lapply(data.split, fnIqrThreshold)

  data.threshold <- rbindlist(data.split.threshold)

  ## Remove samples above or below the threshold
  outliers.removed <- data.threshold[size > iqr.lower.threshold &
                                       size < iqr.upper.threshold]

  return(outliers.removed)
}

#######################################################

#######################################################
##  Input       #######################################

## Directory that holds the image file folders, the plate layout
## and the gitter-data directory. Set it once here.
project.dir <- 'full.path.to.directory.with.the.image.file.folders'

setwd(project.dir)

## Directories with the images on which I will run the
## gitter.batch analysis
phenotyping.runs <- list.files(project.dir,
                               pattern = 'glu', full.names = TRUE)

## Run gitter.batch on all the directories
## with images. Creates directories for the
## output.
invisible(lapply(phenotyping.runs, fnRunGitterBatch))

## Make sure we are back in the project directory, in case the
## batch run changed the working directory. Everything below uses
## paths relative to it.
setwd(project.dir)

## Read in the plate layout
plate.layout <- read.table('plate-layout-1536.txt',
  header = TRUE, stringsAsFactors = FALSE)

## Directories with the .dat files for
## the phenotyping runs, lapply over these
## below
phenotyping.runs.dat <- list.files(
  'gitter-data', full.names = TRUE
  )

## Read in the gitter.dat that you created
## using the fnRunGitterBatch above
phenotype.data <- lapply(phenotyping.runs.dat,
                         fnReadGitterData,
                         layout.plate = plate.layout)

#######################################################

#######################################################
##  Input curation  ###################################

## Name the phenotype runs according to the names of
## the directories from which they came
names(phenotype.data) <- gsub('.*/', '', phenotyping.runs.dat)

## Put the information about the strain in a more
## usable manner. Every run is a list with one table per
## time point, hence the nested lapply.
phenotype.data <- lapply(phenotype.data,
                         FUN = function(x) lapply(x, fnLayoutCleanUp))

## Add the respective timepoints as a column. The time point is
## the position of the table within its run. seq_along is used so
## that an empty run gives an empty result instead of the
## indices 1 and 0.
phenotype.data <- lapply(phenotype.data,
                         function(x) mapply(cbind, x,
                                            'timepoint' = seq_along(x),
                                            SIMPLIFY = FALSE))

## One long table per run with all time points stacked
phenotype.data.time.series <- lapply(phenotype.data, rbindlist)

#######################################################

#######################################################
##  Analyses      #####################################

## Extract timepoint 30
single.timepoint <- lapply(phenotype.data, `[[`, timepoint)

## Summarise every run at that time point, drop the border
## colonies and tag each summary with the name of its run.
## I'm going ahead without filtering the
## data for outliers, since there are none.
single.timepoint.summary <- Map(
  function(plate, run.name) {
    plate.summary <- fnSingelTimepointSummary(plate)
    without.border <- plate.summary[frame != 'Border']
    cbind(without.border, run = run.name)
  },
  single.timepoint, names(single.timepoint)
)

####
## Using these curves for the figure
## fnPlotGrowthCurve returns a ggplot object. Only the data stored
## in it ($data, the rows of the two constructs in the given run)
## is reused below; the plots themselves are rebuilt so that the
## points are coloured by frame.
curve.data.mtx <- fnPlotGrowthCurve(strain.id = c('B_out_162702', 'B_in_162702'), run = 'glu-mtx-2')
curve.data.dmso <- fnPlotGrowthCurve(strain.id = c('B_out_162702', 'B_in_162702'), run = 'glu-con-2')

## Growth curve in the mtx run
plot.mtx.curve  <- ggplot(curve.data.mtx$data, aes(timepoint, size, color = frame)) +
    geom_point() + theme_oneStop() + scale_color_oneStop() +
    theme(aspect.ratio = 1)

## Growth curve in the con (dmso) run
plot.dmso.curve <- ggplot(curve.data.dmso$data, aes(timepoint, size, color = frame)) +
  geom_point() + theme_oneStop() + scale_color_oneStop() +
  theme(aspect.ratio = 1)


###

## Stack the per-run summaries into one table
summary.concatenated <- do.call(rbind, single.timepoint.summary)

## Create a wide data frame with the in and out
## in different columns to be able to make scatter
## plots
summary.concatenated.scatter <- fnScatterFormat(summary.concatenated)

## Plot for supplementary figure
## median.x is the in frame median and median.y the out of frame
## median (suffixes from the merge in fnScatterFormat), so out of
## frame goes on the x axis and in frame on the y axis. The axis
## limits are on the log2 scale.
## NOTE(review): the CTRL points are drawn a second time in gray
## on top; presumably the fourth colour in scale_color_manual is
## the one assigned to CTRL - confirm.
## NOTE(review): the title says 60h while the data is the time
## point with index 30; presumably one image every 2h - confirm.
plot.supp.scatter <- ggplot(summary.concatenated.scatter, aes(log2(median.y), log2(median.x), color = strain)) +
  geom_point() +
  geom_point(data = summary.concatenated.scatter[strain == 'CTRL'], color = 'gray', size = 0.5) +
  facet_wrap(~run) +
  theme_oneStop() +
  scale_color_manual(values = c(color.SpA, color.SpB, color.SpC, color.SpC)) +
  geom_abline(slope = 1, intercept = 0, linetype = 'dashed') + theme(aspect.ratio = 1) + 
  labs(title = 'Colony size at 60h', x = 'out of frame', y = 'in frame') +
  scale_x_continuous(limits = c(5, 11.5)) +
  scale_y_continuous(limits = c(5, 11.5))

## Plot for main figure
## Same plot as the supplementary one, restricted to the run
## glu-mtx-2 and therefore without facets
main.scatter <- summary.concatenated.scatter[run == 'glu-mtx-2']
plot.main.scatter <- ggplot(main.scatter,
       aes(log2(median.y), log2(median.x), color = strain)) +
  geom_point() +
  geom_point(data = main.scatter[strain == 'CTRL'], color = 'gray', size = 0.5) +
  theme_oneStop() +
  scale_color_manual(values = c(color.SpA, color.SpB, color.SpC, color.SpC)) +
  geom_abline(slope = 1, intercept = 0, linetype = 'dashed') + theme(aspect.ratio = 1) + 
  labs(title = 'Colony size at 60h', x = 'out of frame', y = 'in frame') +
  scale_x_continuous(limits = c(5, 11.5)) +
  scale_y_continuous(limits = c(5, 11.5))

#######################################################

#######################################################
##  Output curation  ##################################

####
## 181129 | This data is what is accompanying the submission
## of the paper as supplementary table 6. It contains data
## for the second run of both the mtx and dmso condition
## Tag the single timepoint table of every run with the name of
## its run
raw.data.submission <- mapply(cbind, single.timepoint,
                              run = names(single.timepoint),
                              SIMPLIFY = FALSE)

## Stack the second con run on top of the second mtx run
raw.data.submission.181129 <- rbind(
  raw.data.submission$`glu-con-2`,
  raw.data.submission$`glu-mtx-2`
)
###

#######################################################

#######################################################
##  Output      #######################################

## growth curve in mtx for figure 6A
## The plots are printed explicitly: a ggplot object on a line of
## its own is only drawn by auto-printing, which does not happen
## when this script is run with source(), leaving empty files.
svg(filename = 'growth-curve-mtx-B-in-out-ORF_162702.svg', width = 5, height = 5)
print(plot.mtx.curve)
invisible(dev.off())
## growth curve in dmso for figure 6A
svg(filename = 'growth-curve-dmso-B-in-out-ORF_162702.svg', width = 5, height = 5)
print(plot.dmso.curve)
invisible(dev.off())

## supplementary figure scatter plots
svg('~/Dropbox/myResearch/eleonore-de-novo/supp-scatter-timepoint-30-glu-181130.svg',
    width = 7, height = 7)
print(plot.supp.scatter)
invisible(dev.off())

## Figure 6B scatter plot 
svg('~/Dropbox/myResearch/eleonore-de-novo/scatter-timepoint-30-glu-mtx-2-181130.svg',
    width = 5, height = 5)
print(plot.main.scatter)
invisible(dev.off())

## Supplementary table 6
write.table(raw.data.submission.181129, file = 'supplementary-table-6.txt',
            row.names = FALSE)


#######################################################

#######################################################
##  Notes      ########################################



#######################################################

#######################################################
##  FIN     ###########################################
## Leave the analysis directory again.
## NOTE(review): this path is specific to the author's machine -
## adjust or remove it when rerunning elsewhere.
setwd('~/Dropbox/')
#######################################################
