# Sample selection ----
# select.smpl holds the row indices of smpl.info whose Virus column
# equals "ASLV"; every selected row is processed as one sample below.
select.smpl <- which(smpl.info$Virus == "ASLV")

# Settings for the positional dinucleotide frequency table ----
# Positions -13 ... -1 (stored as doubles)
sp.range <- as.numeric(-13:-1)
# Labelling scheme used for the Virus column of the resulting table;
# the code below recognises "whole", "virus", "virus_celltype" and "seqName".
smpl.title <- "seqName"
# Build df.aslv: for every selected sample (row of smpl.info) compute the
# per-position dinucleotide frequency/enrichment table of the integration-site
# sequences against the matching control sequences, then stack the per-sample
# tables row-wise into one data.frame.
df.aslv <- as.data.frame(do.call(rbind, lapply(select.smpl, function(ss) {
  # Read a headerless, tab-separated sequence table and convert it to a
  # "DataFrame" object.
  read.seq.table <- function(path) {
    as(read.table(path, sep = "\t", stringsAsFactors = FALSE, header = FALSE),
       "DataFrame")
  }

  # Sample variables taken from row ss of smpl.info
  virus <- smpl.info$seqName[ss]
  vrs <- smpl.info$Virus[ss]
  ctrl <- smpl.info$Control[ss]
  sln <- smpl.info$seqLen[ss]
  celltype <- smpl.info$Cell[ss]

  # Target site duplication (length) of this virus, looked up by name
  tsd <- unname(retro.tsd[names(retro.tsd) == vrs])
  half.tsd <- floor(tsd / 2)

  # Integration-site (IS) sequences; the first column holds the sequences
  is.seq <- read.seq.table(paste0(data.wd, "IS/", virus, "/", virus, "_is", sln, ".txt"))
  is.nuc.mat <- NucMat(is.seq[, 1])
  colnames(is.nuc.mat) <- RelPosNames(is.nuc.mat)

  # Control sequences belonging to this sample
  ran.seq.name <- paste0(ctrl, "_seq_CAP")
  ran.seq <- read.seq.table(paste0(data.wd, "IS/Ran/", ran.seq.name, ".txt"))
  ran.nuc.mat <- NucMat(ran.seq[, 1])
  colnames(ran.nuc.mat) <- RelPosNames(ran.nuc.mat)

  # Negative relative positions of the IS matrix.
  # Column names are character strings, so they are converted to numbers
  # BEFORE the comparison with 0; comparing the strings directly would use
  # the locale's collation order instead of numeric order.
  # NOTE(review): assumes all column names are numeric strings - confirm
  # against RelPosNames().
  rel.pos <- as.numeric(colnames(is.nuc.mat))
  sp.range <- rel.pos[!is.na(rel.pos) & rel.pos < 0]

  # Dinucleotide frequency and enrichment (fold, KLID) at those positions
  dinuc.freq <- ListToDF(
    DinucAtPosFreq(sp.range,
                   sample.seq.mat = is.nuc.mat,
                   control.seq.mat = ran.nuc.mat),
    method = "dinuc"
  )

  # Virus label, chosen by the labelling scheme in smpl.title.
  # An unrecognised scheme stops with an error instead of silently
  # producing a table without a Virus column.
  dinuc.freq$Virus <- switch(
    smpl.title,
    whole = smpl.info$Name[ss],
    virus = if (celltype == "invitro") paste0(vrs, "iv") else vrs,
    virus_celltype = paste0(vrs, "_", celltype),
    seqName = virus,
    stop("Unknown smpl.title: ", smpl.title, call. = FALSE)
  )

  # Position relative to the cleavage site: shift by half the TSD length and
  # skip 0, so that non-negative shifted positions start at 1.
  # as.character() makes the conversion work for factor and non-factor Pos.
  cs.rel.pos <- as.numeric(as.character(dinuc.freq$Pos)) + half.tsd
  non.neg <- which(cs.rel.pos >= 0)
  cs.rel.pos[non.neg] <- cs.rel.pos[non.neg] + 1
  dinuc.freq$tsdPos <- cs.rel.pos

  dinuc.freq
})))

# Input for the combined bar + point plot of dinucleotide KLID ----
# Positions (column tsdPos) and viruses to show
select.pos <- setdiff(-5:3, 0L)
sel.vir <- unique(df.aslv$Virus)

# Level order used for the Virus factor (order of first appearance)
lvls.virus <- sel.vir

# Keep only the rows of df.aslv at the selected positions and viruses
data.to.plot <- df.aslv[is.element(df.aslv$tsdPos, select.pos) &
                          is.element(df.aslv$Virus, sel.vir), ]

# Turn the grouping columns into factors with a fixed level order
data.to.plot$Virus <- factor(data.to.plot$Virus, levels = lvls.virus)
data.to.plot$tsdPos <- factor(data.to.plot$tsdPos, levels = select.pos)
data.to.plot[c("N1", "N2")] <- lapply(data.to.plot[c("N1", "N2")],
                                      factor, levels = nucs)

# Per virus and per selected position: KLID summed over all rows of
# data.to.plot at that position. Combinations without any row get NA and are
# dropped afterwards.
klid <- as.data.frame(do.call(rbind, lapply(lvls.virus, function(lvls.v) {
  # Summed KLID for each selected position of this virus
  k <- vapply(select.pos, function(p) {
    rows <- which(data.to.plot$tsdPos == p & data.to.plot$Virus == lvls.v)
    # sum() of an empty selection is 0, which would be indistinguishable
    # from a genuine zero sum, so missing combinations are marked explicitly
    if (length(rows) == 0) {
      NA_real_
    } else {
      sum(data.to.plot$KLID[rows])
    }
  }, numeric(1))

  data.frame(tsdPos = select.pos,
             KLID = k,
             Virus = lvls.v)
})))
# Drop position/virus combinations without data
klid <- klid[!is.na(klid$KLID), ]
# Same level order as in data.to.plot
klid$tsdPos <- factor(klid$tsdPos, levels = select.pos)
klid$Virus <- factor(klid$Virus, levels = lvls.virus)

#PLOT
# One facet per virus:
#  - gray bars: summed KLID per position (klid)
#  - points: individual rows of data.to.plot, colour = N2, shape = N1
#  - dashed vertical line between the last negative and the first positive
#    position
p.aslv.a <-
  ggplot(data = data.to.plot) +
  geom_col(data = klid, aes(x = tsdPos, y = KLID, group = Virus),
           fill = "gray90", color = "gray40", width = 0.75, na.rm = TRUE) +
  geom_hline(yintercept = 0) +
  # Discrete x positions are numbered 1, 2, ... in level order, so the
  # separator sits half a step after the last negative position. Counting the
  # negative positions also works when they do not run contiguously to -1.
  # NOTE(review): assumes every level of tsdPos is drawn on the axis and that
  # select.pos lists the negative positions first - confirm.
  geom_vline(xintercept = sum(select.pos < 0) + 0.5, color = "black", linetype = 2) +
  scale_shape_manual(values = 21:25) +
  scale_colour_manual(values = nuc.cols) +
  geom_quasirandom(aes(x = tsdPos, y = KLID, group = Virus, color = N2, shape = N1),
                   size = 2, stroke = 1.5, fill = alpha("gray", .5)) +
  # One label per facet: a one-row-per-virus data frame is used so the text is
  # drawn once instead of once for every row of data.to.plot (overplotting).
  geom_text(data = unique(data.to.plot["Virus"]), aes(label = Virus),
            x = 2.5, y = 90, fontface = "bold", size = 3) +
  xlab("STR site-relative position") +
  coord_cartesian(ylim = c(0, 120)) +
  #scale_x_discrete(labels = select.pos) +
  guides(shape = guide_legend(order = 1),
         color = guide_legend(order = 2)) +
  theme_classic() +
  theme(plot.title = element_text(hjust = 0, colour = "black", face = "bold"),
        axis.line.x = element_blank(),
        axis.title.x = element_text(colour = "black", face = "bold", size = 8, angle = 0),
        axis.title.y = element_text(colour = "black", face = "bold", size = 8),
        axis.text.x = element_text(colour = "black", face = "bold", size = 8, angle = 0),
        axis.text.y = element_text(colour = "black", face = "bold", size = 8, angle = 0),
        legend.position = "top", legend.box = "vertical",
        legend.title = element_text(colour = "black", size = 9, face = "bold"),
        legend.text = element_text(colour = "black", size = 8, face = "bold"),
        legend.spacing = unit(0, units = "points"),
        # Facet strips are hidden; the in-panel text label names the virus
        strip.background = element_blank(),
        strip.text = element_blank(),
        rect = element_rect(fill = "transparent"),
        panel.background = element_rect(fill = "transparent"),
        plot.background = element_rect(fill = "white", color = "white")
  ) +
  facet_wrap(~Virus, ncol = 2, scales = "fixed", strip.position = "right")

