## Evan Ernst
## CSHL 2019
## eernst@cshl.edu
##

#f (!requireNamespace("BiocManager"))
# install.packages("BiocManager")

library("edgeR")
library("tidyverse")
library("statmod")
library("ggthemes")
library("ggrepel")
library("extrafont")
library("ggplot2")

font_import(pattern = "Arial")
loadfonts()

load_featurecounts <- function(filename) {
  fcDfRaw <- 
  fcDfRaw
}

# This table is derived from the TAIR10 transposon annotation and has the format:
# 
# TE_ID	TE_family	TE_group
# AT1TE52125	ATHILA2	LTR/Gypsy
# AT1TE42735	ATCOPIA68	LTR/Copia
# AT1TE36140	ATCOPIA27	LTR/Copia
# AT1TE21850	ATREP11	RC/Helitron
# AT1TE95105	ATREP3	RC/Helitron
# ...
# 
teAnnot <- read.delim("TE_ID_description.txt")
# Sort the rows of the TE annotation file
teAnnot <- teAnnot %>% arrange(TE_ID)


### EdgeR DE analysis
# 
# Input is the count table produced by running standalone featureCounts in the following way:
# Program:featureCounts v1.6.3; Command:"featureCounts" "-T" "16" "--tmpDir" "/tmp/uge/50617705.1.primary.q" "--largestOverlap" "-p" "-B" "-t" "exon" "-g" "ID" "-J" "-G" "TAIR10_genome.fa" "-o" "vlp-seq.deduped.te_only.counts" "-a" "TE_only.gff" "col0-1.primary.deduped.bam" "col0-2.primary.deduped.bam" "col0-3.primary.deduped.bam" "ddm1-1.primary.deduped.bam" "ddm1-2.primary.deduped.bam" "ddm1-3.primary.deduped.bam" "ddm1rdr6-1.primary.deduped.bam" "ddm1rdr6-2.primary.deduped.bam" "ddm1rdr6-3.primary.deduped.bam"
# 
# It has the format:
# 
# Geneid	Chr	Start	End	Strand	Length	col0-1.primary.deduped.bam	col0-2.primary.deduped.bam	col0-3.primary.deduped.bam	ddm1-1.primary.deduped.bam	ddm1-2.primary.deduped.bam	ddm1-3.primary.deduped.bam	ddm1rdr6-1.primary.deduped.bam	ddm1rdr6-2.primary.deduped.bam	ddm1rdr6-3.primary.deduped.bam
# AT1TE00010	Chr1	11897	11976	+	80	0	0	0	0	1	2	0	0	0
# AT1TE00020	Chr1	16883	17009	-	127	1	0	0	0	2	0	1	0	0
# AT1TE00025	Chr1	17024	18924	+	1901	4	0	1	3	1	0	1	0	2
# ...
# 
countDf <- read.delim("vlp-seq.deduped.te_only.counts" ,header = TRUE, skip = 1)
newSampleNames <- gsub(".primary.deduped.bam", "", colnames(countDf), perl = TRUE)
colnames(countDf) <- as.factor(newSampleNames)
# sort the counts by TE ID
countDf <- countDf %>% arrange(Geneid)
countDf <- select(countDf, -Geneid, -Chr, -Start, -End, -Strand, -Length)
# Remove outlier ddm1.2 with very high background
countDf <- select(countDf, -matches("ddm1.2"))

group = factor(c(1,1,1,2,2,3,3,3))
levels(group) = c("WT","ddm1","ddm1rdr6")

dgeList <- DGEList(counts=countDf, group = group, genes = teAnnot)

rownames(dgeList$counts) <- rownames(dgeList$genes) <- dgeList$genes$TE_ID

# Non-specific low-expression gene filter
keep <- rowSums(cpm(dgeList)>50) >= 3
dgeList <- dgeList[keep, , keep.lib.sizes=FALSE]

dgeList <- calcNormFactors(dgeList)
design <- model.matrix(~group)
dgeList <- estimateDisp(dgeList, design = design, robust = TRUE)
dgeList$common.dispersion

# Diagnostic plots
points <- c(15,16,17)
colors <- c("#666666", "#FFCC00","#FF3300")
plotMDS(dgeList, col=colors[group], pch=points[group])
legend("topright", legend=levels(group), pch=points, col=colors, ncol=2)

plotMD(cpm(dgeList, log=TRUE), column = 4)
abline(h=0, col="red", lty=2, lwd=2)

plotBCV(dgeList)

# Quasi-likelihood F-tests
fit <- glmQLFit(dgeList, design)
plotQLDisp(fit)

# Log fold change cutoff
lfc <- 2
# False discovery rate nominal control
fdr <- 0.05

# DDM1 - WT
qltWTvDDM1 <- glmQLFTest(fit, coef=2)
topTags(qltWTvDDM1, n = 15)
#summary(deWTvDDM1 <- decideTestsDGE(qltWTvDDM1, p.value = fdr, lfc = lfc))
summary(deWTvDDM1 <- decideTestsDGE(qltWTvDDM1, p.value = fdr, lfc = lfc))
deTagsWTvDDM1 <- rownames(dgeList)[as.logical(deWTvDDM1)]
plotSmear(qltWTvDDM1, de.tags=deTagsWTvDDM1,cex=0.3, main="ddm1 vs. WT")
abline(h=c(-lfc, lfc), col="blue")

# DDM1RDR6 - WT
qltWTvDDM1RDR6 <- glmQLFTest(fit, coef=3)
topTags(qltWTvDDM1RDR6, n = 15)
summary(deWTvDDM1RDR6 <- decideTestsDGE(qltWTvDDM1RDR6, p.value = fdr, lfc = lfc))
deTagsWTvDDM1RDR6 <- rownames(dgeList)[as.logical(deWTvDDM1RDR6)]
plotSmear(qltWTvDDM1RDR6, de.tags=deTagsWTvDDM1RDR6,cex=0.3, main="ddm1rdr6 vs. WT")
abline(h=c(-lfc, lfc), col="blue")

#DDM1RDR6 - DDM1
qltDDM1vDDM1RDR6 <- glmQLFTest(fit, contrast = c(0,-1,1))
topTags(qltDDM1vDDM1RDR6, n = 15)
summary(deDDM1vDDM1RDR6 <- decideTestsDGE(qltDDM1vDDM1RDR6, p.value = fdr, lfc = lfc))
deTagsDDM1vDDM1RDR6 <- rownames(dgeList)[as.logical(deDDM1vDDM1RDR6)]
plotSmear(qltDDM1vDDM1RDR6, de.tags=deTagsDDM1vDDM1RDR6,cex = 0.3, main="ddm1rdr6 vs. ddm1")
abline(h=c(-lfc, lfc), col="blue")

deTblWTvDDM1up <- qltWTvDDM1$table[as.logical(deWTvDDM1 > 0),] %>% 
  tibble::rownames_to_column(var = "TE_ID") %>% left_join(teAnnot)
deTblWTvDDM1down <- qltWTvDDM1$table[as.logical(deWTvDDM1 < 0),] %>% 
  tibble::rownames_to_column(var = "TE_ID") %>% left_join(teAnnot)
deTblWTvDDM1RDR6up <- qltWTvDDM1RDR6$table[as.logical(deWTvDDM1RDR6 > 0),] %>% 
  tibble::rownames_to_column(var = "TE_ID") %>% left_join(teAnnot)
deTblWTvDDM1RDR6down <- qltWTvDDM1RDR6$table[as.logical(deWTvDDM1RDR6 < 0),] %>% 
  tibble::rownames_to_column(var = "TE_ID") %>% left_join(teAnnot)
deTblDDM1vDDM1RDR6up <- qltDDM1vDDM1RDR6$table[as.logical(deDDM1vDDM1RDR6 > 0),] %>% 
  tibble::rownames_to_column(var = "TE_ID") %>% left_join(teAnnot)
deTblDDM1vDDM1RDR6down <- qltDDM1vDDM1RDR6$table[as.logical(deDDM1vDDM1RDR6 < 0),] %>% 
  tibble::rownames_to_column(var = "TE_ID") %>% left_join(teAnnot)

if (! dir.exists("./results")) {
  dir.create("./results")
}

write.table(deTblWTvDDM1up, "results/deTableWTvDDM1up.txt",quote = FALSE,row.names = FALSE, col.names = TRUE, sep = '\t')
write.table(deTblWTvDDM1down, "results/deTableWTvDDM1down.txt",quote = FALSE,row.names = FALSE, col.names = TRUE, sep = '\t')
png(filename="results/qltWTvDDM1.png",width = 1200, height = 1200,res = 300)
plotSmear(qltWTvDDM1, de.tags=deTagsWTvDDM1,cex=0.3, main="ddm1 vs. WT")
abline(h=c(-2, 2), col="blue")
dev.off()

write.table(deTblWTvDDM1RDR6up, "results/deTableWTvDDM1RDR6up.txt",quote = FALSE,row.names = FALSE, col.names = TRUE, sep = '\t')
write.table(deTblWTvDDM1RDR6down, "results/deTableWTvDDM1RDR6down.txt",quote = FALSE,row.names = FALSE, col.names = TRUE, sep = '\t')
png(filename="results/qltWTvDDM1RDR6.png",width = 1200, height =1200,res = 300)
plotSmear(qltWTvDDM1RDR6, de.tags=deTagsWTvDDM1RDR6,cex=0.3, main="ddm1rdr6 vs. WT")
abline(h=c(-2, 2), col="blue")
dev.off()

write.table(deTblDDM1vDDM1RDR6up, "results/deTableDDM1vDDM1RDR6up.txt",quote = FALSE,row.names = FALSE, col.names = TRUE, sep = '\t')
write.table(deTblDDM1vDDM1RDR6down, "results/deTableDDM1vDDM1RDR6down.txt",quote = FALSE,row.names = FALSE, col.names = TRUE, sep = '\t')
png(filename="results/qltDDM1vDDM1RDR6.png",width = 1200, height =1200,res = 300)
plotSmear(qltDDM1vDDM1RDR6, de.tags=deTagsDDM1vDDM1RDR6,cex = 0.3, main="ddm1rdr6 vs. ddm1")
abline(h=c(-2, 2), col="blue")
dev.off()






### 
### Figures
### 

themePublication <- function(base_size=24, base_family="") {
  library(ggthemes)
  (theme_foundation(base_size=base_size, base_family=base_family) +
      theme(
        text = element_text(),
        plot.title = element_text(face = "plain", size = rel(1.2), hjust = 0.5),
        panel.background = element_rect(colour = NA),
        plot.background = element_rect(colour = NA),
        panel.border = element_rect(colour = NA),
        axis.title = element_text(face = "plain", size = rel(1)),
        axis.title.y = element_text(angle=90,vjust =2),
        axis.title.x = element_text(vjust = -0.2),
        axis.text = element_text(face = "plain"),
        axis.text.x=element_text(angle=45,hjust=1),
        axis.line = element_line(colour="black"),
        axis.ticks = element_line(),
        panel.grid.major = element_line(colour="#f0f0f0"),
        panel.grid.minor = element_blank(),
        legend.key = element_rect(colour = NA),
        legend.text = element_text(face = "plain"),
        legend.position = "bottom",
        legend.direction = "horizontal",
        legend.key.size= unit(20, "points"),
        legend.margin = margin(0,0,0,0, unit = "pt"),
        legend.title = element_blank(),
        plot.margin=unit(c(10,5,5,5),"points"),
        strip.background=element_rect(colour="#f0f0f0",fill="#f0f0f0"),
        strip.text = element_text(face="plain")
      )
  )
}


deDecorate <- function(dgeList, deList, comparison) {
  dgeList <- topTags(dgeList, n=nrow(dgeList$table))$table %>% arrange(TE_ID)
  deList <- as.data.frame(deList) %>% rename("DE" = 1) %>% rownames_to_column("TE_ID") %>% arrange(TE_ID)
  deList$TE_ID <- factor(deList$TE_ID)
  res <- dgeList %>% bind_cols(deList) %>% mutate("comparison" = comparison)
  res <- res %>% mutate(
    category = case_when(
      TE_group == "LTR/Copia" & DE != 0 ~ "COPIA",
      str_detect(TE_family, "ATHILA") & DE != 0 ~ "ATHILA",
      DE != 0 ~ "Other",
      TRUE ~ "Non-significant"
    )
  )
  res$category <- factor(res$category, levels = c("COPIA", "ATHILA", "Other", "Non-significant"))
  res
}
decoratedWTvDDM1 <- deDecorate(qltWTvDDM1, deWTvDDM1, "ddm1 vs WT")
decoratedWTvDDM1RDR6 <- deDecorate(qltWTvDDM1RDR6, deWTvDDM1RDR6, "ddm1rdr6 vs WT")
decoratedDDM1vDDM1RDR6 <- deDecorate(qltDDM1vDDM1RDR6, deDDM1vDDM1RDR6, "ddm1rdr6 vs ddm1")
decoratedDf <- bind_rows(decoratedWTvDDM1, decoratedWTvDDM1RDR6, decoratedDDM1vDDM1RDR6)
decoratedDf$comparison <- factor(decoratedDf$comparison, levels = c("ddm1 vs WT", "ddm1rdr6 vs WT", "ddm1rdr6 vs ddm1"))

teCallouts = c(
  "AT5TE20395", # EVADE
  "AT3TE76225", # SISYPHUS
  "AT1TE36035", # ATCOPIA51
  "AT4TE25200", # ATCOPIA51
  "AT5TE65370", # ATCOPIA21
  "AT1TE45315"  # ATGP3
)

# Special labels for some elements
decoratedDf <- decoratedDf %>% mutate(
  elementLabel = ifelse(
    TE_ID == "AT5TE20395",
    "EVADE",
    ifelse(
      TE_ID == "AT3TE76225",
      "SISYPHUS",
      ifelse(
        DE != 0 & TE_ID %in% teCallouts,
        as.character(TE_family), 
        ""
      )
    )
  )
)



### Volcano Plots
volcano <- ggplot(decoratedDf %>% arrange(logFC)) +
  facet_grid(. ~ comparison) +
  geom_point(data = decoratedDf,
             aes(x=logFC, y=-log10(FDR), colour=category, shape = category),
             size = 2) +
  geom_text_repel(
    aes(
      label = elementLabel,
      color = category,
      x=logFC, y=-log10(FDR)),
    box.padding = 1.5,
    force = 3,
    size = 3,
    fontface = "italic",
    family = "Arial"
  ) +
  xlab(c("Log2 fold change")) + 
  ylab("-Log10 adjusted p-value") +
  scale_x_continuous(limits = c(-10,10)) +
  scale_color_manual(values = c(tableau_color_pal("Tableau 10")(3), "#999999")) +
  themePublication(base_size=18, base_family = "Arial") +
  theme(legend.position = "right",
        legend.direction = "vertical",
        axis.title.y = element_text(angle=90,vjust =2, size = rel(0.75)),
        axis.title.x = element_text(vjust = 0, size = rel(0.75)),
        axis.text.x=element_text(angle=0,hjust=0.5, vjust = -0.5),
        axis.text.y=element_text(hjust=0.5),
        panel.grid.major = element_line(colour="#f0f0f000"),
        panel.spacing = unit(2, "lines"),
        plot.title = element_text(size = rel(1.25), hjust = 0.5),
        strip.background=element_rect(colour="#ffffff",fill="#ffffff"),
        strip.text = element_text(face="plain", size = rel(1.25)))
volcano
ggsave("results/volcano-plots.pdf", width = 12, height = 4, useDingbats=FALSE)
ggsave("results/volcano-plots.png", width = 12, height = 4)



### TE Family Heatmap
heatmapDf <- decoratedDf %>% 
  filter(comparison != "ddm1rdr6 vs ddm1" & TE_family != "Unassigned") %>% 
  droplevels() %>%
  group_by(TE_family, comparison) %>%
  add_tally(name = "totalElements") %>%
  mutate(deElementsInFam = sum(abs(DE))) %>%
  ungroup() %>%
  group_by(TE_ID) %>% 
  mutate(deAnyComparison = sum(abs(DE)) > 0) %>% 
  ungroup()

decorateTEFam <- function(x, df) {
  famDf <- df %>% filter(TE_family == x) %>% droplevels()
  numDeDdm1 <- famDf %>% filter(comparison == "ddm1 vs WT")
  numDeDdm1Rdr6 <- famDf %>% filter(comparison == "ddm1rdr6 vs WT")
  res <- sprintf("%s \n%2.0f%% | %2.0f%%", x, 
                 100 * numDeDdm1$deElementsInFam[1] / famDf$totalElements[1], 
                 100 * numDeDdm1Rdr6$deElementsInFam[1] / famDf$totalElements[1]
  )
}
levels(heatmapDf$TE_family) <- sapply(levels(heatmapDf$TE_family), decorateTEFam, df = heatmapDf)
ggplot(
  heatmapDf %>% filter(deAnyComparison == TRUE) %>% mutate(logFC = if_else(DE == 0, 0, logFC)) %>% droplevels(),
  aes(x = comparison, y = reorder(TE_ID,logFC), fill = logFC)) + 
  facet_wrap(~TE_family, scales = "free_y", ncol = 10, strip.position = "top" ) +
  geom_tile(colour = "grey") + 
  xlab("DE in ddm1 vs WT | DE in ddm1rdr6 vs WT") +
  theme_tufte(base_family = "Helvetica") +
  theme(
    axis.text = element_blank(),
    axis.ticks = element_blank(),
    axis.title.y = element_blank(),
    axis.text.x = element_blank()
  ) +
  scale_fill_gradient2(limits = c(-10,10), low = "#1170aa", mid = "white", high = "#fc7d0b")
ggsave("results/de-heatmap.pdf", width = 13, height = 6)
ggsave("results/de-heatmap.png", width = 13, height = 6)




