library(data.table)
library(tidyverse)

# Read the per-feature tables of random vs observed minMFEs.
# Every path is passed through `file =` and carries no trailing whitespace:
# fread() interprets an `input` string that contains a space and does not name
# an existing file as a shell command, so a stray trailing blank in a file name
# makes it try to execute the CSV instead of reading it.
LINE_element <- fread(file = "LINE_element_minMFE_deltas.csv")
LTR_retrotransposon <- fread(file = "LTR_retrotransposon_minMFE_deltas.csv")
SINE_element <- fread(file = "SINE_element_minMFE_deltas.csv")
gene <- fread(file = "gene_minMFE_deltas.csv")
helitron <- fread(file = "helitron_minMFE_deltas.csv")
longesttranscripts <- fread(file = "longesttranscripts_minMFE_deltas.csv")
miRNA <- fread(file = "miRNA_minMFE_deltas.csv")
solo_LTR <- fread(file = "solo_LTR_minMFE_deltas.csv")
terminal_inverted_repeat_element <- fread(
  file = "terminal_inverted_repeat_element_minMFE_deltas.csv"
)

# Tag each table with its feature class so rows stay identifiable once the
# tables are stacked.
LINE_element[, element := "LINE_element"]
LTR_retrotransposon[, element := "LTR_retrotransposon"]
SINE_element[, element := "SINE_element"]
gene[, element := "gene"]
helitron[, element := "helitron"]
longesttranscripts[, element := "longesttranscripts"]
miRNA[, element := "miRNA"]
solo_LTR[, element := "solo_LTR"]
terminal_inverted_repeat_element[
  ,
  element := "terminal_inverted_repeat_element"
]

# Stack the tables; solo_LTR is tagged above but left out of the figures.
all <- rbind(
  LINE_element, LTR_retrotransposon, SINE_element,
  gene, helitron, longesttranscripts,
  miRNA,
  terminal_inverted_repeat_element
)


# Long format: the `minMFE` and `randMean` columns become name/value pairs so
# both can be drawn side by side per feature.
all_longer <- all %>%
  pivot_longer(cols = c(minMFE, randMean))

# Display order of the feature classes (solo_LTR is intentionally absent).
feature_order <- c(
  "miRNA",
  "gene",
  "longesttranscripts",
  "LINE_element",
  "SINE_element",
  "LTR_retrotransposon",
  "helitron",
  "terminal_inverted_repeat_element"
)

# Apply the same ordering to the long and the wide table.
all_longer$element <- factor(all_longer$element, levels = feature_order)
all$element <- factor(all$element, levels = feature_order)

# Violins of the two MFE columns (fill = `name`) for every feature class, with
# a dotted reference line at -40 kcal/mol.
colPal <- c("darkcyan", "darkgoldenrod1")
ggplot(all_longer, aes(element, value, fill = name)) +
  geom_violin() +
  geom_hline(yintercept = -40, linetype = "dotted") +
  scale_fill_manual(values = colPal) +
  labs(x = "Feature", y = "MFE (kcal/mol)") +
  theme_classic() +
  theme(axis.text.x = element_text(angle = 45, hjust = 1))
# Write the plot that was just built to disk.
ggsave(
  filename = "minMFE_vs_random_allFeatures_violin.png",
  plot = last_plot(),
  device = "png",
  width = 6,
  height = 5
)

# Density of minMFE per feature class with a vertical line at -40
# (displayed only, not saved).
ggplot(all, aes(minMFE, colour = element)) +
  geom_density() +
  geom_vline(xintercept = -40) +
  theme_classic()

# Delta expressed as a percentage of minMFE.
all$deltaPerc <- with(all, (delta / minMFE) * 100)

# Violin of the percent difference per feature class. The colour scale is kept
# from the original although no colour aesthetic is mapped in this plot.
ggplot(all, aes(element, deltaPerc)) +
  geom_violin() +
  scale_color_brewer(type = "qual", palette = "Paired") +
  labs(
    x = "Feature",
    y = "Percent difference (minMFE - random minMFE)"
  ) +
  theme_classic() +
  theme(axis.text.x = element_text(angle = 45, hjust = 1))

ggsave(
  filename = "minMFE_vs_random_allFeatures_violin_percDelta_August.png",
  plot = last_plot(),
  device = "png",
  width = 6,
  height = 5
)

# TE file from Jiao et al., 2017, used in RNAfold analyses
TE <- fread("Jiao2017.TE.filtered.gff3")
colnames(TE) <- c(
  "chr", "source", "type", "min", "max", "q1", "strand", "q2", "attributes"
)

# Assign three letter Wicker 2007 code (DHH, etc), taken from characters 4-6
# of the attributes field.
TE$family <- substr(TE$attributes, 4, 6)
# Join key of the form "chr:min-max".
TE$header <- paste0(TE$chr, ":", TE$min, "-", TE$max)

# Only the coordinates, the family code and the join key are needed.
TE_less <- TE[, c("min", "max", "family", "header")]

# Left join: rows of `all` without a TE annotation keep family = NA.
all <- merge(all, TE_less, by = "header", all.x = TRUE)

# Non-TE classes get a fixed label; solo LTRs (if present) get a "solo_"
# prefix; everything else keeps the family code from the annotation.
all$family <- case_when(
  all$element == "miRNA" ~ "miRNA",
  all$element == "longesttranscripts" ~ "mRNA",
  all$element == "gene" ~ "gene",
  all$element == "solo_LTR" ~ paste0("solo_", all$family),
  TRUE ~ all$family
)

# Collapse genes and their longest transcripts into one "gene" class.
# %in% is used instead of `==` so that rows whose family is NA (no annotation
# match) keep their element label rather than having it turned into NA.
all$element <- ifelse(
  all$family %in% c("gene", "mRNA"),
  "gene",
  as.character(all$element)
)


# Fix the plotting order of the families; any family value not listed here
# becomes NA in the factor.
family_order <- c(
  "gene", "mRNA", "miRNA",
  "DHH", "DTA", "DTC", "DTH", "DTM", "DTT",
  "RIL", "RIT",
  "RST",
  "RLC", "RLG", "RLX"
)
all$family <- factor(all$family, levels = family_order)

# Percent difference per family, filled by feature class, with a dotted zero
# line and the per-family mean drawn as a black point.
ggplot(all, aes(family, deltaPerc, fill = element)) +
  geom_violin() +
  geom_hline(yintercept = 0, linetype = "dotted") +
  stat_summary(fun = "mean", geom = "point", color = "black") +
  scale_fill_brewer(type = "qual", palette = "Paired") +
  labs(
    x = "Feature",
    y = "Percent difference (minMFE - random minMFE)"
  ) +
  theme_classic() +
  theme(axis.text.x = element_text(angle = 45, hjust = 1))

ggsave(
  filename = "minMFE_vs_random_allFeatures_violin_percDelta_families_Sep.png",
  plot = last_plot(),
  device = "png",
  width = 8,
  height = 4
)


# minMFE per family, filled by feature class; the -40 line is added first so
# the violins are drawn on top of it.
ggplot(all, aes(family, minMFE, fill = element)) +
  geom_hline(yintercept = -40) +
  geom_violin() +
  scale_fill_brewer(type = "qual", palette = "Paired") +
  labs(x = "Feature", y = "minMFE (kcal/mol)") +
  theme_classic() +
  theme(axis.text.x = element_text(angle = 45, hjust = 1))
ggsave(
  filename = "minMFE_vs_random_allFeatures_violin_minMFE_families_August.png",
  plot = last_plot(),
  device = "png",
  width = 8,
  height = 4
)


# Recover start/end from the header: take the second ":"-separated field and
# split it on "-" into min and max.
coord_field <- str_split(all$header, pattern = ":", simplify = TRUE)[, 2]
coord_pair <- str_split(coord_field, pattern = "-", simplify = TRUE)
all$min <- as.numeric(coord_pair[, 1])
all$max <- as.numeric(coord_pair[, 2])

# miRNA headers carry the range in the fourth ":"-separated field instead, so
# those rows are overwritten from that field.
mirna_rows <- which(all$element == "miRNA")
mirna_field <- str_split(
  all$header[mirna_rows],
  pattern = ":",
  simplify = TRUE
)[, 4]
mirna_pair <- str_split(mirna_field, pattern = "-", simplify = TRUE)
all$min[mirna_rows] <- as.numeric(mirna_pair[, 1])
all$max[mirna_rows] <- as.numeric(mirna_pair[, 2])

# Feature length as end minus start.
all$length <- all$max - all$min

library(RColorBrewer)
library(ggpubr)
library(ggpmisc)


# minMFE against log10(length): faint points, one linear fit per feature class
# (a thick black line underneath a thinner coloured one) and the fitted
# equation, R^2 and p-value printed per class.
# Computed statistics are referenced with after_stat() rather than the
# deprecated `..name..` notation, and TRUE is spelled out instead of `T`.
# NOTE: ylim() discards observations outside [-120, 20] before the fits are
# computed, so lines and equations describe the retained points only.
ggplot(data = all, aes(x = log10(length), y = minMFE, col = element)) +
  geom_point(size = 0.1, alpha = 0.1) +
  geom_smooth(
    aes(x = log10(length), y = minMFE, group = element),
    color = "black", method = "lm", size = 2, fill = NA
  ) +
  geom_smooth(se = TRUE, method = "lm", size = 1.3, fill = NA) +
  scale_color_brewer(type = "qual", palette = "Paired") +
  theme_classic() +
  stat_poly_eq(
    formula = y ~ x,
    aes(label = paste(
      after_stat(eq.label),
      after_stat(rr.label),
      after_stat(p.value),
      sep = "*`,`~"
    )),
    parse = TRUE,
    label.x.npc = "right",
    size = 4,
    vstep = 0.05
  ) +
  ylim(-120, 20)

ggsave(
  "minMFE_byLength_dotplot_minMFE_sep.png",
  device = "png", width = 8, height = 5
)

# Linear model of minMFE on length, feature class and their interaction.
# NOTE(review): the model uses raw `length` while the plots use log10(length);
# confirm this is intended.
lmsum <- lm(minMFE ~ length * element, data = all) %>% summary()

# View() needs an interactive data viewer; fall back to printing the
# coefficient table when the script is run non-interactively.
if (interactive()) {
  lmsum$coefficients %>% View()
} else {
  print(lmsum$coefficients)
}

# Same relationship as the scatter plot, showing only the per-class linear
# fits without confidence bands.
ggplot(all, aes(log10(length), minMFE, col = element)) +
  geom_smooth(method = "lm", se = FALSE) +
  scale_color_brewer(type = "qual", palette = "Paired") +
  theme_classic()
ggsave(
  filename = "minMFE_byLength_dlines_minMFE.png",
  plot = last_plot(),
  device = "png",
  width = 6,
  height = 5
)


### Mean var


# Read the per-feature MFE summary-statistic tables (these replace the delta
# tables held under the same variable names).
# Every path is passed through `file =` and carries no trailing whitespace:
# fread() interprets an `input` string that contains a space and does not name
# an existing file as a shell command.
LINE_element <- fread(file = "LINE_element_MFEstats.sorted.csv")
LTR_retrotransposon <- fread(file = "LTR_retrotransposon_MFEstats.sorted.csv")
SINE_element <- fread(file = "SINE_element_MFEstats.sorted.csv")
gene <- fread(file = "gene_MFEstats.sorted.csv")
helitron <- fread(file = "helitron_MFEstats.sorted.csv")
longesttranscripts <- fread(file = "longesttranscripts_MFEstats.sorted.csv")
miRNA <- fread(file = "miRNA_MFEstats.sorted.csv")
solo_LTR <- fread(file = "solo_LTR_MFEstats.sorted.csv")
terminal_inverted_repeat_element <- fread(
  file = "terminal_inverted_repeat_element_MFEstats.sorted.csv"
)

# Tag each statistics table with its feature class before stacking.
LINE_element[, element := "LINE_element"]
LTR_retrotransposon[, element := "LTR_retrotransposon"]
SINE_element[, element := "SINE_element"]
gene[, element := "gene"]
helitron[, element := "helitron"]
longesttranscripts[, element := "longesttranscripts"]
miRNA[, element := "miRNA"]
solo_LTR[, element := "solo_LTR"]
terminal_inverted_repeat_element[
  ,
  element := "terminal_inverted_repeat_element"
]

# Stack the tables (solo_LTR is tagged above but not included).
all <- rbind(
  LINE_element, LTR_retrotransposon, SINE_element,
  gene, helitron, longesttranscripts,
  miRNA,
  terminal_inverted_repeat_element
)

# Trigger a garbage collection after building the combined table.
gc()


# Display order of the feature classes (solo_LTR is intentionally absent).
stats_feature_order <- c(
  "miRNA",
  "gene",
  "longesttranscripts",
  "LINE_element",
  "SINE_element",
  "LTR_retrotransposon",
  "helitron",
  "terminal_inverted_repeat_element"
)
all$element <- factor(all$element, levels = stats_feature_order)

# Re-read the TE annotation and label its columns.
TE <- fread("Jiao2017.TE.filtered.gff3")
colnames(TE) <- c(
  "chr", "source", "type", "min", "max", "q1", "strand", "q2", "attributes"
)

# Three-letter family code from characters 4-6 of the attributes field.
TE$family <- substr(TE$attributes, 4, 6)
# Join key of the form "type::chr:min-max".
TE$header <- paste0(TE$type, "::", TE$chr, ":", TE$min, "-", TE$max)

# Only the coordinates, the family code and the join key are needed.
TE_less <- TE[, c("min", "max", "family", "header")]

# Left join: rows of `all` without a TE annotation keep family = NA.
all <- merge(all, TE_less, by = "header", all.x = TRUE)

# Non-TE classes get a fixed label; everything else keeps the family code from
# the annotation (solo_LTR is not part of this table).
all$family <- case_when(
  all$element == "miRNA" ~ "miRNA",
  all$element == "longesttranscripts" ~ "mRNA",
  all$element == "gene" ~ "gene",
  TRUE ~ all$family
)

# Collapse genes and their longest transcripts into one "gene" class.
# %in% is used instead of `==` so that rows whose family is NA (no annotation
# match) keep their element label rather than having it turned into NA.
all$element <- ifelse(
  all$family %in% c("gene", "mRNA"),
  "gene",
  as.character(all$element)
)

# Fix the plotting order of the families; any family value not listed here
# becomes NA in the factor.
all$family <- factor(
  all$family,
  levels = c(
    "gene",
    "mRNA",
    "miRNA",
    "DHH", "DTA", "DTC", "DTH", "DTM", "DTT",
    "RIL", "RIT",
    "RST",
    "RLC", "RLG", "RLX"
  )
)

library(cowplot)

# Box plots of mean MFE per family (axes flipped), filled by feature class,
# with a dotted reference line at -40 kcal/mol. Kept in a variable for the
# combined figure. (The object and file names say "violin"; the geom is a
# boxplot.)
violin_meanMFE <- ggplot(all, aes(family, meanMFE, fill = element)) +
  geom_boxplot() +
  geom_hline(yintercept = -40, linetype = "dotted") +
  scale_fill_brewer(type = "qual", palette = "Paired") +
  coord_flip() +
  labs(x = "Feature", y = "mean MFE (kcal/mol)") +
  theme_classic() +
  theme(axis.text.x = element_text(angle = 45, hjust = 1))
ggsave(
  filename = "meanMFE_allFeatures_violin_families_August.png",
  plot = violin_meanMFE,
  device = "png",
  width = 5,
  height = 3
)

# Density of mean MFE per feature class with a dotted line at -40 kcal/mol.
# The x axis carries the MFE values and the y axis the estimated density, so
# the axis titles say exactly that (they previously read "Feature" and
# "mean MFE", copied from the boxplot).
density_meanMFE <- ggplot(
  data = all,
  aes(x = meanMFE, fill = element, col = element)
) +
  geom_density(alpha = 0.1) +
  theme_classic() +
  theme(axis.text.x = element_text(angle = 45, hjust = 1)) +
  xlab("mean MFE (kcal/mol)") +
  ylab("Density") +
  geom_vline(xintercept = -40, linetype = "dotted") +
  scale_fill_brewer(type = "qual", palette = "Paired") +
  scale_color_brewer(type = "qual", palette = "Paired")
ggsave(
  "meanMFE_allFeatures_vdensity_families_August.png",
  plot = density_meanMFE,
  device = "png", width = 5, height = 3
)

# Partition-function values, one row per sequence header.
PF <- fread("B73_all_partitionfunctions.txt")
colnames(PF) <- c("header", "PF")

# Split the header on ":"; field 1 is the feature type, field 3 the
# chromosome and field 4 the "min-max" range.
header_fields <- str_split(PF$header, pattern = ":", simplify = TRUE)
PF$range <- header_fields[, 4]
range_fields <- str_split(PF$range, pattern = "-", simplify = TRUE)
PF$min <- as.numeric(range_fields[, 1])
PF$max <- as.numeric(range_fields[, 2])
PF$type <- header_fields[, 1]
PF$chr <- header_fields[, 3]

# Key used to join onto `all`. Genes and miRNAs use the header unchanged; for
# every other type the key is rebuilt with the start shifted by one.
# The coordinates are doubles, and paste() would render round values in
# scientific notation (100000 -> "1e+05"), producing a key that can never
# match; sprintf("%.0f") always writes plain digits.
PF$FeatureID <- ifelse(
  PF$type != "gene" & PF$type != "miRNA",
  sprintf("%s::%s:%.0f-%.0f", PF$type, PF$chr, PF$min + 1, PF$max),
  PF$header
)

# Inner join; columns shared by both tables get .x (from `all`) and .y (from
# PF) suffixes, so min.y / max.y are the coordinates parsed from the PF header.
allnew <- merge(all, PF, by.x = "header", by.y = "FeatureID")
allnew$length <- allnew$max.y - allnew$min.y
# Partition function normalised by sequence length.
allnew$PFnorm <- allnew$PF / allnew$length



# Density of the length-normalised partition function per feature class,
# restricted to the 0-0.75 range on the x axis.
ggplot(allnew, aes(PFnorm, fill = element, col = element)) +
  geom_density(alpha = 0.1) +
  scale_fill_brewer(type = "qual", palette = "Paired") +
  scale_color_brewer(type = "qual", palette = "Paired") +
  xlim(0, .75) +
  labs(x = "Partition function / length") +
  theme_classic() +
  theme(axis.text.x = element_text(angle = 45, hjust = 1))
ggsave(
  filename = "PFnormalized_allFeatures_vdensity_families_August.png",
  plot = last_plot(),
  device = "png",
  width = 5,
  height = 3
)

# Linear fit of normalised partition function on mean MFE for each feature
# class, annotated with the R^2 of every fit. The computed label is referenced
# with after_stat() rather than the deprecated `..rr.label..` notation.
ggplot(data = allnew, aes(x = meanMFE, y = PFnorm, col = element)) +
  geom_smooth(method = "lm") +
  stat_poly_eq(
    formula = y ~ x,
    aes(label = after_stat(rr.label)),
    parse = TRUE,
    label.x.npc = "right",
    size = 2,
    vstep = 0.05
  ) +
  theme_bw() +
  ylab("Partition function / length")
ggsave("meanMFE_vs_PF_all.png", device = "png", scale = 2)

# Scatter of normalised partition function against mean MFE. Only a random
# subset of rows is drawn as points, while the equation and R^2 come from the
# full table supplied to ggplot().
# The subset size is capped at the number of available rows because sample_n()
# fails when asked for more rows than exist. The draw is not seeded here, so
# the plotted subset changes between runs unless a seed is set beforehand.
ggplot(data = allnew, aes(x = meanMFE, y = PFnorm)) +
  geom_point(
    data = sample_n(allnew, size = min(20000, nrow(allnew))),
    alpha = 0.1, size = 0.5,
    aes(col = type)
  ) +
  stat_poly_eq(
    formula = y ~ x,
    aes(label = paste(
      after_stat(eq.label),
      after_stat(rr.label),
      sep = "*`,`~"
    )),
    parse = TRUE,
    label.x.npc = "right",
    size = 4,
    vstep = 0.05
  ) +
  theme_classic() +
  ylab("Partition function / length")
ggsave("meanMFE_vs_PF_alldot.png", device = "png", scale = 2)

# Stack the mean-MFE boxplot above the density plot in two equal-height,
# vertically aligned rows, then save the combined figure.
plot_grid(
  plotlist = list(violin_meanMFE, density_meanMFE),
  nrow = 2,
  align = "v",
  rel_heights = c(1, 1),
  labels = c("", "")
)
ggsave(
  filename = "combo_meanMFE.png",
  plot = last_plot(),
  height = 7,
  width = 6
)

## minMFE
# Box plots of min MFE per family (axes flipped), filled by feature class,
# with a dotted reference line at -40 kcal/mol.
violin_minMFE <- ggplot(
  data = all,
  aes(x = family, y = minMFE, fill = element)
) +
  geom_boxplot() +
  theme_classic() +
  theme(axis.text.x = element_text(angle = 45, hjust = 1)) +
  xlab("Feature") +
  ylab("min MFE (kcal/mol)") +
  geom_hline(yintercept = -40, linetype = "dotted") +
  scale_fill_brewer(type = "qual", palette = "Paired") +
  coord_flip()
ggsave(
  "minMFE_allFeatures_violin_families_August.png",
  plot = violin_minMFE,
  device = "png", width = 5, height = 3
)

# Density of min MFE per feature class. The x axis carries the MFE values and
# the y axis the estimated density, so the axis titles say exactly that
# (they previously read "Feature" and "min MFE", copied from the boxplot).
density_minMFE <- ggplot(
  data = all,
  aes(x = minMFE, fill = element, col = element)
) +
  geom_density(alpha = 0.1) +
  theme_classic() +
  theme(axis.text.x = element_text(angle = 45, hjust = 1)) +
  xlab("min MFE (kcal/mol)") +
  ylab("Density") +
  geom_vline(xintercept = -40, linetype = "dotted") +
  scale_fill_brewer(type = "qual", palette = "Paired") +
  scale_color_brewer(type = "qual", palette = "Paired")
ggsave(
  "minMFE_allFeatures_vdensity_families_August.png",
  plot = density_minMFE,
  device = "png", width = 5, height = 3
)

# Boxplot above density, two equal-height rows, saved as one figure.
plot_grid(
  violin_minMFE, density_minMFE,
  nrow = 2, labels = c("", ""),
  rel_heights = c(1, 1), align = "v"
)
ggsave("combo_minMFE.png", height = 7, width = 6)

## MFEvariance
# Box plots of MFE variance per family (axes flipped), filled by feature
# class.
violin_MFEvariance <- ggplot(
  data = all,
  aes(x = family, y = MFEvariance, fill = element)
) +
  geom_boxplot() +
  theme_classic() +
  theme(axis.text.x = element_text(angle = 45, hjust = 1)) +
  xlab("Feature") +
  ylab("Variance") +
  scale_fill_brewer(type = "qual", palette = "Paired") +
  coord_flip()
ggsave(
  "MFEvariance_allFeatures_violin_families_August.png",
  plot = violin_MFEvariance,
  device = "png", width = 5, height = 3
)

# Density of MFE variance per feature class. The x axis carries the variance
# and the y axis the estimated density, so the axis titles say exactly that
# (they previously read "Feature" and "Variance", copied from the boxplot).
density_MFEvariance <- ggplot(
  data = all,
  aes(x = MFEvariance, fill = element, col = element)
) +
  geom_density(alpha = 0.1) +
  theme_classic() +
  theme(axis.text.x = element_text(angle = 45, hjust = 1)) +
  xlab("Variance") +
  ylab("Density") +
  scale_fill_brewer(type = "qual", palette = "Paired") +
  scale_color_brewer(type = "qual", palette = "Paired")
ggsave(
  "MFEvariance_allFeatures_vdensity_families_August.png",
  plot = density_MFEvariance,
  device = "png", width = 5, height = 3
)

# Boxplot above density, two equal-height rows, saved as one figure.
plot_grid(
  violin_MFEvariance, density_MFEvariance,
  nrow = 2, labels = c("", ""),
  rel_heights = c(1, 1), align = "v"
)
ggsave("combo_MFEvariance.png", height = 7, width = 6)


## PctMFEltthresh
# Box plots of PctMFEltthresh per family (axes flipped), filled by feature
# class.
violin_PctMFEltthresh <- ggplot(
  data = all,
  aes(x = family, y = PctMFEltthresh, fill = element)
) +
  geom_boxplot() +
  theme_classic() +
  theme(axis.text.x = element_text(angle = 45, hjust = 1)) +
  xlab("Feature") +
  ylab("%") +
  scale_fill_brewer(type = "qual", palette = "Paired") +
  coord_flip()
ggsave(
  "PctMFEltthresh_allFeatures_violin_families_August.png",
  plot = violin_PctMFEltthresh,
  device = "png", width = 5, height = 3
)

# Density of PctMFEltthresh per feature class. The x axis carries the
# percentage and the y axis the estimated density, so the axis titles say
# exactly that (they previously read "Feature" and "%", copied from the
# boxplot).
density_PctMFEltthresh <- ggplot(
  data = all,
  aes(x = PctMFEltthresh, fill = element, col = element)
) +
  geom_density(alpha = 0.1) +
  theme_classic() +
  theme(axis.text.x = element_text(angle = 45, hjust = 1)) +
  xlab("%") +
  ylab("Density") +
  scale_fill_brewer(type = "qual", palette = "Paired") +
  scale_color_brewer(type = "qual", palette = "Paired")
ggsave(
  "PctMFEltthresh_allFeatures_vdensity_families_August.png",
  plot = density_PctMFEltthresh,
  device = "png", width = 5, height = 3
)

# Boxplot above density, two equal-height rows, saved as one figure.
plot_grid(
  violin_PctMFEltthresh, density_PctMFEltthresh,
  nrow = 2, labels = c("", ""),
  rel_heights = c(1, 1), align = "v"
)
ggsave("combo_PctMFEltthresh.png", height = 7, width = 6)

# MFE variance per family as flipped box plots, with both the fill and the
# outline coloured by feature class. The fill scale is added once only; adding
# it twice makes ggplot2 replace the first copy and emit a "Scale for fill is
# already present" message.
# NOTE(review): the output file name is identical to the one used for the
# fill-only MFE variance boxplot saved earlier in this script, so this call
# overwrites that file — confirm this is intended.
ggplot(
  data = all,
  aes(x = family, y = MFEvariance, fill = element, col = element)
) +
  geom_boxplot() +
  theme_classic() +
  theme(axis.text.x = element_text(angle = 45, hjust = 1)) +
  xlab("Feature") +
  ylab("MFE variance") +
  scale_fill_brewer(type = "qual", palette = "Paired") +
  scale_color_brewer(type = "qual", palette = "Paired") +
  coord_flip()
ggsave(
  "MFEvariance_allFeatures_violin_families_August.png",
  device = "png", width = 5, height = 3
)


# Violins of PctMFEltthresh per family, filled by feature class; the axis
# title describes it as the percentage below -40 kcal/mol.
ggplot(all, aes(family, PctMFEltthresh, fill = element)) +
  geom_violin() +
  scale_fill_brewer(type = "qual", palette = "Paired") +
  labs(x = "Feature", y = "Percent below -40 kcal/mol") +
  theme_classic() +
  theme(axis.text.x = element_text(angle = 45, hjust = 1))
ggsave(
  filename = "threshold_allFeatures_violin_families_August.png",
  plot = last_plot(),
  device = "png",
  width = 8,
  height = 4
)
