library(data.table)
library(tidyverse)

# NOTE(review): every relative input/output path in this script is resolved
# against the home directory set here.
setwd("~")

# Read siRNA stats
# One count-stats file per feature type. The vector names are the labels
# written to the `element` column; the values are the files read by fread().
siRNA_stat_files <- c(
  LINE_element = "LINE_element-v2.siRNA_count_stats.csv",
  LTR_retrotransposon = "LTR_retrotransposon.siRNA_count_stats.csv",
  SINE_element = "SINE_element.siRNA_count_stats.csv.gz",
  gene = "gene.siRNA_count_stats.csv",
  helitron = "helitron.siRNA_count_stats.csv",
  mRNA = "mRNA.siRNA_count_stats.csv.gz",
  miRNA = "miRNA.siRNA_count_stats.csv.gz",
  solo_LTR = "solo_LTR.siRNA_count_stats.csv",
  terminal_inverted_repeat_element =
    "terminal_inverted_repeat_element-v2.siRNA_count_stats.csv"
)

# Read each table, append its feature-type label as the last column, and stack
# the tables by column name. The per-type tables are not kept as separate
# globals, so only the combined table stays in memory.
all <- rbindlist(
  lapply(names(siRNA_stat_files), function(feature_type) {
    stats <- fread(siRNA_stat_files[[feature_type]])
    set(stats, j = "element", value = rep(feature_type, nrow(stats)))
    stats
  }),
  use.names = TRUE
)

# Assign miRNA-like or not designation
# Load the designation table and keep one row per distinct
# (FeatureID, attribute, sig) combination.
designation_cols <- c("FeatureID", "attribute", "sig")
MFE_Des <- fread("new_MFE_designations_Sep25.txt")
MFE_Des <- unique(MFE_Des[, designation_cols, with = FALSE])

# merge() defaults to an inner join, so features without a designation row
# are dropped from `all` here.
all <- merge(all, MFE_Des, by = "FeatureID")

# The designation table's `attribute` overwrites the feature-type label, and
# the now-redundant column is removed.
all[["element"]] <- all[["attribute"]]
all[["attribute"]] <- NULL
gc()


# Derive the family label from the feature type:
#   * solo_LTR rows get their existing family prefixed with "solo_";
#   * gene / mRNA / miRNA rows use the feature type itself as the family.
feature_type <- all$element
family_label <- ifelse(feature_type == "solo_LTR",
                       paste0("solo_", all$family),
                       all$family)
for (fixed_label in c("gene", "mRNA", "miRNA")) {
  family_label <- ifelse(feature_type == fixed_label, fixed_label, family_label)
}
all$family <- family_label

# Rows whose family is "gene" or "mRNA" are pooled under the element "gene".
is_genic <- family_label == "gene" | family_label == "mRNA"
all$element <- ifelse(is_genic, "gene", as.character(all$element))

# Drop the full-length temporaries before the garbage collection below.
rm(feature_type, family_label, fixed_label, is_genic)

#siRNA lengths
lens <- c("21", "22", "24")

all$V1 <- NULL

gc()

# Reshape to long format: every column that is NOT one of the identifier /
# annotation columns below becomes a (name, value) pair.
id_columns <- c("FeatureID",
                "Strand",
                "Chromosome",
                "StartPos",
                "StopPos",
                "MFE_Region_Length",
                "nonMFE_Region_Length",
                "element",
                "sig",
                "family")
all_long <- pivot_longer(data = all, cols = !all_of(id_columns))

# Split the former column names on "_" ONCE and reuse the pieces; the split of
# this very long vector was previously recomputed for each derived column.
# Field 1 = secondary structure, 2 = library, 3 = siRNA length, 6 = measure.
name_fields <- str_split(all_long$name, pattern = "_", simplify = TRUE)

all_long$library <- name_fields[, 2]
all_long$siRNA_length <- name_fields[, 3]

## For future retrieval  ##
#write_delim(all_long, "all_siRNA_long_oct.tsv", delim = "\t")
#all_long <- fread("all_siRNA_long_sep.tsv")

# Drop positional columns that are not used downstream.
all_long$header <- NULL
all_long$Strand <- NULL
all_long$Chromosome <- NULL
all_long$StartPos <- NULL
all_long$StopPos <- NULL
gc()

all_long$secondary_structure <- name_fields[, 1]
all_long$measure <- name_fields[, 6]
rm(name_fields)

all_long$name <- NULL

# Keep only the per-nucleotide count measure.
all_long <- filter(all_long, measure == "CountsPerNT")

gc()

# Save the CountsPerNT-only long table (written before `family` is converted
# to a factor below).
write_delim(all_long, "all_siRNA_long_CountsPerNT.tsv", delim = "\t")

# Fixed ordering of families; a family not listed here becomes NA in the
# resulting factor.
family_levels <- c(
  "gene",
  "mRNA",
  "miRNA",
  "DHH", "DTA", "DTC", "DTH", "DTM", "DTT",
  "RIL", "RIT",
  "RST",
  "RLC", "RLG", "RLX",
  "solo_RLC", "solo_RLG", "solo_RLX"
)
all_long$family <- factor(all_long$family, levels = family_levels)

# Compare siRNA mapping by SS designation
# x shows the element and y shows log10(CountsPerNT * 1000), so the
# measurement label goes on the y axis (it was previously set with xlab()).
# NOTE(review): rows with value == 0 give log10(0) = -Inf here (no +1 offset,
# unlike the violin plot), so they are not drawn by the boxplot.
ggplot(all_long, aes(x = element, y = log10(value*1000), fill = secondary_structure)) +
  geom_boxplot() +
  scale_fill_brewer(type = "qual", palette = 2) +
  theme_bw() +
  theme(axis.text.x = element_text(angle = 45, hjust = 1)) +
  ylab("log10(siRNA counts per kb)") +
  facet_wrap(~ siRNA_length, scales = "free_y", nrow=3)
# ggsave() without `plot =` saves the most recently created plot (the one above).
ggsave("siRNA_family_all_boxplots_sep_hairpin_mar.png", device = "png", width = 10, height = 8)

# log10 of counts per kb with a +1 offset so that zero counts map to 0.
all_long$siRNA_log10perkb <- log10(all_long$value * 1000 + 1)

# Violin plot per element, split by secondary structure, one facet row per
# siRNA length; the points mark the group means. The measurement label is
# placed on the y axis (it was previously set with xlab(), mislabelling the
# element axis).
testgg <- ggplot(all_long, aes(x = element, y = siRNA_log10perkb, fill = secondary_structure, col = secondary_structure)) +
  geom_violin() +
  scale_fill_brewer(type = "qual", palette = 2) +
  scale_color_brewer(type = "qual", palette = 2) +
  theme_bw() +
  theme(axis.text.x = element_text(angle = 45, hjust = 1)) +
  ylab("log10(siRNA counts per kb + 1)") +
  stat_summary(fun = "mean",
               geom = "point") +
  facet_wrap(~ siRNA_length, scales = "free_y", nrow=3)

ggsave(plot = testgg, filename = "siRNA_family_all_violin_plus1_hairpin_mar.png", device = "png", width = 15, height = 10)

# Compare within elements
# Spread the MFE / nonMFE values into two columns so each row pairs both
# regions of the same feature x library x siRNA length.
# pivot_wider() treats every column other than names_from/values_from as an
# identifier. `siRNA_log10perkb` is derived from `value`, so it differs between
# the MFE and nonMFE rows of the same feature; if left in, the two rows are
# never paired and MFE / nonMFE end up NA. It is therefore dropped first.
all_wider <- all_long %>%
  select(-any_of("siRNA_log10perkb")) %>%
  pivot_wider(names_from = secondary_structure, values_from = value)

# Whole-feature counts per NT: length-weighted mean of the two regions.
all_wider <- all_wider %>%
  mutate(
    CountsPerNT = (MFE_Region_Length * MFE + nonMFE_Region_Length * nonMFE) /
      (MFE_Region_Length + nonMFE_Region_Length)
  )

## siRNA in low vs high MFE regions or hp/non-hp regions
# The measurement is on the y axis, so it is labelled with ylab() (previously
# xlab(), which mislabelled the element axis).
ggplot(all_wider, aes(x = element, y = log10(CountsPerNT*1000+1), fill = element)) +
  geom_boxplot() +
  scale_fill_brewer(type = "qual", palette = "Paired") +
  theme_bw() +
  theme(axis.text.x = element_text(angle = 45, hjust = 1)) +
  ylab("log10(siRNA counts per kb + 1)") +
  facet_wrap(~ siRNA_length, scales = "free_y", nrow=3)
ggsave("siRNA_family_all_boxplots_TotalsiRNAperNT_hairpin_mar.png", device = "png", width = 10, height = 8)


## Statistical tests
library(lme4)
library(lmerTest)

# Keep only rows with siRNA counts in BOTH the MFE and the nonMFE region.
all_wider <- all_wider %>% filter(MFE > 0, nonMFE > 0)

# Back to long format: `name` holds "MFE" / "nonMFE" and `value` the counts.
all_longer <- all_wider %>% pivot_longer(cols = c("MFE", "nonMFE"))

# Examples
# Each model regresses log(counts per kb + 1) on region type (`name`) with a
# random intercept per FeatureID, for one element and one siRNA length.
dhh_21 <- all_longer[all_longer$siRNA_length == 21 & all_longer$element == "DHH", ]
DHH_Model1_21 <- lmer(log(value*1000 + 1) ~ name + (1|FeatureID), data = dhh_21)
summary(DHH_Model1_21)

dhh_24 <- all_longer[all_longer$siRNA_length == 24 & all_longer$element == "DHH", ]
DHH_Model1_24 <- lmer(log(value*1000 + 1) ~ name + (1|FeatureID), data = dhh_24)
summary(DHH_Model1_24)

dth_21 <- all_longer[all_longer$siRNA_length == 21 & all_longer$element == "DTH", ]
DTH_Model1_21 <- lmer(log(value*1000 + 1) ~ name + (1|FeatureID), data = dth_21)
summary(DTH_Model1_21)


fams <- all_longer$element %>% unique()
sizes <- c(21, 22, 24)

# Three-significant-digit scientific notation used for every reported number.
format_sci <- function(x) {
  formatC(signif(x, digits = 3), digits = 3, format = "e", flag = "#")
}

# Test each type and make table
# One mixed model per element x siRNA length. Results are collected in a
# preallocated list and bound once at the end (no placeholder row, no rbind
# inside the loop, and no print() calls echoing each number to the console).
results <- vector("list", length(fams) * length(sizes))
k <- 0L
for (i in fams) {
  for (j in sizes) {
    model_data <- all_longer[all_longer$siRNA_length == j & all_longer$element == i, ]
    model <- lmer(log(value*1000 + 1) ~ name + (1|FeatureID),
                  data = model_data)
    # Row 2 of the coefficient table is the `name` (region type) effect;
    # columns are addressed by name rather than by position.
    coefs <- summary(model)$coefficients
    entry <- paste0("Estimate = ", format_sci(coefs[2, "Estimate"]),
                    " | P = ", format_sci(coefs[2, "Pr(>|t|)"]),
                    " | std. err. = ", format_sci(coefs[2, "Std. Error"]))
    k <- k + 1L
    results[[k]] <- data.frame(family = i,
                               siRNA_size = j,
                               measure = "Structure",
                               data = entry,
                               stringsAsFactors = FALSE)
  }
}

df <- bind_rows(results)

# One row per element, one column per siRNA length.
df_wider <- pivot_wider(df, names_from = siRNA_size, values_from = data)

write_delim(df_wider, "features_structure_by_siRNAabundance_mixedeffects.csv", delim = ",")


## Zero inflated
library(BhGLM)
library(NBZIMM)
library(nlme)
library(glmmTMB)


# NOTE(review): `RLC` is not used by any later statement in this script.
RLC <- filter(all_long, all_long$family == "RLC")

# Zero-inflated mixed model for 24-nt siRNAs in the RLC family.
# `ziformula` must be a one-sided formula; `~1` fits a single (intercept-only)
# zero-inflation probability. It was previously passed the logical TRUE,
# which is not a formula.
model_zeros <- glmmTMB::glmmTMB(log(value*1000 + 1) ~ secondary_structure + (1|FeatureID),
                               data = all_long[all_long$siRNA_length == 24 & all_long$family == "RLC",],
                               ziformula = ~1)
summary(model_zeros)


# lme.zig fit for 24-nt siRNAs in genes, with a random intercept per FeatureID.
model_gene <- lme.zig(fixed = log10(value * 1000 + 1) ~ secondary_structure,
                      random = ~ 1 | FeatureID,
                      data = all_long[all_long$siRNA_length == 24 & all_long$element == "gene",])
summary(model_gene)



fams <- all_longer$element %>% unique()
sizes <- c(21, 22, 24)

# Three-significant-digit scientific notation used for every reported number.
format_sci <- function(x) {
  formatC(signif(x, digits = 3), digits = 3, format = "e", flag = "#")
}

# One mixed model per element x siRNA length, fitted on `all_long` (which
# still contains zero counts) with secondary structure as the fixed effect.
# NOTE(review): the output file name says ZEROINFLATED, but the model fitted
# here is a plain lmer(), not a zero-inflated model -- confirm which is intended.
results <- vector("list", length(fams) * length(sizes))
k <- 0L
for (i in fams) {
  for (j in sizes) {
    model_data <- all_long[all_long$siRNA_length == j & all_long$element == i, ]
    model <- lmer(log(value*1000 + 1) ~ secondary_structure + (1|FeatureID),
                  data = model_data)
    # Row 2 of the coefficient table is the secondary_structure effect;
    # columns are addressed by name rather than by position.
    coefs <- summary(model)$coefficients
    entry <- paste0("Estimate = ", format_sci(coefs[2, "Estimate"]),
                    " | P = ", format_sci(coefs[2, "Pr(>|t|)"]),
                    " | std. err. = ", format_sci(coefs[2, "Std. Error"]))
    k <- k + 1L
    results[[k]] <- data.frame(family = i,
                               siRNA_size = j,
                               measure = "Structure",
                               data = entry,
                               stringsAsFactors = FALSE)
  }
}

# Bind once at the end instead of growing a data frame from a placeholder row.
df <- bind_rows(results)

# One row per element, one column per siRNA length.
df_wider <- pivot_wider(df, names_from = siRNA_size, values_from = data)

write_delim(df_wider, "features_structure_by_siRNAabundance_ZEROINFLATEDmixedeffects.csv", delim = ",")


# lme.zig fits of 24-nt siRNA abundance against secondary structure, one
# family at a time, each with a random intercept per FeatureID.
# The DTT model was previously fitted and summarised twice; it is fitted once.

# Rows of `all_long` for the 24-nt class of a single family.
family_24nt <- function(fam) {
  all_long[all_long$siRNA_length == 24 & all_long$family == fam, ]
}

model_miRNA <- lme.zig(fixed = log10(value * 1000 + 1) ~ secondary_structure,
                       random = ~ 1 | FeatureID,
                       data = family_24nt("miRNA"))
summary(model_miRNA)

model_DHH <- lme.zig(fixed = log10(value * 1000 + 1) ~ secondary_structure,
                     random = ~ 1 | FeatureID,
                     data = family_24nt("DHH"))
summary(model_DHH)

model_DTT <- lme.zig(fixed = log10(value * 1000 + 1) ~ secondary_structure,
                     random = ~ 1 | FeatureID,
                     data = family_24nt("DTT"))
summary(model_DTT)

model_DTA <- lme.zig(fixed = log10(value * 1000 + 1) ~ secondary_structure,
                     random = ~ 1 | FeatureID,
                     data = family_24nt("DTA"))
summary(model_DTA)

model_DTC <- lme.zig(fixed = log10(value * 1000 + 1) ~ secondary_structure,
                     random = ~ 1 | FeatureID,
                     data = family_24nt("DTC"))
summary(model_DTC)

model_DTM <- lme.zig(fixed = log10(value * 1000 + 1) ~ secondary_structure,
                     random = ~ 1 | FeatureID,
                     data = family_24nt("DTM"))
summary(model_DTM)

model_RIL <- lme.zig(fixed = log10(value * 1000 + 1) ~ secondary_structure,
                     random = ~ 1 | FeatureID,
                     data = family_24nt("RIL"))
summary(model_RIL)

model_RIT <- lme.zig(fixed = log10(value * 1000 + 1) ~ secondary_structure,
                     random = ~ 1 | FeatureID,
                     data = family_24nt("RIT"))
summary(model_RIT)

model_RST <- lme.zig(fixed = log10(value * 1000 + 1) ~ secondary_structure,
                     random = ~ 1 | FeatureID,
                     data = family_24nt("RST"))
summary(model_RST)

model_RLG <- lme.zig(fixed = log10(value * 1000 + 1) ~ secondary_structure,
                     random = ~ 1 | FeatureID,
                     data = family_24nt("RLG"))
summary(model_RLG)

# Descriptive summaries of CountsPerNT by siRNA length, element and
# secondary-structure class.

# Values of `all_long$value` for one siRNA length x element x structure class.
# With `nonzero_only = TRUE`, rows whose value is exactly 0 are excluded too.
structure_values <- function(len, elem, structure, nonzero_only = FALSE) {
  keep <- all_long$siRNA_length == len &
    all_long$element == elem &
    all_long$secondary_structure == structure
  if (nonzero_only) {
    keep <- keep & all_long$value != 0
  }
  all_long[keep, ]$value
}

#DTC (non-zero values only)
# NOTE(review): this DTC section appeared three times verbatim; the repeats
# were removed. If other families were intended, add them here.
mean(structure_values(21, "DTC", "MFE", nonzero_only = TRUE))
mean(structure_values(21, "DTC", "nonMFE", nonzero_only = TRUE))
mean(structure_values(22, "DTC", "MFE", nonzero_only = TRUE))
mean(structure_values(22, "DTC", "nonMFE", nonzero_only = TRUE))
mean(structure_values(24, "DTC", "MFE", nonzero_only = TRUE))
mean(structure_values(24, "DTC", "nonMFE", nonzero_only = TRUE))

#DTH (non-zero values only)
mean(structure_values(21, "DTH", "MFE", nonzero_only = TRUE))
mean(structure_values(21, "DTH", "nonMFE", nonzero_only = TRUE))
mean(structure_values(22, "DTH", "MFE", nonzero_only = TRUE))
mean(structure_values(22, "DTH", "nonMFE", nonzero_only = TRUE))
mean(structure_values(24, "DTH", "MFE", nonzero_only = TRUE))
mean(structure_values(24, "DTH", "nonMFE", nonzero_only = TRUE))

#gene (zeros included; the 24-nt class reports the median of counts per kb)
mean(structure_values(21, "gene", "MFE"))
mean(structure_values(21, "gene", "nonMFE"))
mean(structure_values(22, "gene", "MFE"))
mean(structure_values(22, "gene", "nonMFE"))
median(structure_values(24, "gene", "MFE") * 1000)
median(structure_values(24, "gene", "nonMFE") * 1000)


#RLC (zeros included)
mean(structure_values(21, "RLC", "MFE"))
mean(structure_values(21, "RLC", "nonMFE"))
mean(structure_values(22, "RLC", "MFE"))
mean(structure_values(22, "RLC", "nonMFE"))
mean(structure_values(24, "RLC", "MFE"))
mean(structure_values(24, "RLC", "nonMFE"))
