library(data.table)
library(tidyverse)
library(ggpubr)

# Eight-colour palette reused by the manual colour/fill scales in this script.
cbbPalette <- c(
  "#000000", "#E69F00", "#56B4E9", "#009E73",
  "#F0E442", "#0072B2", "#D55E00", "#CC79A7"
)

# Structure-designation table; only rows whose `type` is "gene" are kept.
all <- fread("all_minMFE_structureDesignations_wilcox.txt")
all <- filter(all, type == "gene")

# Pan-gene table. `header` is FeatureID with the "gene::" prefix removed so it
# can be matched against the `header` column of `all`.
pan <- fread("pangene_wideFormat.txt")
pan$header <- gsub("gene::", "", x = pan$FeatureID)

# Left join: every row of `all` is kept, with NA where `pan` has no match.
all <- merge(all, pan, by = "header", all.x = TRUE)

# Syntelog table. The file's header row is read and then replaced by the names
# below; the four trailing "rm*" columns are dropped.
# `header = TRUE` is spelled out (the original `h=T` relied on partial argument
# matching and on `T`, which can be reassigned).
orth <- fread("SF_50.5_W22v2_Sbv3.1_B73v4.36-nodups.txt", header = TRUE)
colnames(orth) <- c(
  "W22", "dup.W22", "syntelog_stat", "v4_ID", "dup.B73",
  "rm1", "rm2", "rm3", "rm4"
)
orth <- orth[, c("W22", "dup.W22", "syntelog_stat", "v4_ID", "dup.B73")]

all <- merge(all, orth, by = "v4_ID", all.x = TRUE)

# Designate by syntelog status: TRUE unless the status is the "no detected"
# message; rows with a missing status (no match in `orth`) stay NA.
all$syntelog <- all$syntelog_stat != "no detected annotated syntelog"

# Tissue expr
# Expression table (file name suggests TPMs); missing entries become 0.
old_expr <- fread("E-GEOD-50191-query-results.tpms.tsv")
old_expr[is.na(old_expr)] <- 0

# Left join on the gene identifier, then reshape the expression columns to long
# format: one row per original row x column, with the column name in `tissue`
# and the cell in `value` (pivot_longer's default value column).
all <- merge(all, old_expr, by.x = "IDsimple", by.y = "Gene ID", all.x = TRUE)

# NOTE(review): the columns are picked by position (281 to 303 of the merged
# table) - presumably the 23 tissue columns; confirm if any input file changes.
tissue_cols <- colnames(all)[(303 - 22):303]
all <- pivot_longer(all, cols = all_of(tissue_cols), names_to = "tissue")


# Single pairwise comparison handed to stat_compare_means(comparisons = ...).
my_comparisons <- list(c("nonstructured", "structured"))

# Get chromosome: the part of `header` before the first ":".
header_parts <- str_split(all$header, pattern = ":", simplify = TRUE)
all$chr_all <- header_parts[, 1]

# By 23 B73 tissues
# Boxplots of log10(value + 1) split by syntelog status (x) and coloured by
# structure designation; rows designated "random" are excluded.
expr_vs_strucure <- ggplot(
  data = filter(all, sig != "random"),
  aes(x = syntelog, y = log10(value + 1), color = sig)
) +
  geom_boxplot() +
  scale_color_manual(values = cbbPalette) +
  # The x axis is syntelog status, so the structured/nonstructured test has to
  # be run between the `sig` groups at each x position. `comparisons =` pairs
  # up x-axis levels, and "nonstructured"/"structured" are not x levels here.
  stat_compare_means(aes(group = sig), label = "p.format") +
  xlab("Has Sorghum syntelog") +
  # The plotted quantity is log-transformed, so the label says so.
  ylab("Expression (log10(TPM + 1))") +
  theme_classic() +
  #facet_wrap(facets = "tissue") +
  theme(axis.text.x = element_text(angle = 45, hjust = 1))

expr_vs_strucure
ggsave(
  filename = "expr_vs_strucure_acrosstissues_Aug_smaller_syn_04272023.png",
  plot = expr_vs_strucure,
  device = "png", width = 5, height = 5
)


### ONLY those with sorghum syntelogs
# filter() drops rows where a condition is NA (missing `sig` or `syntelog`);
# base `[` on a data frame or tibble would instead return an all-NA row for
# each of them.
syn_only <- filter(all, sig != "random", syntelog == TRUE)

expr_vs_strucure <- ggplot(
  data = syn_only,
  aes(x = sig, y = log10(value + 1), color = sig)
) +
  geom_boxplot() +
  scale_color_manual(values = cbbPalette) +
  # x is `sig` here, so the structured/nonstructured pair matches x levels.
  stat_compare_means(comparisons = my_comparisons) +
  # x shows the structure designation (the earlier label was carried over from
  # the syntelog plot); y is log-transformed.
  xlab("Structure designation") +
  ylab("Expression (log10(TPM + 1))") +
  theme_classic() +
  #facet_wrap(facets = "tissue") +
  theme(axis.text.x = element_text(angle = 45, hjust = 1))

expr_vs_strucure
ggsave(
  filename = "expr_vs_strucure_acrosstissues_onlysyn.png",
  plot = expr_vs_strucure,
  device = "png", width = 5, height = 5
)

# Hairpin table; its `seqnames` column is matched against FeatureID.
gene_hp <- fread("LP_hairpins_gt20_genes.bed")

# Flag rows whose FeatureID appears in the hairpin table. The flag stays a
# "1"/"0" string so it maps to a discrete colour scale. A direct %in% test
# replaces the merge + NA back-fill, which errored when no row was NA and
# assigned a numeric 0 into a character column.
all$has_hp <- ifelse(all$FeatureID %in% gene_hp$seqnames, "1", "0")

# Boxplots of log10(value + 1) by syntelog status, coloured by hairpin flag.
expr_vs_strucure <- ggplot(
  data = all,
  aes(x = syntelog, y = log10(value + 1), color = has_hp)
) +
  geom_boxplot() +
  scale_color_manual(values = c("darkgrey", "brown4")) +
  # Test hairpin vs. no hairpin within each x position; the
  # "nonstructured"/"structured" pair is not a set of x levels in this plot.
  stat_compare_means(aes(group = has_hp), label = "p.format") +
  xlab("Has Sorghum syntelog") +
  ylab("Expression (log10(TPM + 1))") +
  theme_classic() +
  #facet_wrap(facets = "tissue") +
  theme(axis.text.x = element_text(angle = 45, hjust = 1))

expr_vs_strucure
ggsave(
  filename = "expr_vs_hairpin_acrosstissues_Aug_smaller_syn_04272023.png",
  plot = expr_vs_strucure,
  device = "png", width = 5, height = 5
)


# Mean log10(value + 1) per structure designation.
# `value` is NA for rows that had no match in the expression table (left join),
# so NAs are removed explicitly; otherwise mean() returns NA. which() also
# drops rows whose `sig` is missing instead of turning them into NA entries.
mean(log10(all$value[which(all$sig == "structured")] + 1), na.rm = TRUE)
mean(log10(all$value[which(all$sig == "nonstructured")] + 1), na.rm = TRUE)

#ggsave("expr_vs_StrucDesignation_Aug_smaller_syn.png", 
#       plot = expr_vs_strucure, 
#       device = "png",
#       width = 6, height = 4)

# Linear fits of log10(value + 1) against minMFE.x, one line per combination
# of syntelog status (colour) and structure designation (linetype); rows
# designated "random" are excluded.
expr_vs_minMFE <- ggplot(
  data = all[all$sig != "random", ],
  aes(
    x = minMFE.x,
    y = log10(value + 1),
    col = syntelog,
    linetype = sig
  )
) +
  geom_smooth(method = "lm") +
  scale_color_manual(values = cbbPalette) +
  #stat_compare_means(comparisons = my_comparisons) +
  xlab("Secondary structure") +
  # The plotted quantity is log-transformed, so the label says so.
  ylab("Expression (log10(TPM + 1))") +
  theme_classic()
  #facet_wrap(facets = "tissue") 
  #geom_point(data = sample_n(all[all$sig != "random",], size = 100), alpha = 0.5) 
expr_vs_minMFE

# Name the plot explicitly instead of relying on ggsave()'s last-plot default.
ggsave(
  filename = "expr_vs_minMFE_syn_042723.png",
  plot = expr_vs_minMFE,
  device = "png", scale = 2
)

# Pairwise comparison of the two structure designations.
my_comparisons <- list(c("nonstructured", "structured"))

# By minMFE
# Violin of minMFE.x by syntelog status with an overall test label and a red
# point at each group's median.
# NOTE(review): `all` is still in the long format produced by the earlier
# pivot_longer(), so each minMFE.x value appears once per tissue row - confirm
# whether the test should be run on distinct genes instead.
ggplot(data = all, aes(x = syntelog, y = minMFE.x, fill = syntelog)) +
  geom_violin() +
  scale_fill_manual(values = cbbPalette) +
  #stat_compare_means(comparisons = my_comparisons) +
  xlab("Syntelog in Sorghum") +
  ylab("minMFE (kcal/mol)") +
  theme_classic() +
  stat_compare_means() +
  # `fun` replaces the `fun.y` argument, deprecated since ggplot2 3.3.0.
  stat_summary(fun = median, geom = "point", size = 2, color = "red")

ggsave("SynvsNot_minMFE.png", device = "png", scale = 2)

# Overlaid density curves of minMFE.x, filled by syntelog status, with a
# vertical reference line at -40.
ggplot(all, aes(fill = syntelog, x = minMFE.x)) +
  #geom_histogram(alpha = 0.5, bins = 50, aes(y=..density..)) +
  geom_density(alpha = 0.2) +
  geom_vline(xintercept = -40) +
  scale_fill_manual(values = cbbPalette) +
  #stat_compare_means(comparisons = my_comparisons) +
  #stat_compare_means()
  labs(x = "minMfe (kcal/mol)", y = "Density") +
  theme_classic()
ggsave(
  "SynvsNot_minMFE_density.png",
  device = "png",
  scale = 2
)


# Linear model of log10(value + 1) on minMFE.x, nonstructured rows only.
mod_test <- lm(
  formula = log10(value + 1) ~ minMFE.x,
  data = all[all$sig == "nonstructured", ]
)
summary(mod_test)

# Write the expr_vs_minMFE plot to a 7 x 5 PNG.
ggsave(
  filename = "quant_expr_vs_strucure_23tissues_Aug_smaller_syn.png",
  plot = expr_vs_minMFE,
  device = "png",
  height = 5, width = 7
)



# Boxplots of randMean by structure designation, with the
# nonstructured/structured comparison (both are x levels here).
# This plot was previously built and printed twice with identical code; it is
# now built once.
minMFE_vs_strucure <- ggplot(
  data = all,
  aes(x = sig, y = randMean, fill = sig)
) +
  geom_boxplot() +
  scale_fill_brewer(type = "qual", palette = 2) +
  stat_compare_means(comparisons = my_comparisons)
minMFE_vs_strucure

library(ggpmisc)

# Linear fit of the B73_rpkm_9 column against minMFE.x.
minMFE_by_expr <- ggplot(data = all, aes(x = minMFE.x, y = B73_rpkm_9)) +
  geom_smooth(method = "lm")
minMFE_by_expr

## Coefficient of variation
# Rebuild the gene table from the raw inputs. Unlike the first section, both
# merges here are inner joins (no `all.x`), so unmatched rows are dropped.
all <- fread("all_minMFE_structureDesignations_wilcox.txt")
all <- filter(all, type == "gene")
gc()

pan <- fread("pangene_wideFormat.txt")
pan$header <- gsub("gene::", "", x = pan$FeatureID)
all <- merge(all, pan, by = "header")

old_expr <- fread("E-GEOD-50191-query-results.tpms.tsv")
old_expr[is.na(old_expr)] <- 0
all <- merge(all, old_expr, by.x = "IDsimple", by.y = "Gene ID")

# CoV and mean by tissue
# Computed on a numeric matrix in one pass instead of a row-by-row loop that
# re-assigned two columns element by element (and iterated over 1:0 when the
# table was empty).
# NOTE(review): columns are picked by position (276 to 298 of the merged
# table) - presumably the 23 tissue columns; confirm if any input changes.
tissue_idx <- (298 - 22):298
tissue_mat <- as.matrix(as.data.frame(all)[, tissue_idx, drop = FALSE])
storage.mode(tissue_mat) <- "numeric"

all$tissue_mean <- rowMeans(tissue_mat)
# Coefficient of variation in percent: row SD over row mean, times 100.
all$tissue_cov <- apply(tissue_mat, 1, sd) / all$tissue_mean * 100

# NOTE(review): the script next replaces `all` with the contents of
# "gene_pan_expression_all_incTissues.txt" without writing this table out -
# presumably that file is a saved copy of it; confirm and add the write step.

# Gene table read from disk; the `tissue_mean` and `tissue_cov` columns used
# below come from this file.
all <- fread("gene_pan_expression_all_incTissues.txt")

# Syntelog table: rename the columns and drop the four trailing "rm*" ones.
# `header = TRUE` is spelled out instead of the partial match `h=T`.
orth <- fread("SF_50.5_W22v2_Sbv3.1_B73v4.36-nodups.txt", header = TRUE)
colnames(orth) <- c(
  "W22", "dup.W22", "syntelog_stat", "v4_ID", "dup.B73",
  "rm1", "rm2", "rm3", "rm4"
)
orth <- orth[, c("W22", "dup.W22", "syntelog_stat", "v4_ID", "dup.B73")]

# Left join on v4_ID, done once. The same merge used to be repeated a second
# time, which only duplicated the `orth` columns under .x/.y suffixes (and
# would multiply rows wherever v4_ID is not unique in `orth`).
all <- merge(all, orth, by = "v4_ID", all.x = TRUE)

# TRUE unless the status is the "no detected" message; NA where no `orth` row
# matched.
all$syntelog <- all$syntelog_stat != "no detected annotated syntelog"

# Mean log10(tissue_mean + 1) per structure designation.
mean(log10(all[all$sig == "structured", ]$tissue_mean + 1))
mean(log10(all[all$sig == "nonstructured", ]$tissue_mean + 1))

# Violins of tissue_cov by syntelog status, coloured by structure designation
# ("random" rows and rows with unknown syntelog status excluded), with a point
# at each group's median.
ggplot(
  all[all$sig != "random" & !is.na(all$syntelog), ],
  aes(x = syntelog, y = tissue_cov, col = sig)
) +
  geom_violin() +
  #geom_boxplot(width = 0.1, alpha = 0.7) +
  # x is syntelog status, so the designations are compared as groups within
  # each x position; `comparisons =` pairs up x-axis levels, which
  # "nonstructured"/"structured" are not in this plot.
  stat_compare_means(aes(group = sig), label = "p.format") +
  scale_color_manual(values = cbbPalette) +
  stat_summary(
    fun = "median",
    geom = "point", aes(col = sig),
    position = position_dodge(width = 0.9)
  ) +
  theme_classic()
ggsave(
  "TissueCov_by_designation_violin_Aug_smaller_syn.png",
  device = "png", width = 6, height = 5
)

# Flag rows whose FeatureID appears in the hairpin table. The table is
# `gene_hp` (read earlier from "LP_hairpins_gt20_genes.bed"); the name `HP`
# used here before is never defined in this script.
all$has_hp <- all$FeatureID %in% gene_hp$seqnames

# Violins of tissue_cov by syntelog status, coloured by hairpin flag, with a
# point at each group's median.
ggplot(
  all[all$sig != "random" & !is.na(all$syntelog), ],
  aes(x = syntelog, y = tissue_cov, col = has_hp)
) +
  geom_violin() +
  #geom_boxplot(width = 0.1, alpha = 0.7) +
  # Compare hairpin vs. no hairpin within each x position; the
  # "nonstructured"/"structured" pair is not a set of x levels in this plot.
  stat_compare_means(aes(group = has_hp), label = "p.format") +
  scale_color_manual(values = cbbPalette) +
  stat_summary(
    fun = "median",
    geom = "point", aes(col = has_hp),
    position = position_dodge(width = 0.9)
  ) +
  theme_classic()
ggsave(
  "TissueCov_by_HP_violin_Aug_smaller_syn.png",
  device = "png", width = 6, height = 5
)

# Linear models of tissue_cov among rows with a syntelog. Each summary is
# printed as soon as it is computed, because `mod_test` / `modsum` are reused
# and the first result would otherwise be overwritten without being shown.
mod_test <- lm(tissue_cov ~ sig, data = all[all$syntelog == TRUE, ])
modsum <- summary(mod_test)
print(modsum)

mod_test <- lm(tissue_cov ~ has_hp, data = all[all$syntelog == TRUE, ])
modsum <- summary(mod_test)
print(modsum)

# siRNA table, inner-joined on FeatureID (rows without a match are dropped).
siRNA <- fread("MFE_siRNA_join.txt")
all <- merge(all, siRNA, by = "FeatureID")
gc()

# Violin + narrow boxplot of log10(CountsPerNT) by structure designation, for
# rows without a syntelog, one panel per siRNA_length.
ggplot(
  all[all$sig != "random" & all$syntelog == FALSE, ],
  aes(x = sig, y = log10(CountsPerNT), col = sig)
) +
  geom_violin() +
  geom_boxplot(position = position_dodge(), alpha = 0.7, width = 0.1) +
  stat_compare_means(size = 2, comparisons = my_comparisons) +
  scale_color_manual(values = cbbPalette) +
  facet_wrap(facets = "siRNA_length") +
  labs(y = "siRNA abundance (log10 counts/NT)") +
  theme_classic() +
  theme(axis.text.x = element_text(angle = 45, vjust = 1, hjust = 1))
ggsave(
  "CountsPerNT_by_designation_violin_nonsynonly_may.png",
  scale = 2,
  device = "png"
)

# Linear fits of CountsPerNT against log10(tissue_mean) per structure
# designation, one panel per siRNA_length. Each fit is drawn twice: a wider
# white line first, then the coloured line on top of it.
ggplot(
  all[all$sig != "random", ],
  aes(x = log10(tissue_mean), y = CountsPerNT, col = sig)
) +
  #geom_point(data = sample_n(all[all$sig != "random",], size = 10000), alpha = 0.7, size = 0.1) +
  geom_smooth(aes(group = sig), method = "lm", size = 1.7, color = "white") +
  geom_smooth(method = "lm", se = TRUE, size = 1.3) +
  scale_color_manual(values = cbbPalette) +
  facet_wrap(facets = "siRNA_length") +
  theme_bw() +
  #ylim(0, 30) +
  labs(
    x = "mean expression across tissues (log10 TPM)",
    y = "siRNA species per NT"
  )

ggsave(
  "meanTPM_by_siRNAperNT_lines_Aug_smaller_syn.png",
  device = "png", width = 7, height = 5
)
#all
# Install `performance` only when it is missing, so the script does not hit the
# package repository on every run.
if (!requireNamespace("performance", quietly = TRUE)) {
  install.packages("performance")
}
library(performance)    # model assumptions & performance
#check_model(m)

# Fit CountsPerNT ~ tissue_mean on the rows of `dat` with siRNA length `len`,
# optionally restricted to one structure designation (`sig` value).
fit_sirna_by_expr <- function(dat, len, designation = NULL) {
  keep <- dat$siRNA_length == len
  if (!is.null(designation)) {
    keep <- keep & dat$sig == designation
  }
  lm(CountsPerNT ~ tissue_mean, data = dat[keep, ])
}

# All designations together.
m_21 <- fit_sirna_by_expr(all, 21)
m_22 <- fit_sirna_by_expr(all, 22)
m_24 <- fit_sirna_by_expr(all, 24)
#nonstructured
mu_21 <- fit_sirna_by_expr(all, 21, "nonstructured")
mu_22 <- fit_sirna_by_expr(all, 22, "nonstructured")
mu_24 <- fit_sirna_by_expr(all, 24, "nonstructured")
#random (summaries are printed rather than stored)
summary(fit_sirna_by_expr(all, 21, "random"))
summary(fit_sirna_by_expr(all, 22, "random"))
summary(fit_sirna_by_expr(all, 24, "random"))
#structured
ms_21 <- fit_sirna_by_expr(all, 21, "structured")
ms_22 <- fit_sirna_by_expr(all, 22, "structured")
ms_24 <- fit_sirna_by_expr(all, 24, "structured")

# Table providing the `cv` column used below; left-joined on the gene ID
# (`v4_ID` in `all`, `ID` in the file).
expr <- fread("pangene_MFE_withCoV.txt")
all <- merge(all, expr, by.x = "v4_ID", by.y = "ID", all.x = TRUE)

# Violins of cv by syntelog status, coloured by structure designation, for
# rows with non-zero cv, a non-"random" designation and a known syntelog
# status; a point marks each group's median.
alln <- all[, c("cv", "sig", "syntelog", "has_hp")]
filter(alln, cv != 0, sig != "random", !is.na(syntelog)) %>%
  ggplot(aes(x = syntelog, y = cv, color = sig)) +
  geom_violin() +
  #geom_boxplot(width = 0.1, alpha = 0.7) +
  stat_summary(
    fun = "median",
    geom = "point", aes(col = sig),
    position = position_dodge(width = 0.9)
  ) +
  # x is syntelog status, so the designations are compared as groups within
  # each x position; `comparisons =` pairs up x-axis levels, which
  # "nonstructured"/"structured" are not in this plot.
  stat_compare_means(aes(group = sig), label = "p.format") +
  scale_color_manual(values = cbbPalette) +
  theme_classic()
ggsave(
  "Cov_by_designation_violin_Aug_smaller_syn.png",
  device = "png", width = 6, height = 5
)

# Violins of cv by syntelog status, coloured by hairpin flag, with a point at
# each group's median. Uses `alln` unfiltered.
alln %>%
  ggplot(aes(x = syntelog, y = cv, color = has_hp)) +
  geom_violin() +
  #geom_boxplot(width = 0.1, alpha = 0.7) +
  stat_summary(
    fun = "median",
    geom = "point", aes(col = has_hp),
    position = position_dodge(width = 0.9)
  ) +
  # Compare hairpin vs. no hairpin within each x position; the
  # "nonstructured"/"structured" pair is not a set of x levels in this plot.
  stat_compare_means(aes(group = has_hp), label = "p.format") +
  scale_color_manual(values = cbbPalette) +
  theme_classic()
ggsave(
  "Cov_by_HP_violin_Aug_smaller_syn.png",
  device = "png", width = 6, height = 5
)

# Columns needed for the cv models below. `syntelog` and `has_hp` are kept:
# the models subset on `syntelog` and one of them regresses on `has_hp`, and
# both columns were missing from this selection before, so the subset was
# empty.
alln <- all[, c("cv", "CountsPerNT", "sig", "minMFE.x", "syntelog", "has_hp")]

# cv against minMFE.x with a linear fit and geom_smooth()'s default smoother.
ggplot(data = all, aes(x = minMFE.x, y = cv)) +
  geom_smooth(method = "lm") +
  geom_smooth() +
  theme_classic()

# NOTE(review): `all` was inner-joined with the siRNA table earlier and
# presumably has one row per gene x siRNA_length, so genes may be repeated in
# these fits - confirm whether the rows should be de-duplicated first.
syn_rows <- alln[alln$syntelog == TRUE, ]

# Each summary is printed as soon as it is computed because `mod_test` and
# `modsum` are reused.
mod_test <- lm(cv ~ sig, data = syn_rows)
modsum <- summary(mod_test)
print(modsum)

mod_test <- lm(cv ~ has_hp, data = syn_rows)
modsum <- summary(mod_test)
print(modsum)

  
# Long table of four MFE measures for "structured" rows: select and rename the
# columns, drop duplicate rows, then stack the measures into `name` / `value`.
struc <- all[, c("v4_ID", "sig", "meanMFE", "minMFE.x", "PctMFEltthresh", "MFEvariance", "cv", "tissue_cov")] %>%
  filter(sig == "structured")
names(struc) <- c("ID", "sig", "meanMFE", "minMFE", "propMFE", "varMFE", "cv", "tissue_cov")
struc <- pivot_longer(
  unique(struc),
  cols = c("meanMFE", "minMFE", "propMFE", "varMFE")
)

# Linear fit of cv against each measure, one free-x panel per measure.
ggplot(struc, aes(x = value, y = cv, col = name, linetype = name)) +
  geom_smooth(method = "lm") +
  scale_color_manual(values = c("blue", "black", "orange", "grey")) +
  facet_wrap(facets = "name", scales = "free_x") +
  theme_classic()
ggsave(
  "CV_vs_MFEmeasures_syn.png",
  scale = 2,
  device = "png"
)

# One linear model of cv per measure; each summary is printed.
model_minMFE <- lm(formula = cv ~ value, data = struc[struc$name == "minMFE", ])
summary(model_minMFE)

model_meanMFE <- lm(formula = cv ~ value, data = struc[struc$name == "meanMFE", ])
summary(model_meanMFE)

model_propMFE <- lm(formula = cv ~ value, data = struc[struc$name == "propMFE", ])
summary(model_propMFE)

model_varMFE <- lm(formula = cv ~ value, data = struc[struc$name == "varMFE", ])
summary(model_varMFE)


# Same long table as for the cv models, but keeping siRNA_length and
# CountsPerNT and without de-duplicating rows.
struc <- all[, c("v4_ID", "sig", "meanMFE", "minMFE.x", "PctMFEltthresh", "MFEvariance", "cv", "tissue_cov", "siRNA_length", "CountsPerNT")] %>%
  filter(sig == "structured")
names(struc) <- c("ID", "sig", "meanMFE", "minMFE", "propMFE", "varMFE", "cv", "tissue_cov", "siRNA_length", "CountsPerNT")
struc <- pivot_longer(struc, cols = c("meanMFE", "minMFE", "propMFE", "varMFE"))

# siRNA_length becomes a factor so it maps to discrete colours and linetypes.
struc$siRNA_length <- as.factor(struc$siRNA_length)

# Linear fit of CountsPerNT against each measure, per siRNA length.
ggplot(struc, aes(x = value, y = CountsPerNT, col = siRNA_length, linetype = siRNA_length)) +
  geom_smooth(method = "lm") +
  scale_color_brewer(palette = "Dark2") +
  facet_wrap(facets = "name", scales = "free_x") +
  theme_classic()
ggsave(
  "siRNA_vs_MFEmeasures_syn.png",
  scale = 2,
  device = "png"
)

# One linear model per measure and siRNA length; each summary is printed.

# 24-nt
model_minMFE_24 <- lm(formula = CountsPerNT ~ value, data = struc[struc$name == "minMFE" & struc$siRNA_length == 24, ])
summary(model_minMFE_24)
model_meanMFE_24 <- lm(formula = CountsPerNT ~ value, data = struc[struc$name == "meanMFE" & struc$siRNA_length == 24, ])
summary(model_meanMFE_24)
model_propMFE_24 <- lm(formula = CountsPerNT ~ value, data = struc[struc$name == "propMFE" & struc$siRNA_length == 24, ])
summary(model_propMFE_24)
model_varMFE_24 <- lm(formula = CountsPerNT ~ value, data = struc[struc$name == "varMFE" & struc$siRNA_length == 24, ])
summary(model_varMFE_24)

# 22-nt
model_minMFE_22 <- lm(formula = CountsPerNT ~ value, data = struc[struc$name == "minMFE" & struc$siRNA_length == 22, ])
summary(model_minMFE_22)
model_meanMFE_22 <- lm(formula = CountsPerNT ~ value, data = struc[struc$name == "meanMFE" & struc$siRNA_length == 22, ])
summary(model_meanMFE_22)
model_propMFE_22 <- lm(formula = CountsPerNT ~ value, data = struc[struc$name == "propMFE" & struc$siRNA_length == 22, ])
summary(model_propMFE_22)
model_varMFE_22 <- lm(formula = CountsPerNT ~ value, data = struc[struc$name == "varMFE" & struc$siRNA_length == 22, ])
summary(model_varMFE_22)

# 21-nt
model_minMFE_21 <- lm(formula = CountsPerNT ~ value, data = struc[struc$name == "minMFE" & struc$siRNA_length == 21, ])
summary(model_minMFE_21)
model_meanMFE_21 <- lm(formula = CountsPerNT ~ value, data = struc[struc$name == "meanMFE" & struc$siRNA_length == 21, ])
summary(model_meanMFE_21)
model_propMFE_21 <- lm(formula = CountsPerNT ~ value, data = struc[struc$name == "propMFE" & struc$siRNA_length == 21, ])
summary(model_propMFE_21)
model_varMFE_21 <- lm(formula = CountsPerNT ~ value, data = struc[struc$name == "varMFE" & struc$siRNA_length == 21, ])
summary(model_varMFE_21)


# Wide table of structured rows (MFE measures are not pivoted here).
struc <- all[, c("v4_ID", "sig", "meanMFE", "minMFE.x", "PctMFEltthresh", "MFEvariance", "cv", "tissue_cov", "siRNA_length", "CountsPerNT")]
struc <- filter(struc, sig == "structured")
colnames(struc) <- c("ID", "sig", "meanMFE", "minMFE", "propMFE", "varMFE", "cv", "tissue_cov", "siRNA_length", "CountsPerNT")
#struc <- pivot_longer(struc, cols = c("meanMFE", "minMFE", "propMFE", "varMFE"))

# Quartile bin of minMFE. The breaks are computed once instead of being
# recomputed inside nested ifelse() calls. With left-open intervals:
#   minMFE <= Q1       -> "four"
#   Q1 < minMFE <= Q2  -> "three"
#   Q2 < minMFE <= Q3  -> "two"
#   minMFE > Q3        -> "one"
quartile_labels <- c("four", "three", "two", "one")
mfe_breaks <- quantile(struc$minMFE)
quartile_idx <- findInterval(
  struc$minMFE, mfe_breaks[2:4],
  left.open = TRUE
) + 1L
struc$quantile <- factor(quartile_labels[quartile_idx], levels = quartile_labels)

# Comparisons between neighbouring quartile bins (x levels of the plot below).
my_comparisons <- list(
  c("one", "two"),
#  c("one", "three"),
#  c("one", "four"),
  c("two", "three"),
#  c("two", "four"),
  c("three", "four")
)

# Violin + narrow boxplot of log10(CountsPerNT * 1000) per quartile bin, one
# free-y panel per siRNA_length.
ggplot(struc, aes(y = log10(CountsPerNT * 1000), x = quantile)) +
  geom_violin() +
  geom_boxplot(width = 0.08) +
  #scale_color_brewer(palette = "Dark2") +
  facet_wrap(facets = "siRNA_length", scales = "free_y") +
  stat_compare_means(comparisons = my_comparisons) +
  ylab("siRNA mapping (log10 species per kb)") +
  theme_classic()
ggsave("siRNA_vs_minMFEQuantile_less_syn.png", device = "png", scale = 2)
