# age from .divsum
library(dplyr)
library(readr)
library(magrittr)
library(ggplot2)
library(purrr)
library(tidyr)
library(ggsci)
library(ggpubr)

# read in the table section from .divsum
species_shortname <- "hsym" # hsym / hech

# Per-species settings: input file, genome size (the denominator used for
# perc_genome further down) and the name shown in plot titles.
species_settings <- list(
  hsym = list(
    table_file = "hsym.divsum.table",
    genome_size = 413473981,
    species_name = "Hydractinia symbiolongicarpus"
  ),
  hech = list(
    table_file = "hech.divsum.table",
    genome_size = 574487405,
    species_name = "Hydractinia echinata"
  )
)

# Fail fast on an unknown short name. Without this check no settings would be
# loaded and `table` would silently resolve to base::table() in later steps.
if (!is.character(species_shortname) || length(species_shortname) != 1 ||
    !species_shortname %in% names(species_settings)) {
  stop(
    "Unknown species_shortname '",
    paste(species_shortname, collapse = ", "),
    "'; expected one of: ",
    paste(names(species_settings), collapse = ", "),
    call. = FALSE
  )
}

selected_species <- species_settings[[species_shortname]]
# NOTE(review): read_table2() is reported as deprecated in readr >= 2.0 in
# favour of read_table(); switch once the minimum readr version is confirmed.
table <- read_table2(selected_species$table_file)
genome_size <- selected_species$genome_size
species_name <- selected_species$species_name

# clean data --------------------------------------------------------------
# remove the last empty column
# seq_len() replaces `1:dim(table)[2]-1`, which parses as (1:n) - 1 and only
# selected the right columns because the resulting index 0 is dropped.
table <- table[, seq_len(ncol(table) - 1)]
# remove non-informative rows (every column after the first sums to 0);
# the sums are compared numerically rather than against the string "0"
is_all_zero_row <- rowSums(table[-1]) == 0
table <- table[!is_all_zero_row, ]
# wide to long form: every column except the first becomes a
# (te_class, raw_cnt) pair, keeping the first column as the identifier
table_long <- tidyr::gather(table, te_class, raw_cnt, -1)
# express each count as a percentage of genome_size
table_long <- mutate(table_long, perc_genome = raw_cnt / genome_size * 100)

# plot for each superfamily --------------------------------------------------------------------
# Stacked bar chart of perc_genome against Div for the rows whose te_class
# equals the selected label.
selected_class <- "DNA"

g <- table_long %>%
  filter(te_class == selected_class) %>%
  ggplot(aes(x = Div, y = perc_genome, fill = te_class))
g +
  geom_bar(stat = "identity") +
  labs(title = selected_class) +
  theme_bw() +
  scale_fill_npg()


# plot for general class --------------------------------------------------
# add general class column: split each "class/superfamily" label in two.
# Labels without a "/" have no superfamily; fill = "right" records that as NA
# explicitly instead of relying on separate()'s warn-and-fill default.
table_long <- tidyr::separate(
  table_long, te_class, c("general_class", "superfamily"),
  sep = "/", fill = "right"
)
# class to plot
class_to_plot <- c("DNA", "LTR", "LINE", "SINE", "SINE?", "RC")
# x-axis ticks every 5 units, up to the next multiple of 10 above max(Div)
div_breaks <- seq(0, 10 * ceiling(max(table_long$Div) / 10), by = 5)

g <- ggplot(
  table_long %>% filter(general_class %in% class_to_plot),
  aes(x = Div, y = perc_genome, fill = general_class)
)
general_class_plot <- g +
  geom_bar(stat = "identity") +
  labs(
    title = species_name,
    x = "Kimura substitution level (CpG adjusted)",
    y = "Percentage of genome (%)"
  ) +
  scale_x_continuous(breaks = div_breaks) +
  theme_pubr() + scale_fill_npg() +
  theme(plot.title = element_text(face = "italic"))
# print explicitly so the figure is also shown when the script is source()d
print(general_class_plot)

# Save exactly this plot; passing it explicitly avoids depending on whatever
# ggplot2 currently considers the "last plot".
output_name <- paste0(species_shortname, "_age_estimation_kimura.svg")
ggsave(output_name, plot = general_class_plot,
       width = 232, height = 180, units = "mm")


# plot LTR superfamily ----------------------------------------------------
# column names of `table` that contain "LTR"
ltr_superfamily <- grep("LTR", colnames(table), value = TRUE)

# plot function
# (the `function` header had been commented out while the body and closing
# brace were still live, which made the whole file fail to parse)

#' Bar chart of perc_genome against Div for a single superfamily.
#'
#' @param df Long-format table with the columns Div, perc_genome and
#'   superfamily.
#' @param family_name Label of the form "class/superfamily"; only the part
#'   after the first "/" is used to select rows. A label without "/" yields
#'   NA and therefore selects no rows.
#' @return A ggplot object. The title is built from the global `species_name`.
plot_superfamily <- function(df, family_name) {
  superfamily_without_class <- unlist(strsplit(family_name, "/"))[2]
  plot_title <- paste0(species_name, " - ", family_name)
  # x-axis ticks every 5 units, up to the next multiple of 10 above max(Div);
  # taken from `df` so the function does not depend on the global table_long
  div_breaks <- seq(0, 10 * ceiling(max(df$Div) / 10), by = 5)

  g <- ggplot(
    df %>% filter(superfamily == superfamily_without_class),
    aes(x = Div, y = perc_genome, fill = superfamily)
  )
  g + geom_bar(stat = "identity") +
    labs(
      title = plot_title,
      x = "Kimura substitution level (CpG adjusted)",
      y = "Percentage of genome (%)"
    ) +
    scale_x_continuous(breaks = div_breaks) +
    theme_pubr() + scale_fill_npg()
}

# plot facet
# Rebuild the "class/superfamily" label so it can be matched against the
# column names stored in ltr_superfamily. Rows without a superfamily (NA)
# keep the bare class name: pasting would give e.g. "LTR/NA", which never
# matches the original column name and silently drops those rows.
table_long <- mutate(
  table_long,
  te_class = if_else(
    is.na(superfamily),
    general_class,
    paste0(general_class, "/", superfamily)
  )
)
# x-axis ticks every 5 units, up to the next multiple of 10 above max(Div)
div_breaks <- seq(0, 10 * ceiling(max(table_long$Div) / 10), by = 5)

g <- ggplot(
  table_long %>% filter(te_class %in% ltr_superfamily),
  aes(x = Div, y = perc_genome, fill = superfamily)
)
ltr_facet_plot <- g +
  geom_bar(stat = "identity", width = 1) +
  labs(
    title = paste0(species_name, ": LTR"),
    x = "Kimura substitution level (CpG adjusted)",
    y = "Percentage of genome (%)"
  ) +
  # one panel per te_class, laid out side by side; this is what the previous
  # `rows = ~te_class` formula produced, written in the unambiguous form
  facet_grid(. ~ te_class) +
  scale_x_continuous(breaks = div_breaks) +
  theme_pubr() + scale_fill_npg() +
  theme(axis.text.x = element_text(angle = 45, hjust = 1, size = 6))
# print explicitly so the figure is also shown when the script is source()d
print(ltr_facet_plot)

# Save exactly this plot rather than relying on ggplot2's "last plot" state.
output_name <- paste0(species_shortname, "_age_estimation_kimura_LTR.svg")
ggsave(output_name, plot = ltr_facet_plot,
       width = 440, height = 148, units = "mm")

