#!/usr/bin/env Rscript

# Copyright (C) 2019 Tobias Jakobi
#
# This program is free software: you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with this program.  If not, see <http://www.gnu.org/licenses/>.

### imports

require(reshape2)
library(ggplot2)
library(ggrepel)
library(bbplot)
library(viridis)
library(dplyr)
# library(plyr)
library(ggpubr)
library(ggsignif)



### input tables: repeats and DEXSeq exon coordinates per tissue

# Column names assigned to the header-less, tab-separated input tables.
repeat_columns <- c("chr", "start", "stop", "up_dist", "down_dist", "sim")
dexseq_columns <- c("chr", "start", "stop", "name", "score")

# Read a repeat table (*.out) and name its six columns.
#   path: file to read
# Returns a data.frame with the columns listed in `repeat_columns`.
read_repeats <- function(path) {
  repeats <- read.table(path, stringsAsFactors = FALSE, sep = "\t",
                        header = FALSE)
  colnames(repeats) <- repeat_columns
  # The DEXSeq tables get a character `chr`; do the same here so the join
  # keys used further down always have matching types.
  repeats$chr <- as.character(repeats$chr)
  repeats
}

# Read the first five columns of a DEXSeq table and keep only the interval
# coordinates; `name` and `score` are dropped because the tables are only
# used as join partners on chr/start/stop.
#   path: file to read
# Returns a data.frame with the columns chr (character), start, stop.
read_dexseq_coords <- function(path) {
  hits <- read.table(path, stringsAsFactors = FALSE, sep = "\t",
                     header = FALSE)[, 1:5]
  colnames(hits) <- dexseq_columns
  hits$chr <- as.character(hits$chr)
  hits[, !(names(hits) %in% c("score", "name"))]
}

# bone marrow
BM_repeats <- read_repeats("BM.out")
BM_dexseq <- read_dexseq_coords("Bone_intersect.csv")
BM_dexseq_intersect <- read_dexseq_coords("BM_dexseq_intersect.csv")

# liver
LI_repeats <- read_repeats("LI.out")
LI_dexseq <- read_dexseq_coords("Liver_intersect.csv")
LI_dexseq_intersect <- read_dexseq_coords("LI_dexseq_intersect.csv")


head(BM_dexseq)

# Every table is matched on the genomic interval. Naming the keys explicitly
# keeps the joins from silently changing should a table ever gain another
# shared column, and suppresses dplyr's join-key message.
repeat_join_keys <- c("chr", "start", "stop")

# Repeats whose interval is present / absent in the DEXSeq tables.
BM_dexseq_repeats <- inner_join(BM_repeats, BM_dexseq, by = repeat_join_keys)
BM_intersect <- inner_join(BM_repeats, BM_dexseq_intersect,
                           by = repeat_join_keys)
BM_repeats_wo_dexseq <- anti_join(BM_repeats, BM_dexseq, by = repeat_join_keys)

LI_dexseq_repeats <- inner_join(LI_repeats, LI_dexseq, by = repeat_join_keys)
LI_intersect <- inner_join(LI_repeats, LI_dexseq_intersect,
                           by = repeat_join_keys)
LI_repeats_wo_dexseq <- anti_join(LI_repeats, LI_dexseq, by = repeat_join_keys)

# Group labels shown on the x axis.
# NOTE(review): the n= counts are hard-coded, not derived from nrow(); they
# must stay identical to the strings passed to geom_signif() in the plots
# below, which select groups by exact label.
LI_repeats$type <- "Liver SINE B1 repeats, n=531"

LI_intersect$type <- "Liver subset, n=4"
LI_dexseq_repeats$type <- "Liver DEXSEQ subset, n=139"
# LI_repeats_wo_dexseq$type <- "Liver DEXSEQ subset, n=139"

BM_repeats$type <- "Bone marrow SINE B1 repeats, n=598"
BM_intersect$type <- "Bone marrow subset, n=11"
BM_dexseq_repeats$type <- "Bone marrow DEXSEQ subset, n=47"

BM_repeats_wo_dexseq$type <- "BM_repeats_wo_dexseq"

head(BM_intersect)

# Only the DEXSEQ subsets and the full repeat sets are plotted; the
# *_intersect and *_wo_dexseq tables are not part of `main`.
tmp1 <- rbind(LI_dexseq_repeats, BM_repeats)
tmp2 <- rbind(BM_dexseq_repeats, LI_repeats)

# alternative: plot the *_intersect subsets instead
#tmp1 <- rbind(BM_intersect, BM_repeats)
#tmp2 <- rbind(LI_intersect, LI_repeats)

head(tmp1)
head(tmp2)

main <- rbind(tmp1, tmp2)
head(main)

# Despite its name, `mean_dist` holds the SUM of the upstream and downstream
# distances; it is the column every plot maps to the y axis.
main$mean_dist <- main$up_dist + main$down_dist


# All plots of this script go into one multi-page PDF; the device is closed
# by the dev.off() call at the end of the script.
pdf("repeat_analysis_v2.pdf", title = "Repeat and intron analysis | Revision | ADAR-deficiency perturbs global splicing landscape in mouse tissues", width = 12, height = 10)

# Build one violin + box plot of `mean_dist` per `type` for the repeat data,
# with significance brackets between the DEXSEQ subset and the full repeat set
# of each tissue.
#   data:     data.frame with the columns `mean_dist` and `type`
#   subtitle: plot subtitle naming the distance that `mean_dist` holds
# Returns the ggplot object (nothing is drawn until it is printed).
plot_repeat_distance <- function(data, subtitle) {
  # geom_signif() picks the groups by their exact `type` label.
  comparisons <- list(
    c("Bone marrow DEXSEQ subset, n=47", "Bone marrow SINE B1 repeats, n=598"),
    c("Liver DEXSEQ subset, n=139", "Liver SINE B1 repeats, n=531")
  )

  ggplot(data = data, aes(y = mean_dist, x = type, fill = type)) +
    scale_fill_viridis_d(option = "D") +
    scale_y_continuous(trans = "pseudo_log",
                       breaks = c(100, 1000, 5000, 10000, 50000, 100000),
                       labels = scales::comma) +
    geom_violin(alpha = 0.4, position = position_dodge(width = .75),
                size = 1, color = "black") +
    geom_boxplot(notch = FALSE, outlier.size = 1, color = "black",
                 lwd = 1.2, alpha = 0.7) +
    theme_pubr() +
    labs(subtitle = subtitle,
         title = "Distance to pair of SINE-B1 elements with highest similarity") +
    ylab("Distance in bp") +
    xlab("Sample") +
    rremove("legend.title") +
    theme(panel.border = element_rect(colour = "black", fill = NA, size = 2),
          plot.title = element_text(size = 16, face = "bold"),
          plot.subtitle = element_text(size = 15, face = "italic"),
          axis.ticks = element_line(size = 2, color = "black"),
          axis.ticks.length = unit(0.2, "cm"),
          legend.position = "none",
          axis.text.x = element_text(angle = 45, hjust = 1, size = 14)) +
    font("xylab", size = 15) +
    font("xy", size = 15) +
    font("xy.text", size = 15) +
    font("legend.text", size = 15) +
    geom_signif(comparisons = comparisons, map_signif_level = TRUE,
                textsize = 6, size = 1, na.rm = TRUE, step_increase = 0.06)
}

# Plots are printed explicitly so the pages are also written when the script
# is run through source(), which does not auto-print top-level values.

# page 1: `mean_dist` as it was set when `main` was assembled
print(plot_repeat_distance(
  main, "Distance: distance upstream + distance downstream"
))

# page 2: upstream distance only
main$mean_dist <- main$up_dist
print(plot_repeat_distance(main, "Distance: distance upstream"))

# page 3: downstream distance only
main$mean_dist <- main$down_dist
# NOTE(review): this subtitle has no space after the colon, unlike the other
# subtitles; kept verbatim here - confirm and fix if unintended.
print(plot_repeat_distance(main, "Distance:distance downstream"))
    
    
        
    
    
    
### flanking introns per tissue

# Column names assigned to the header-less, tab-separated intron tables.
intron_columns <- c("chr", "start", "stop", "strand",
                    "up_dist", "up_length", "down_dist", "down_length")

LI_introns <- read.table("LI_flanking_introns.bed", stringsAsFactors = FALSE,
                         sep = "\t", header = FALSE)
colnames(LI_introns) <- intron_columns

BM_introns <- read.table("BM_flanking_introns.bed", stringsAsFactors = FALSE,
                         sep = "\t", header = FALSE)
colnames(BM_introns) <- intron_columns

# The DEXSeq tables carry a character `chr`; coerce here as well so the join
# keys always have matching types.
LI_introns$chr <- as.character(LI_introns$chr)
BM_introns$chr <- as.character(BM_introns$chr)

# Match on the genomic interval; explicit keys keep the joins stable and
# suppress dplyr's join-key message.
intron_join_keys <- c("chr", "start", "stop")

LI_intersect_intron <- inner_join(LI_introns, LI_dexseq_intersect,
                                  by = intron_join_keys)
BM_intersect_intron <- inner_join(BM_introns, BM_dexseq_intersect,
                                  by = intron_join_keys)

BM_dexseq_introns <- inner_join(BM_introns, BM_dexseq, by = intron_join_keys)
LI_dexseq_introns <- inner_join(LI_introns, LI_dexseq, by = intron_join_keys)

# Commented-out code below refers to objects that are not defined anywhere
# in this script.
# ips_introns_wo_aug <- anti_join(ips_introns, LI_dexseq_intersect)
# ips_introns_top <- inner_join(ips_introns, ips_aug_top)

# huvec_introns_wo_aug <- anti_join(huvec_introns, huvec_aug)

head(BM_intersect_intron)

# huvec_introns_wo_aug$type <- "huvec_introns_wo_aug"
# ips_introns_wo_aug$type <- "ips_introns_wo_aug"
# ips_introns_top$type <- "top10"

# hearts_introns_wo_aug$type <- "hearts_introns_wo_aug"

# Group labels shown on the x axis; the intron plots select groups by these
# exact strings in geom_signif().
LI_intersect_intron$type <- "Liver subset, n=4"
BM_intersect_intron$type <- "Bone marrow subset, n=11"

# NOTE(review): the repeat section labels bone marrow with n=47 and liver
# with n=139; here the two counts are the other way round. The counts are
# hard-coded, so this may be a copy-paste swap - confirm against
# nrow(BM_dexseq_introns) / nrow(LI_dexseq_introns) before publishing.
BM_dexseq_introns$type <- "Bone marrow DEXSEQ subset, n=139"
LI_dexseq_introns$type <- "Liver DEXSEQ subset, n=47"

BM_introns$type <- "Bone marrow flanking introns, n=1951"
LI_introns$type <- "Liver flanking introns, n=1725"
# hearts_aug_introns$type <-  "hearts_aug_introns"

# LI_repeats_wo_dexseq <- anti_join(LI_repeats, LI_dexseq)

# Only the DEXSEQ subsets and the full intron sets are plotted; the
# *_intersect_intron tables are not part of `main`.
tmp1 <- rbind(LI_dexseq_introns, BM_dexseq_introns)

head(tmp1)

head(BM_introns)
head(LI_introns)
tmp2 <- rbind(BM_introns, LI_introns)

main <- rbind(tmp1, tmp2)

# Despite its name, `mean_dist` holds the SUM of the upstream and downstream
# distances; it is the column every plot maps to the y axis.
main$mean_dist <- main$up_dist + main$down_dist
# main$mean_length <- (main$up_length+main$down_length)/2

print(unique(main$type))

    
# Build one violin + box plot of `mean_dist` per `type` for the intron data,
# with significance brackets between the DEXSEQ subset and the full set of
# flanking introns of each tissue.
#   data:     data.frame with the columns `mean_dist` and `type`
#   subtitle: plot subtitle naming the distance that `mean_dist` holds
# Returns the ggplot object (nothing is drawn until it is printed).
plot_intron_distance <- function(data, subtitle) {
  # geom_signif() picks the groups by their exact `type` label.
  comparisons <- list(
    c("Bone marrow DEXSEQ subset, n=139", "Bone marrow flanking introns, n=1951"),
    c("Liver DEXSEQ subset, n=47", "Liver flanking introns, n=1725")
  )

  ggplot(data = data, aes(y = mean_dist, x = type, fill = type)) +
    scale_fill_viridis_d(option = "D") +
    scale_y_continuous(trans = "pseudo_log",
                       breaks = c(100, 1000, 5000, 10000, 50000, 100000),
                       labels = scales::comma) +
    geom_violin(alpha = 0.4, position = position_dodge(width = .75),
                size = 1, color = "black") +
    geom_boxplot(notch = FALSE, outlier.size = 1, color = "black",
                 lwd = 1.2, alpha = 0.7) +
    theme_pubr() +
    labs(subtitle = subtitle,
         title = "Distance to next Exon (intron length)") +
    ylab("Distance in bp") +
    xlab("Sample") +
    rremove("legend.title") +
    theme(panel.border = element_rect(colour = "black", fill = NA, size = 2),
          plot.title = element_text(size = 16, face = "bold"),
          plot.subtitle = element_text(size = 15, face = "italic"),
          axis.ticks = element_line(size = 2, color = "black"),
          axis.ticks.length = unit(0.2, "cm"),
          legend.position = "none",
          axis.text.x = element_text(angle = 45, hjust = 1, size = 14)) +
    font("xylab", size = 15) +
    font("xy", size = 15) +
    font("xy.text", size = 15) +
    font("legend.text", size = 15) +
    geom_signif(comparisons = comparisons, map_signif_level = TRUE,
                textsize = 6, size = 1, na.rm = TRUE, step_increase = 0.06)
}

# Plots are printed explicitly so the pages are also written when the script
# is run through source(), which does not auto-print top-level values.

# `mean_dist` as it was set when `main` was assembled
print(plot_intron_distance(
  main, "Distance: distance upstream + distance downstream"
))

# upstream distance only
main$mean_dist <- main$up_dist
print(plot_intron_distance(main, "Distance: distance upstream"))

# downstream distance only
main$mean_dist <- main$down_dist
print(plot_intron_distance(main, "Distance: distance downstream"))

#################################################

# Close the PDF device so the file is flushed to disk.
dev.off()
