library(data.table)
library(tidyverse)
library(IRanges)
library(GenomicRanges)

gc()

bpp <- fread("fixed_LTR_retrotransposon.bpp")

colnames(bpp) <- c("pos1", "pos2", "prob", "len", "ID", "type")
ids <- unique(bpp$ID) %>% as.factor()

# Add both partners
unfolded1  <- cbind(bpp$pos1, bpp$pos2, bpp$prob, bpp$ID)
unfolded2  <- cbind(bpp$pos2, bpp$pos1, bpp$prob, bpp$ID)

colnames(unfolded1) <- c("pos","pos2", "prob", "ID")
colnames(unfolded2) <- c("pos","pos2", "prob", "ID")

unfolded <- rbind(unfolded1, unfolded2) %>% as.data.frame()
colnames(unfolded) <- c("pos", "pos2", "prob", "ID")
unfolded$prob <- as.numeric(unfolded$prob)
unfolded$pos <- as.numeric(unfolded$pos)

bpp <- NULL
gc()

## Find 'runs' of sequential likely paired bases -- hairpins

### thresh can be changed
thresh <- 0.9
unfolded$above <- ifelse(unfolded$prob >= thresh, TRUE, FALSE)

#only consider bases above threshold "likely paired"
unfolded$pos_filter <- ifelse(unfolded$above, unfolded$pos, NA)

## ranges

unpair <- as.data.frame(unfolded[,c("ID")])
colnames(unpair) <- "ID"
unpair <- unique(unpair)
unpair$range <- str_split(unpair$ID, pattern = ":", simplify = T)[,4]
unpair$min <- str_split(unpair$range, pattern = "-", simplify = T)[,1] %>% as.numeric()
unpair$max <- str_split(unpair$range, pattern = "-", simplify = T)[,2] %>% as.numeric()
gc()

IR_full <- IRanges(start = 0, end = unpair$max-unpair$min)
GR_full <- GRanges(seqnames = unpair$ID, ranges = IR_full)

IR_unfolded <- IRanges(start = unfolded$pos, end = unfolded$pos, width = 1)
GR_unfolded <- GRanges(seqnames = unfolded$ID, ranges = IR_unfolded)



### gapwidth tells GRanges how much of a gap can be between runs. 

# Start one, then allow gaps. Remove runs over gap limit
## gapwidth 1
gapwidth = 1

runs <- filter(unfolded, unfolded$above)
IR_runs <- IRanges(start = runs$pos, end = runs$pos)
GR_runs <- GRanges(seqnames = runs$ID, ranges = IR_runs)
GR_runs <- reduce(GR_runs, min.gapwidth = gapwidth) 

GR_unpair <- setdiff(GR_full, GR_runs)


## gapwidth 3
gapwidth = 3

GR_runs <- reduce(GR_runs, min.gapwidth = gapwidth) 

GR_runs_df <- as.data.frame(GR_runs)

GR_runs_df <- filter(GR_runs_df, GR_runs_df$width > 20)

overlap_gaps <- findOverlaps(GR_runs, GR_unpair) %>% as.data.frame()
mult_overlaps

GR_runs_df <- as.data.frame(GR_runs)

GR_runs_df <- filter(GR_runs_df, GR_runs_df$width > 20)
IR_runs <- IRanges(start = GR_runs_df$start, end = GR_runs_df$end)
GR_runs <- GRanges(seqnames = GR_runs_df$seqnames, ranges = IR_runs)

overlap_gaps <- findOverlaps(GR_runs, GR_unpair) %>% as.data.frame()
mult_overlaps
overlap_gaps$width <- NA

for (i in 1:nrow(overlap_gaps)) {
  overlap_gaps$width[[i]] <- width(GR_unpair[overlap_gaps$subjectHits[[i]]])
}

hasgaps <- unique(overlap_gaps$queryHits)
tdf <- data.frame()
for (i in hasgaps) {
  tmpdf <- filter(overlap_gaps, overlap_gaps$queryHits == i)
  tmpwid <- sum(tmpdf$width)
  tdf <- rbind(tdf, c(i, tmpwid))
}

overlim <- filter(tdf, tdf[,2] >= 5)
c(1:nrow(GR_runs_df))[!overlim[,1]]

# remove hairpins with gaps
GR_runs_df_new <- filter(GR_runs_df, !(row_number(GR_runs_df) %in% overlim[,1]))


# Get chromosomal positions for bed file

GR_runs_df_new$range <- str_split(GR_runs_df_new$seqnames, pattern = ":", simplify = TRUE)[,4]
GR_runs_df_new$TE_start <- str_split(GR_runs_df_new$range, 
                                 pattern = "-", 
                                 simplify = TRUE)[,1] %>% as.numeric()
GR_runs_df_new$TE_stop <- str_split(GR_runs_df_new$range, 
                                 pattern = "-", 
                                 simplify = TRUE)[,2] %>% as.numeric()

GR_runs_df_new$perc_start <- (GR_runs_df_new$start) / (GR_runs_df_new$TE_stop - GR_runs_df_new$TE_start)

GR_runs_df_new$chr <- str_split(GR_runs_df_new$seqnames, pattern = ":", simplify = TRUE)[,3]
GR_runs_df_new$chr_start <- GR_runs_df_new$TE_start + GR_runs_df_new$start -1 # bed file 0 index
GR_runs_df_new$chr_stop <- GR_runs_df_new$TE_start + GR_runs_df_new$end


bed <- GR_runs_df_new[,c("chr", "chr_start", "chr_stop", "seqnames")]

GR_runs_df_new <- filter(GR_runs_df_new, GR_runs_df_new$width > 20)
bed <- GR_runs_df_new[,c("chr", "chr_start", "chr_stop", "seqnames")]
write_delim(bed, "LP_hairpins_gt20_LTR_retrotransposons.bed", delim = "\t")  


