rm(list = ls())
graphics.off()

library(Rsamtools)
library(GenomicRanges)
library(ggplot2)
library(gplots)
library(viridis) #The viridis color palettes
library(RColorBrewer)


file_name = c( "nascent",
               "mature", 
               "bulk")

dir_path <- "~/your-bam-directory-path/"

#data frame with gene coordinates 
gene_table <- read.table("~/sacCer3_ucsc_geneTable.bed", stringsAsFactors = F)

## find genes with greater than 700pb in length
gene_table$length <- gene_table$end - gene_table$start

gene_table$new_start <- NA
gene_table$new_end  <- NA

idx <- which(gene_table$length < 700)
gene_table$new_start[idx] <-  gene_table$start[idx] 
gene_table$new_end[idx] <-  gene_table$end[idx]

ind <-  which(gene_table$length > 700) 

#generate new coordinates for the first 700 bps of the gene body 
for(i in 1:length(ind)){
  if(gene_table$strand[ind[i]] == "-"){
    gene_table$new_start[ind[i]] <-  gene_table$end[ind[i]] - 700
    gene_table$new_end[ind[i]] <-  gene_table$end[ind[i]]
  } else {
    gene_table$new_start[ind[i]] <- gene_table$start[ind[i]] 
    gene_table$new_end[ind[i]] <- gene_table$start[ind[i]] + 700
  }
}

# find the start site of the gene 
for(i in 1:nrow(gene_table)){
  if(gene_table$strand[i] == "-"){
    gene_table$pos[i] <- gene_table$end[i]
  } else {
    gene_table$pos[i] <- gene_table$start[i] 
  }
}


#define the nucleosomal read boudaries 
length_min = 140
length_max = 180

# create a list to store matrices 
data = vector("list")

#setting up parameters
options(scipen=999) 
chr_coordinates.df = data.frame(chr=(c("chrI", "chrII", "chrIII", "chrIV", "chrV", "chrVI", "chrVII", "chrVIII", "chrIX", "chrX", "chrXI", "chrXII",
                                       "chrXIII", "chrXIV", "chrXV","chrXVI")),
                                end=as.numeric(c("230218","813184", "316620", "1531933", "576874", "270161", "1090940", "562643", "439888", "745751", "666816", "1078177", "924431", "784333", "1091291", "948066")), stringsAsFactors=FALSE)

# N = Nascent
# M = Mature
# B = Bulk 

genes.df <- data.frame(gene = character(),
                      N_M_genes = numeric(),
                      N_B_genes = numeric(),
                      M_B_genes = numeric(),
                      
                      chr = character(),
                      N_depth =numeric(),
                      M_depth =numeric(),
                      B_depth =numeric(),
                      
                      acf_N = numeric(),
                      acf_M = numeric(),
                      acf_B = numeric(),
                      
                      strand= character(),
                      startn = numeric(),
                      endn = numeric(),
                      
                      start = numeric(),
                      end = numeric(),
                      
                      mid_pt_dis_N = numeric(),
                      mid_pt_dis_M = numeric(),
                      mid_pt_dis_B = numeric(),
                      
                      length = numeric(),
                      
                      stringsAsFactors=FALSE)


#read in each chromosome information
for(r in 1:nrow(chr_coordinates.df)){
  chr = chr_coordinates.df[r,"chr"]        # chromosome name
  new_start= 1                             # chromosome start 
  new_end= chr_coordinates.df[r, "end"]    # chromosome end
  
  # create a GenomicRanges object with the information above to retrieve the read information from the bam file
  chr.gr = GRanges(seqnames= chr, ranges = IRanges(start =new_start , end = new_end ))
  
  p = ScanBamParam(what = c("rname", "strand", "pos", "isize"),which = chr.gr)
  
  for (f in 1:3){
    
    #data files
    file_name.bam = paste(dir_path,file_name[f],".bam", sep='')
    file_name.bam.bai = paste(dir_path,file_name[f],".bam.bai",sep='')
    
    A_reads.l = scanBam(file = file_name.bam, 
                        index = file_name.bam.bai,
                        param = p)
    
    #create a new GenomicRanges object for the reads from this list:
    A_reads.gr = GRanges(seqnames = A_reads.l[[1]]$rname,
                         ranges = IRanges(start = A_reads.l[[1]]$pos, 
                                          width = A_reads.l[[1]]$isize))
    
    #find bp overlap with midpoints of reads. 
    mat.gr = GRanges(seqnames = chr, ranges = IRanges(start= seq(new_start, new_end, by =1), width=1 ))  
    
    #taking only the midpoint
    subset_data.gr = A_reads.gr[which(width(A_reads.gr) >= length_min & width(A_reads.gr) <= length_max)]
    
    #finding the mipoints of those reads 
    midpoints.gr =IRanges(start=mid(ranges(subset_data.gr)), width=1) 
    
    #convert GenomicRanges to a data frame 
    data[[f]] = as.data.frame(subset_data.gr)
    midpoints = as.data.frame(midpoints.gr)
    data[[f]]$mid=midpoints$start
    
  }
  
  cat(paste("saving midpoints on chromosome", r,"\n"))
  
  dm <- data[[3]]$mid
  
  #calculate turnpoints (nucleosome PEAKS)
  myDensity = density(dm, bw=30, kernel="gaussian", n=(new_end/5))
  tp = turnpoints(myDensity$y)
  
  #finding all of the nucleosome peaks within the chromosome 
  
  if (tp$firstispeak){
    d_peaks=myDensity$x[tp$tppos[seq(1,max(tp$tppos),by=2)]]
    d_peak_scores=myDensity$y[tp$tppos[seq(1,max(tp$tppos),by=2)]]
    print("first is a peak ")
  } else {
    d_peaks=myDensity$x[tp$tppos[seq(0,max(tp$tppos),by=2)]]
    d_peak_scores=myDensity$y[tp$tppos[seq(0,max(tp$tppos),by=2)]]
    print("first is NOT a peak ")
  }
  
  #keeping strong peaks 
  d_peak_scores[which(is.na(d_peak_scores))]=0
  d_peaks=d_peaks[which(d_peak_scores>1e-8)]
  
  tp = turnpoints(myDensity$y)
  
  # finding genes within the chromosome 
  indice <- which(gene_table$chr == chr)
  feature <- gene_table[indice, ]
  
  for(t in 1:nrow(feature)){
    
    positions = vector("list")
    
    rw <- which(gene_table$gene == feature$gene[t])
    
    cat(paste("saving geneselation of gene", rw,"\n"))
    
    
    nstart= feature$new_start[t]
    nend= feature$new_end[t]
    
    data_nascent <- data[[1]]$mid[which(data[[1]]$mid > nstart & data[[1]]$mid < nend)]
    data_mature <- data[[2]]$mid[which(data[[2]]$mid > nstart & data[[2]]$mid < nend)]
    data_bulk  <- data[[3]]$mid[which(data[[3]]$mid > nstart & data[[3]]$mid < nend)]
    
    #setting up matrices to store midpoint information for each nucleosome 
    a<-rep(0,length(data_nascent))
    b<-rep(0,length(data_mature))
    c<-rep(0,length(data_bulk))
    
    tp_ss <- d_peaks[which(d_peaks > nstart & d_peaks < nend)]
      
    for(x in 1:length(data_nascent)){
      a[x]<-min(abs(data_nascent[x]-tp_ss))
    }
    for(x in 1:length(data_mature)){
      b[x]<-min(abs(data_mature[x]-tp_ss))
    }
    for(x in 1:length(data_bulk)){
      c[x]<-min(abs(data_bulk[x]-tp_ss))
    }
    
    if(length(data_nascent) == 0){
      next
    }
    if(length(data_mature) == 0){
      next
    }
    if(length(data_bulk) == 0){
      next
    }
   
    ### determining autogeneselation or level of organization for an individual gene. 
    acfN <-   acf(predict(smooth.spline(density(data_nascent, bw=band)),nstart:nend)$y, lag.max=max(data_nascent-min(data_nascent)), plot=F)
    acfM <-   acf(predict(smooth.spline(density(data_mature, bw=band)),nstart:nend)$y, lag.max=max(data_mature-min(data_mature)), plot=F)
    acfB <-   acf(predict(smooth.spline(density(data_bulk, bw=band)),nstart:nend)$y  , lag.max=max(data_bulk-min(data_bulk)), plot=F)
   

    #### density of full length gene for geneselation calculation.
    
    l_gene <- feature$end[t] - feature$start[t]
    
    pos1 <- data[[1]]$mid[which(data[[1]]$mid > feature$start[t] & data[[1]]$mid < feature$end[t])]
    pos2 <- data[[2]]$mid[which(data[[2]]$mid > feature$start[t] & data[[2]]$mid < feature$end[t])]
    pos3 <- data[[3]]$mid[which(data[[3]]$mid > feature$start[t] & data[[3]]$mid < feature$end[t])]
    
    positions[[1]] <-   pos1 - feature[t, "pos"]
    positions[[2]] <-   pos2 - feature[t, "pos"]
    positions[[3]] <-   pos3 - feature[t, "pos"]
    
    if(feature$strand[t] == "-"){
      positions[[1]] <- positions[[1]] * (-1)  
      positions[[2]] <- positions[[2]] * (-1) 
      positions[[3]] <- positions[[3]] * (-1) 
          } else {
      positions[[1]] <- positions[[1]]   
      positions[[2]] <- positions[[2]] 
      positions[[3]] <- positions[[3]] 
          }
    
    d1 <- density(positions[[1]],bw=band, from=-100, to= (l_gene+100))
    d2 <- density(positions[[2]],bw=band, from=-100, to= (l_gene+100))
    d3 <- density(positions[[3]],bw=band, from=-100, to= (l_gene+100))
    
    genes.df[rw,1] <- feature[t,"gene"] #gene_table$name[rw]
    genes.df[rw,2] <- round(cor(d1$y, d2$y),3) 
    genes.df[rw,3] <- round(cor(d1$y, d3$y),3)
    genes.df[rw,4] <- round(cor(d2$y, d3$y),3)
    
    genes.df[rw,5] <- chr
    genes.df[rw,6]<- length(data_nascent)
    genes.df[rw,7]<- length(data_mature)
    genes.df[rw,8]<- length(data_bulk)

    
    if(length(acfN$acf) < 172){
      acN<- round(max(acfN$acf[(length(acfN$acf)/2):length(acfN$acf)]), 3)
      acM<- round(max(acfM$acf[(length(acfM$acf)/2):length(acfM$acf)]), 3)
      acB<- round(max(acfB$acf[(length(acfB$acf)/2):length(acfB$acf)]), 3)
     
      genes.df[rw,9]<- round(acN, 3)
      genes.df[rw,10]<- round(acM, 3) 
      genes.df[rw,11]<- round(acB, 3) 
       } else {
      genes.df[rw,9]<- round(acfN$acf[172], 3)
      genes.df[rw,10]<- round(acfM$acf[172], 3) 
      genes.df[rw,11]<- round(acfB$acf[172], 3)
       }
    
    genes.df[rw,12]<- feature[t,"strand"]
    genes.df[rw,13]<- nstart
    genes.df[rw,14]<- nend
    
    genes.df[rw,15]<- gene_table$start[rw]
    genes.df[rw,16]<- gene_table$end[rw]
    
    genes.df[rw,17]<- mean(a)
    genes.df[rw,18]<- mean(b)
    genes.df[rw,19]<- mean(c)
    
    genes.df[rw,23]<- feature[t,"end"] - feature[t,"start"]
  }
}


