# All relative paths used below (input BAM files and the *.tss outputs)
# are resolved against this directory.
setwd("~/Desktop/ReturnForRevision/Cap/bam")

# Sorted BAM files of the TL-seq libraries (two "Scer" and two "Spar" files).
TL.files <- c(
  "ScerTSS1-SRR5681110.sorted.bam",
  "ScerTSS2_SRR5681112.sorted.bam",
  "SparTSS1_SRR5681106.sorted.bam",
  "SparTSS2_SRR5681108.sorted.bam"
)

# Sorted BAM files of the CAGE libraries (two "Scer" and two "Spar" files).
CAGE.files <- c(
  "ScerYPD.1.sorted.bam",
  "ScerYPD.2.sorted.bam",
  "SL39.SparN17.sorted.bam",
  "SL40.SparN17.sorted.bam"
)
library(Rsamtools)
library(stringr)
library(data.table)
library(BSgenome.Scerevisiae.UCSC.sacCer3)
library(BSgenome.Sparadoxus.N17.sparN17)
##################################################################################
## Scer TL seq
# NOTE: .getBam.TLseq() and .getBam.CAGEseq() are defined further down in this
# file; those definitions must be evaluated before this section is run.
Genome <- BSgenome.Scerevisiae.UCSC.sacCer3
# Rename the mitochondrial sequence chrM -> chrmt. The loaders drop every read
# whose reference name is not among seqnames(Genome), so this name has to agree
# with the BAM headers (presumably they use "chrmt" -- confirm).
seqnames(Genome) <- sub("chrM", "chrmt", seqnames(Genome))

Scer.TL1 <- .getBam.TLseq("ScerTSS1-SRR5681110.sorted.bam", Genome)
Scer.TL2 <- .getBam.TLseq("ScerTSS2_SRR5681112.sorted.bam", Genome)

# Pool both replicates: add up the tag counts of identical positions. The
# summed column is named directly in `j` instead of being renamed by position.
Scer.TL <- rbind(Scer.TL1, Scer.TL2)[
  , .(tags = as.integer(sum(tags))), by = .(chr, pos, strand)
]

write.table(Scer.TL1, file = "Scer.TL1.tss", sep = "\t",
            row.names = FALSE, quote = FALSE)
write.table(Scer.TL2, file = "Scer.TL2.tss", sep = "\t",
            row.names = FALSE, quote = FALSE)
write.table(Scer.TL, file = "Scer.TL.tss", sep = "\t",
            row.names = FALSE, quote = FALSE)
##################################################################################
## Scer CAGE seq
Scer.CAGE1 <- .getBam.CAGEseq("ScerYPD.1.sorted.bam", Genome)
Scer.CAGE2 <- .getBam.CAGEseq("ScerYPD.2.sorted.bam", Genome)

# Pool both replicates; genomeTSS and capped are part of the grouping key.
Scer.CAGE <- rbind(Scer.CAGE1, Scer.CAGE2)[
  , .(tags = as.integer(sum(tags))),
  by = .(chr, pos, strand, genomeTSS, capped)
]

write.table(Scer.CAGE1, file = "Scer.CAGE1.tss", sep = "\t",
            row.names = FALSE, quote = FALSE)
write.table(Scer.CAGE2, file = "Scer.CAGE2.tss", sep = "\t",
            row.names = FALSE, quote = FALSE)
write.table(Scer.CAGE, file = "Scer.CAGE.tss", sep = "\t",
            row.names = FALSE, quote = FALSE)
##################################################################################
##################################################################################
## Spar TL seq
Genome <- BSgenome.Sparadoxus.N17.sparN17

# Lookup table between roman-numeral chromosome names (chrI .. chrXVI) and the
# plain numbers "1" .. "16". Names are matched exactly, so the result does not
# depend on the order in which the names are processed.
roman.chr <- paste0("chr", as.character(utils::as.roman(1:16)))
arabic.chr <- as.character(1:16)

# Rename the genome sequences chrI..chrXVI -> 1..16 for the TL-seq BAM files
# (presumably those BAM files use numeric reference names -- confirm).
# Any other sequence name is left unchanged.
genome.names <- seqnames(Genome)
to.arabic <- match(genome.names, roman.chr)
genome.names[!is.na(to.arabic)] <- arabic.chr[to.arabic[!is.na(to.arabic)]]
seqnames(Genome) <- genome.names

Spar.TL1 <- .getBam.TLseq("SparTSS1_SRR5681106.sorted.bam", Genome)
Spar.TL2 <- .getBam.TLseq("SparTSS2_SRR5681108.sorted.bam", Genome)

# Pool both replicates. .getBam.TLseq() returns only chr, pos, strand and tags,
# so the grouping key must not contain `capped` (that column exists only in
# the CAGE tables).
Spar.TL <- rbind(Spar.TL1, Spar.TL2)[
  , .(tags = as.integer(sum(tags))), by = .(chr, pos, strand)
]

# Translate the numeric chromosome names back to chrI..chrXVI for the output.
to.roman <- match(Spar.TL$chr, arabic.chr)
Spar.TL[, chr := as.character(ifelse(is.na(to.roman), chr, roman.chr[to.roman]))]
setorder(Spar.TL, chr)
write.table(Spar.TL, file = "Spar.TL.tss", sep = "\t",
            row.names = FALSE, quote = FALSE)
##################################################################################
## Spar CAGE seq
# Start again from the packaged genome object for the CAGE BAM files
# (presumably they use the chrI..chrXVI naming -- confirm).
Genome <- BSgenome.Sparadoxus.N17.sparN17
Spar.CAGE1 <- .getBam.CAGEseq("SL39.SparN17.sorted.bam", Genome)
Spar.CAGE2 <- .getBam.CAGEseq("SL40.SparN17.sorted.bam", Genome)

# Pool both replicates; genomeTSS and capped are part of the grouping key.
Spar.CAGE <- rbind(Spar.CAGE1, Spar.CAGE2)[
  , .(tags = as.integer(sum(tags))),
  by = .(chr, pos, strand, genomeTSS, capped)
]
write.table(Spar.CAGE, file = "Spar.CAGE.tss", sep = "\t",
            row.names = FALSE, quote = FALSE)


####################################################################################################################
####################################################################################################################
# Count read 5' ends per genomic position from a TL-seq BAM file.
#
# Reads are loaded with scanBam(), filtered on mean base quality and mapping
# quality, and the 5' end of every remaining read (alignment start on "+",
# alignment end on "-") is tallied. No G-cap correction is applied.
#
# Args:
#   bam.file: path of the BAM file to read.
#   Genome: genome object queried through seqnames() and seqlengths(); reads on
#     sequences it does not name, or reaching past a sequence end, are dropped.
#   sequencingQualityThreshold: minimum per-read mean base quality (the mean is
#     truncated to an integer before it is compared).
#   mappingQualityThreshold: minimum MAPQ; used as the scanBam() filter and
#     checked again after loading (a missing MAPQ is treated as passing).
#
# Returns:
#   data.table with columns chr, pos, strand and tags (integer read count);
#   zero rows when the BAM file yields no reads.
.getBam.TLseq <- function(bam.file, Genome, sequencingQualityThreshold = 10,
                          mappingQualityThreshold = 20) {
  chunksize <- 1e6

  # Apply the vectorised summary `fun` to `x` slice by slice so that the
  # intermediate list objects never cover more than `chunksize` reads.
  # Only called with length(x) > 0.
  summariseInChunks <- function(x, fun) {
    slice.starts <- seq(1, length(x), by = chunksize)
    pieces <- lapply(slice.starts, function(from) {
      to <- min(from + chunksize - 1, length(x))
      fun(x[from:to])
    })
    unlist(pieces, use.names = FALSE)
  }

  what <- c("rname", "strand", "pos", "seq", "qual", "mapq", "flag", "cigar")
  param <- ScanBamParam(
    what = what,
    flag = scanBamFlag(isUnmappedQuery = FALSE,
                       isNotPassingQualityControls = FALSE),
    mapqFilter = mappingQualityThreshold
  )
  message("\nReading in file: ", bam.file, "...")
  bam <- scanBam(bam.file, param = param)[[1]]

  # Nothing passed the scanBam() filters: return an empty, correctly shaped
  # table instead of failing on zero-length indexing further down.
  if (length(bam$pos) == 0L) {
    return(data.table(chr = character(0), pos = integer(0),
                      strand = character(0), tags = integer(0)))
  }

  message("\t-> Filtering out low quality reads...")
  # Mean base quality per read, truncated to integer.
  qa.avg <- summariseInChunks(bam$qual, function(q) {
    as.integer(mean(as(q, "IntegerList")))
  })
  # Alignment width on the reference: add up only the CIGAR operations that
  # consume reference bases (M, D, N, =, X). Soft/hard clips, insertions and
  # padding must not be counted, otherwise the alignment end -- and with it the
  # minus-strand 5' position -- is shifted.
  mapped.length <- summariseInChunks(bam$cigar, function(cg) {
    as.integer(sum(as(str_extract_all(cg, "[0-9]+(?=[MDN=X])"), "IntegerList")))
  })

  readsGR <- GRanges(seqnames = as.vector(bam$rname),
                     IRanges(start = bam$pos, width = mapped.length),
                     strand = bam$strand, qual = qa.avg, mapq = bam$mapq,
                     seq = bam$seq, read.length = width(bam$seq),
                     flag = bam$flag)

  # BAM reference names may not match the genome object (e.g. the Spar files),
  # so discard reads on unknown sequences or running past a sequence end.
  readsGR <- readsGR[as.character(seqnames(readsGR)) %in% seqnames(Genome)]
  readsGR <- readsGR[!(end(readsGR) >
                         seqlengths(Genome)[as.character(seqnames(readsGR))])]

  # A missing MAPQ is treated as passing the mapping-quality filter.
  mapq <- elementMetadata(readsGR)$mapq
  mapq[is.na(mapq)] <- Inf
  elementMetadata(readsGR)$mapq <- mapq

  quality.ok <- elementMetadata(readsGR)$qual >= sequencingQualityThreshold &
    mapq >= mappingQualityThreshold
  read.strand <- as.character(strand(readsGR))
  readsGR.p <- readsGR[read.strand == "+" & quality.ok]
  readsGR.m <- readsGR[read.strand == "-" & quality.ok]

  # One row per read at its 5' end (no G-cap correction), plus strand first.
  # The constant strand column is built with rep() so that a strand without
  # any read contributes zero rows.
  TSS <- rbind(
    data.table(chr = as.character(seqnames(readsGR.p)),
               pos = start(readsGR.p),
               strand = rep("+", length(readsGR.p))),
    data.table(chr = as.character(seqnames(readsGR.m)),
               pos = end(readsGR.m),
               strand = rep("-", length(readsGR.m)))
  )

  # Number of reads per (chr, pos, strand).
  TSS[, .(tags = .N), by = .(chr, pos, strand)]
}



####################################################################################################################
# Count read 5' ends per genomic position from a CAGE BAM file, correcting for
# a leading G that is not present in the genome.
#
# Reads are loaded with scanBam() and filtered on mean base quality and
# mapping quality. A read whose first sequenced base is G (seen as a trailing C
# in the stored sequence of a minus-strand read) while the genome base at its
# 5' end is not G has its 5' end moved one base downstream and is reported
# with capped = 1 (presumably the extra G stems from the cap -- confirm).
# Every other read is reported at its 5' end with capped = 0.
#
# Args:
#   bam.file: path of the BAM file to read.
#   Genome: genome object queried through seqnames(), seqlengths() and
#     getSeq(); reads on sequences it does not name, or reaching past a
#     sequence end, are dropped.
#   sequencingQualityThreshold: minimum per-read mean base quality (the mean is
#     truncated to an integer before it is compared).
#   mappingQualityThreshold: minimum MAPQ; used as the scanBam() filter and
#     checked again after loading (a missing MAPQ is treated as passing).
#
# Returns:
#   data.table with columns chr, pos, strand, genomeTSS (genome base at the
#   reported position, read on the strand of the read), capped (1/0) and tags
#   (integer read count); zero rows when the BAM file yields no reads.
.getBam.CAGEseq <- function(bam.file, Genome, sequencingQualityThreshold = 10,
                            mappingQualityThreshold = 20) {
  chunksize <- 1e6

  # Apply the vectorised summary `fun` to `x` slice by slice so that the
  # intermediate list objects never cover more than `chunksize` reads.
  # Only called with length(x) > 0.
  summariseInChunks <- function(x, fun) {
    slice.starts <- seq(1, length(x), by = chunksize)
    pieces <- lapply(slice.starts, function(from) {
      to <- min(from + chunksize - 1, length(x))
      fun(x[from:to])
    })
    unlist(pieces, use.names = FALSE)
  }

  # Genome base at the strand-aware 5' end of each range in `gr`.
  genomeBaseAt5p <- function(gr) {
    if (length(gr) == 0L) {
      return(character(0))
    }
    unname(getSeq(Genome, resize(gr, width = 1, fix = "start"),
                  as.character = TRUE))
  }

  # Per-read TSS table for the reads of one strand ("+" or "-").
  callStrand <- function(reads, strand.symbol) {
    on.plus <- identical(strand.symbol, "+")

    # One row per range: 5' position, genome base there, and the capped flag.
    asTable <- function(gr, capped) {
      data.table(chr = as.character(seqnames(gr)),
                 pos = if (on.plus) start(gr) else end(gr),
                 strand = rep(strand.symbol, length(gr)),
                 genomeTSS = genomeBaseAt5p(gr),
                 capped = rep(capped, length(gr)))
    }
    if (length(reads) == 0L) {
      return(asTable(reads, 0))
    }

    # Does the read start with G? Minus-strand reads are stored reverse
    # complemented, so their first base shows up as a C at the last position.
    read.seq <- elementMetadata(reads)$seq
    if (on.plus) {
      lead.G <- substr(read.seq, start = 1, stop = 1) == "G"
    } else {
      read.len <- elementMetadata(reads)$read.length
      lead.G <- substr(read.seq, start = read.len, stop = read.len) == "C"
    }
    # Logical masks instead of negative indices: x[-which(...)] returns an
    # empty object when which() finds nothing, which would drop every read.
    lead.G <- !is.na(lead.G) & lead.G
    G.reads <- reads[lead.G]

    # Leading G that the genome does not have at the 5' position.
    untemplated <- genomeBaseAt5p(G.reads) != "G"
    untemplated <- !is.na(untemplated) & untemplated

    # Move the 5' end of those reads one base downstream.
    shifted <- G.reads[untemplated]
    if (length(shifted) > 0L) {
      if (on.plus) {
        start(shifted) <- start(shifted) + 1L
      } else {
        end(shifted) <- end(shifted) - 1L
      }
    }

    # Reads with a templated G, followed by reads not starting with G.
    unshifted <- c(G.reads[!untemplated], reads[!lead.G])

    rbind(asTable(shifted, 1), asTable(unshifted, 0))
  }

  what <- c("rname", "strand", "pos", "seq", "qual", "mapq", "flag", "cigar")
  param <- ScanBamParam(
    what = what,
    flag = scanBamFlag(isUnmappedQuery = FALSE,
                       isNotPassingQualityControls = FALSE),
    mapqFilter = mappingQualityThreshold
  )
  message("\nReading in file: ", bam.file, "...")
  bam <- scanBam(bam.file, param = param)[[1]]

  # Nothing passed the scanBam() filters: return an empty, correctly shaped
  # table instead of failing on zero-length indexing further down.
  if (length(bam$pos) == 0L) {
    return(data.table(chr = character(0), pos = integer(0),
                      strand = character(0), genomeTSS = character(0),
                      capped = numeric(0), tags = integer(0)))
  }

  message("\t-> Filtering out low quality reads...")
  # Mean base quality per read, truncated to integer.
  qa.avg <- summariseInChunks(bam$qual, function(q) {
    as.integer(mean(as(q, "IntegerList")))
  })
  # Alignment width on the reference: add up only the CIGAR operations that
  # consume reference bases (M, D, N, =, X). Soft/hard clips, insertions and
  # padding must not be counted, otherwise the alignment end -- and with it the
  # minus-strand 5' position -- is shifted.
  mapped.length <- summariseInChunks(bam$cigar, function(cg) {
    as.integer(sum(as(str_extract_all(cg, "[0-9]+(?=[MDN=X])"), "IntegerList")))
  })

  readsGR <- GRanges(seqnames = as.vector(bam$rname),
                     IRanges(start = bam$pos, width = mapped.length),
                     strand = bam$strand, qual = qa.avg, mapq = bam$mapq,
                     seq = bam$seq, read.length = width(bam$seq),
                     flag = bam$flag)

  # BAM reference names may not match the genome object (e.g. the Spar files),
  # so discard reads on unknown sequences or running past a sequence end.
  readsGR <- readsGR[as.character(seqnames(readsGR)) %in% seqnames(Genome)]
  readsGR <- readsGR[!(end(readsGR) >
                         seqlengths(Genome)[as.character(seqnames(readsGR))])]

  # A missing MAPQ is treated as passing the mapping-quality filter.
  mapq <- elementMetadata(readsGR)$mapq
  mapq[is.na(mapq)] <- Inf
  elementMetadata(readsGR)$mapq <- mapq

  quality.ok <- elementMetadata(readsGR)$qual >= sequencingQualityThreshold &
    mapq >= mappingQualityThreshold
  read.strand <- as.character(strand(readsGR))
  readsGR.p <- readsGR[read.strand == "+" & quality.ok]
  readsGR.m <- readsGR[read.strand == "-" & quality.ok]

  # Plus-strand rows first, then minus-strand rows; count the reads per key.
  tss <- rbind(callStrand(readsGR.p, "+"), callStrand(readsGR.m, "-"))
  tss[, .(tags = .N), by = .(chr, pos, strand, genomeTSS, capped)]
}



