## ---- setup, include=FALSE----------------------------------------------------
# Global knitr chunk options for this document: collapse each chunk's source
# and output into a single block, and prefix output lines with "#>".
knitr::opts_chunk$set(collapse = TRUE, comment = "#>")

## ----load-TOP-package, eval=FALSE, message=FALSE, warning=FALSE---------------
#  library(TOP)

## ----get-chrom-sizes, eval=FALSE----------------------------------------------
#  index_fa('hg38.fa', chromsize_file='hg38.chrom.sizes')

## ----get-motif-matches, eval=FALSE--------------------------------------------
#  fimo_motif_matches(motif_file='MA0139.1.meme',
#                     sequence_file='hg38.fa',
#                     thresh_pValue=1e-5,
#                     outname='MA0139.1_1e-5.fimo.txt')

## ----get-candidate-sites, eval=FALSE------------------------------------------
#  # fimo_file: FIMO result file.
#  # thresh_pValue: FIMO p-value threshold.
#  # blacklist_file: file with ENCODE blacklist regions.
#  sites <- process_candidate_sites(fimo_file='MA0139.1_1e-5.fimo.txt',
#                                   thresh_pValue=1e-5,
#                                   blacklist_file='blacklist.hg38.bed.gz')

## ----cmd-sort-index-stats-ATAC-bam, eval=FALSE, message=FALSE, warning=FALSE----
#  # This BAM file has already been sorted, so we skip the sorting step.
#  sort_index_idxstats_bam('K562.ATAC.bam', sort=FALSE, index=TRUE, idxstats=TRUE)

## ----count_genome_coverage, eval=FALSE----------------------------------------
#  # bam_file: sorted BAM file.
#  # chrom_size_file: file of genome sizes by chromosomes.
#  # data_type: DNase or ATAC.
#  # outdir: directory for saving the BigWig files of genome counts, same as outdir in get_sites_counts().
#  # outname: prefix for the BigWig files, same as genomecount_name in get_sites_counts().
#  count_genome_cuts(bam_file='K562.ATAC.bam',
#                    chrom_size_file='hg38.chrom.sizes',
#                    data_type='ATAC',
#                    outdir='processed_data',
#                    outname='K562.ATAC')

## ----get-motif-counts, eval=FALSE---------------------------------------------
#  # genomecount_dir: directory for genome counts, same as outdir in count_genome_cuts().
#  # genomecount_name: file prefix for genome counts, same as outname in count_genome_cuts().
#  count_matrix <- get_sites_counts(sites,
#                                   genomecount_dir='processed_data',
#                                   genomecount_name='K562.ATAC')
#  saveRDS(count_matrix, "processed_data/CTCF.K562.ATAC.counts.mat.rds")

## ----normalize-bin-counts, eval=FALSE-----------------------------------------
#  # count_matrix: DNase (or ATAC) read counts matrix.
#  # idxstats_file: the 'idxstats.txt' file generated by sort_index_idxstats_bam().
#  # ref_size: Reference library size (default: 50 million for ATAC-seq, 100 million for DNase-seq).
#  # transform: Transformation for DNase (or ATAC) counts (default: 'asinh').
#  bins <- normalize_bin_transform_counts(count_matrix,
#                                         idxstats_file='K562.ATAC.bam.idxstats.txt',
#                                         ref_size=5e7,
#                                         transform='asinh')

## ----combine-sites-bins, eval=FALSE-------------------------------------------
#  combined_data <- data.frame(sites, bins)
#  colnames(combined_data) <- c('chr','start','end','name','pwm.score','strand','p.value', paste0('bin', 1:ncol(bins)))
#  
#  saveRDS(combined_data, 'processed_data/CTCF_MA0139.1_1e-5.K562.ATAC.M5.combined.data.rds')

## ----load-combined-data, eval=TRUE, include=FALSE-----------------------------
# Read the example dataset that ships with the TOP package.
combined_data <- readRDS(
  system.file(
    "extdata/example_data",
    "CTCF.K562.ATAC.chip.example.data.rds",
    package = "TOP"
  )
)

# Keep only the seven site columns followed by the bin1..bin5 columns.
cols <- c(
  'chr', 'start', 'end', 'name', 'pwm.score', 'strand', 'p.value',
  paste0('bin', seq_len(5))
)
combined_data <- combined_data[, cols]

## ----show-example-data--------------------------------------------------------
# Preview the first three rows of the subsetted data.
head(combined_data, n = 3)

## ----index-stats-chip-bam-files, eval=FALSE-----------------------------------
#  # The BAM files have already been sorted, so we skip the sorting step.
#  sort_index_idxstats_bam('CTCF.K562.ChIPseq.rep1.bam', sort=FALSE, index=TRUE, idxstats=TRUE)
#  sort_index_idxstats_bam('CTCF.K562.ChIPseq.rep2.bam', sort=FALSE, index=TRUE, idxstats=TRUE)

## ----count-normalize-chip, eval=FALSE-----------------------------------------
#  sites_chip <- count_normalize_chip(sites,
#                                     chip_bam_files=c('CTCF.K562.ChIPseq.rep1.bam',
#                                                      'CTCF.K562.ChIPseq.rep2.bam'),
#                                     chrom_size_file='hg38.chrom.sizes')

## ----combine-sites-bins-chip, eval=FALSE--------------------------------------
#  combined_data <- data.frame(sites, bins, chip = sites_chip$chip)
#  colnames(combined_data) <- c('chr','start','end','name','pwm.score','strand','p.value',
#                                  paste0('bin', 1:ncol(bins)), 'chip')
#  saveRDS(combined_data, 'processed_data/CTCF_MA0139.1_1e-5.K562.ATAC.M5.ChIP.combined.data.rds')

## ----load-combined-data-with-chip, eval=TRUE, include=FALSE-------------------
# Read the packaged example dataset again, this time without subsetting
# any columns.
combined_data <- readRDS(
  system.file(
    "extdata/example_data",
    "CTCF.K562.ATAC.chip.example.data.rds",
    package = "TOP"
  )
)

## -----------------------------------------------------------------------------
# Preview the first three rows.
head(combined_data, n = 3)

## ----add-chip-peak-labels, eval=FALSE-----------------------------------------
#  sites_chip_labels <- add_chip_peak_labels_to_sites(sites,
#                                                     chip_peak_file='CTCF.K562.ChIPseq.peaks.bed.gz')
#  
#  combined_data <- data.frame(sites, bins, chip_label = sites_chip_labels$chip_label)
#  colnames(combined_data) <- c('chr','start','end','name','pwm.score','strand','p.value',
#                               paste0('bin', 1:ncol(bins)), 'chip_label')
#  saveRDS(combined_data, 'processed_data/CTCF_MA0139.1_1e-5.K562.ATAC.M5.ChIPlabels.combined.data.rds')

