# Summarised are scripts used for the analysis of TSS regions. # The analysis was split into 2 parts # 1, bulk analysis of coverage at TSS (consider all TSS) # 2, focused analysis of coverage at TSS associated with blood cells or cancer related genes ### 1 - BULK TSS analysis ### ############################################################################################################# # get coverage relative to TSS # TSSs are extracted from GTF file # +ve and -ve stranded genes coverages calculated separately and then combined Rscript ############################################################################################################# ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ library( tidyverse) library( GenomicRanges) ############################### START OF FUNCTIONS ########################################## get_coverage <- function( ol_reads, resized_width, tss_gr){ read_coverage <- coverage(ol_reads) cov_container <- matrix(ncol = resized_width) for( each_chr in unique(names(read_coverage)) ){ #cat( each_chr, '\n') chr_resized_peaks <- tss_gr[ seqnames(tss_gr) == each_chr] if( length(chr_resized_peaks) > 0 ){ chr_views <- as.matrix(Views(read_coverage[[each_chr]],as(ranges(chr_resized_peaks[ seqnames(chr_resized_peaks) == each_chr]),'IRanges') ) ) cov_container <- rbind(cov_container, chr_views) } } cov_container <- na.omit(cov_container) return(cov_container) } ############################### END OF FUNCTIONS ############################################ path <- '/path/to/data' ext_len <- 1000 gtf_file <- '/path/to/Homo_sapiens.GRCh37.75.gtf' gtf <- rtracklayer::import.gff( gtf_file, format='gtf') tss <-gtf[gtf$source == 'protein_coding' & gtf$type == 'exon' & gtf$exon_number == 1] #elementMetadata(tss) <- NULL pos_tss <- tss[strand(tss) == '+'] end(pos_tss) <- start(pos_tss) + ext_len start(pos_tss) <- start(pos_tss) - ext_len neg_tss <- tss[strand(tss) == '-'] start(neg_tss) <- end(neg_tss) - ext_len end(neg_tss) <- end(neg_tss) + ext_len # load sample sheet s_sheet <- read.csv( file='/path/to/samplesheet_v1.0.csv', stringsAsFactors = F) load( '/path/to/normalised_data.RData') in_files <- c(blood='unique.TSS.int.blood_onlx.normal_DHS_hg19.bed', Cancer='unique.TSS.plus.minus.1kbp.int.TCGA.ATAC.PanCancer.top60kpeaks.bed') for(i in 1:length(in_files)){ each_set <- in_files[i] cat(each_set, '\n') p_data_container <- data.frame( position=numeric(), coverage=numeric(), SampleName=character() ) gene_set_file <- str_c(path, each_set, sep='/') genes_tab <- read_delim( gene_set_file, col_names = FALSE, delim = '\t') %>% dplyr::rename(chr=X1, start=X2, end=X3, gene_name=X4) genes_pos_GR <- pos_tss[elementMetadata(pos_tss)$gene_name %in% genes_tab$gene_name ] elementMetadata(genes_pos_GR) <- NULL genes_neg_GR <- neg_tss[elementMetadata(neg_tss)$gene_name %in% genes_tab$gene_name ] elementMetadata(genes_neg_GR) <- NULL set_type <- names(each_set) cat(set_type, '\n') for( each_sample in s_sheet$SampleName){ cat( each_sample, '\n') resized_width <- width(pos_tss)[1] bam <- get(each_sample) cat( 'working on positive TSS ...', '\n') # get positiveoverlapping reads pos_ol_reads <- subsetByOverlaps( bam, genes_pos_GR) pos_cov <- get_coverage( ol_reads = pos_ol_reads, resized_width = resized_width, tss_gr = genes_pos_GR) cat( 'working on negative TSS ...', '\n') # get negative ovelapping reads neg_ol_reads <- subsetByOverlaps( bam, genes_neg_GR) neg_cov <- get_coverage( ol_reads = neg_ol_reads, resized_width = resized_width, tss_gr = genes_neg_GR) neg_cov <- neg_cov[,ncol(neg_cov):1] # need to reverse for negative strand coverage pos_plus_neg_cov <- rbind(pos_cov, neg_cov) tiny_plot_data <- data.frame( position=c(-ext_len:ext_len), coverage=colSums(pos_plus_neg_cov), SampleName=each_sample, stringsAsFactors = F) p_data_container <- bind_rows(p_data_container, tiny_plot_data) } out_file <- str_c( path, str_c(set_type, 'normalised_coverage_data_for_plots_v1.0.csv', sep='_'), sep='/') write.csv( x=p_data_container, file=out_file, row.names = F) } ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ### 2 - TSS_eryt_lymph_cancer_genes_normalised_coverage ### ######################################################################################################################## # for each subset of genes extract TSSs from GTF file and get flanking regions Rscript ######################################################################################################################## ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ library( tidyverse) library( GenomicRanges) path <- '../../../data' ext_len <- 1000 gtf_file <- '/path/to/Homo_sapiens.GRCh37.75.gtf' gtf <- rtracklayer::import.gff( gtf_file, format='gtf') tss <-gtf[gtf$source == 'protein_coding' & gtf$type == 'exon' & gtf$exon_number == 1] #elementMetadata(tss) <- NULL pos_tss <- tss[strand(tss) == '+'] end(pos_tss) <- start(pos_tss) + ext_len start(pos_tss) <- start(pos_tss) - ext_len neg_tss <- tss[strand(tss) == '-'] start(neg_tss) <- end(neg_tss) - ext_len end(neg_tss) <- end(neg_tss) + ext_len out_path <- '/path/to/coverage_merged_down_sampled/TSS' # load sample sheet if( !dir.exists(paths = out_path)){ dir.create(path=out_path, recursive = T) } bed_path <- str_c( path, 'data_xxx_xxx', sep='/') in_files <- list.files( bed_path, full.names = T) %>% .[str_detect(., '.bed$')] %>% .[str_detect(., 'eythrocyte|lymphocytes|top5kpeaks')] for(i in 1:length(in_files)){ each_set <- basename(in_files[i]) gene_set_file <- in_files[i] cat(each_set, '\n') #gene_set_file <- str_c(path, each_set, sep='/') genes_tab <- read_delim( gene_set_file, col_names = FALSE, delim = '\t') %>% dplyr::rename(chr=X1, start=X2, end=X3, gene_name=X4) tss <-gtf[gtf$source == 'protein_coding' & gtf$type == 'exon' & gtf$exon_number == 1] #tss <- tss[tss$gene_name %in% genes_tab$gene_name] pos_tss <- tss[strand(tss) == '+'] end(pos_tss) <- start(pos_tss) + ext_len start(pos_tss) <- start(pos_tss) - ext_len neg_tss <- tss[strand(tss) == '-'] start(neg_tss) <- end(neg_tss) - ext_len end(neg_tss) <- end(neg_tss) + ext_len genes_pos_GR <- pos_tss[elementMetadata(pos_tss)$gene_name %in% genes_tab$gene_name ] elementMetadata(genes_pos_GR) <- NULL genes_neg_GR <- neg_tss[elementMetadata(neg_tss)$gene_name %in% genes_tab$gene_name ] elementMetadata(genes_neg_GR) <- NULL all_ext_TSS <- unique(c(genes_pos_GR, genes_neg_GR) )%>% as.data.frame() out_file <- str_c( out_path, str_c( '1kb_extended_TSS', each_set, sep='_'), sep='/') write_delim( x=all_ext_TSS, file=out_file, delim = '\t') } ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ######################################################################################################################## # merge healthy, high and low ctDNA samples # downsample to lowest number of reads available from above three sets Rscript ######################################################################################################################## ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ library( tidyverse) library( GenomicRanges) ############################### START OF FUNCTIONS ############################################ down_sample_GR <- function( bam_gr, lowest_reads){ rand_sample <- sample( 1:length(bam_gr),size = lowest_reads, replace = FALSE) ds_bam <- bam_gr[rand_sample] return(sort(ds_bam)) } ############################### END OF FUNCTIONS ############################################ path <- '../../../data' # load sample sheet s_sheet <- read.csv( file='/path/to/Figure_03_sample_sheet_v1.0.csv', stringsAsFactors = F) load( '/path/to/Figure_03_bams.RData') healthy_bams <- GRanges() high_ctDNA_bams <- GRanges() low_ctDNA_bams <- GRanges() read_counter <- 0 for( i in 1:nrow(s_sheet)){ each_sample <- s_sheet$unique_SampleName[i] cat(each_sample, '\n') ctDNA_level <- s_sheet$ctDNA_level[i] bam <- get(each_sample) if(ctDNA_level == 'healthy'){ healthy_bams <- c(healthy_bams, bam) } if(ctDNA_level == 'high'){ high_ctDNA_bams <- c(high_ctDNA_bams, bam) } if(ctDNA_level == 'low'){ low_ctDNA_bams <- c(low_ctDNA_bams, bam) } read_counter <- read_counter + length(bam) } # down sample bams lowest_reads <- min( length(healthy_bams), length(high_ctDNA_bams), length(low_ctDNA_bams)) ds_healthy <- down_sample_GR(bam_gr=healthy_bams, lowest_reads=lowest_reads) ds_high_ctDNA <- down_sample_GR(bam_gr=high_ctDNA_bams, lowest_reads=lowest_reads) ds_low_ctDNA <- down_sample_GR(bam_gr=low_ctDNA_bams, lowest_reads=lowest_reads) save( ds_healthy,ds_high_ctDNA, ds_low_ctDNA, file='/path/to/down_sampled_healthy_high_low_ctDNA.RData' ) ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ######################################################################################################################## # randomly downsample TSS sets to lowest counts available # 10 random sets Rscript ######################################################################################################################## ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ library(tidyverse) path = '/path/to/data' files_list <- list.files(path=path) %>% .[str_detect(., 'bed$')] # which file has lowest number of line min_lines = 10000000 min_lines_file = '' for ( each_file in files_list) { cat(each_file, '\n') in_file = str_c(path, each_file, sep='/') tab <- read_delim(file=in_file, col_names = T, delim = '\t', col_types = c('ciiic')) nrow_file = nrow(tab) if ( nrow_file < min_lines){ min_lines = nrow_file min_lines_file = each_file } } for( each_file in files_list ){ cat(each_file, '\n') in_file = str_c(path, each_file, sep='/') tab <- read_delim(file=in_file, col_names = T, delim = '\t', col_types = c('ciiic')) out_dir = str_remove(each_file , '.bed$') out_path = str_c(path, 'downsample', out_dir, sep='/') if ( ! dir.exists(out_path) ){ dir.create(out_path, recursive = T) } iters = ifelse( each_file == min_lines_file, 1, 10 ) for( i in 1:iters ){ rand_rows = sample( 1:nrow(tab), size = min_lines, replace = FALSE) rand_tab = tab[rand_rows,] %>% arrange(seqnames, start, end) %>% select(-width) out_file = str_c( 'tss_', i, '.csv') out_file_p <- str_c(out_path, out_file, sep='/') write_csv(x=rand_tab, file=out_file_p) } } ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ######################################################################################################################## # get coverage on TSS regions Rscript ######################################################################################################################## ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ library( tidyverse) library( GenomicRanges) ############################### START OF FUNCTIONS ########################################## get_coverage <- function( ol_reads, resized_width, tss_gr){ read_coverage <- coverage(ol_reads) cov_container <- matrix(ncol = resized_width) for( each_chr in unique(names(read_coverage)) ){ #cat( each_chr, '\n') chr_resized_peaks <- tss_gr[ seqnames(tss_gr) == each_chr] if( length(chr_resized_peaks) > 0 ){ chr_views <- as.matrix(Views(read_coverage[[each_chr]],as(ranges(chr_resized_peaks[ seqnames(chr_resized_peaks) == each_chr]),'IRanges') ) ) cov_container <- rbind(cov_container, chr_views) } } cov_container <- na.omit(cov_container) return(cov_container) } ############################### END OF FUNCTIONS ############################################ ext_len <- 1000 path <- '/path/to/data/coverage_merged_down_sampled/TSS' files_list <- list.files(path=path) %>% .[str_detect(., 'bed$')] load( '/path/to/processed/Figure_03/down_sampled_healthy_high_low_ctDNA.RData') p_data_container <- tibble() for( each_file in files_list ){ cat(each_file, '\n') dir = str_remove(each_file , '.bed$') in_path = str_c(path, 'downsample', dir, sep='/') tss_files <- list.files(path=in_path) %>% .[str_detect(., 'csv$')] for(each_ds_tss in tss_files){ cat(each_ds_tss, '\n') tss_in_file <- str_c( in_path, each_ds_tss, sep='/') tab <- read_csv( file=tss_in_file, col_types = c('ciic')) tss <- with(tab, GRanges( seqnames = seqnames, ranges = IRanges(start=start, end=end), strand = strand)) pos_tss <- tss[strand(tss) == '+'] neg_tss <- tss[strand(tss) == '-'] for(each_type in c('ds_healthy', 'ds_high_ctDNA', 'ds_low_ctDNA')){ cat(each_type, '\n') resized_width <- width(pos_tss)[1] bam <- get(each_type) cat( 'working on positive TSS ...', '\n') # get positiveoverlapping reads pos_ol_reads <- subsetByOverlaps( bam, pos_tss) pos_cov <- get_coverage( ol_reads = pos_ol_reads, resized_width = resized_width, tss_gr = pos_tss) cat( 'working on negative TSS ...', '\n') # get negative ovelapping reads neg_ol_reads <- subsetByOverlaps( bam, neg_tss) neg_cov <- get_coverage( ol_reads = neg_ol_reads, resized_width = resized_width, tss_gr = neg_tss) neg_cov <- neg_cov[,ncol(neg_cov):1] # need to reverse for negative strand coverage pos_plus_neg_cov <- rbind(pos_cov, neg_cov) tiny_plot_data <- tibble( position=c(-ext_len:ext_len), coverage=colSums(pos_plus_neg_cov), ct_type=each_type, tss_type=each_file, rand_sample=str_remove(each_ds_tss, '.csv') ) p_data_container <- bind_rows(p_data_container, tiny_plot_data) } } } out_path <- str_c(path, 'downsample', 'coverage', sep='/') if ( ! dir.exists(out_path) ){ dir.create(out_path, recursive = T) } out_file <- str_c(out_path, 'downsampled_coverege_on_randomly_downsampled_TSS_v1.0.csv', sep='/') write.csv(x=p_data_container, file=out_file) ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ######################################################################################################################## # get mean coverage from ten random TSS coverage sets Rscript ######################################################################################################################## ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ library(tidyverse) tab <- read_csv( file='/path/to/downsampled_coverege_on_randomly_downsampled_TSS_v1.0.csv') %>% dplyr::select(-X1) tab_ori <- tab tab <- tab %>% group_by(tss_type, ct_type, position) %>% summarise( mean_coverage = mean(coverage)) %>% mutate( tss_type = str_remove(tss_type, '.bed')) %>% mutate( tss_type = str_remove(tss_type, '_hg19' )) write_csv(x=tab, file='/path/to/downsampled_mean_coverege_on_randomly_downsampled_TSS_v1.0.csv') ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~