Supplemental_Code_1 – R code used to generate the haplotypes for oligos input into the STARR-seq library ## required input information # peak.file -- a HOMER-formatted peak file containing 7 columns. # gen -- genome to pull sequences from (e.g., hg38), and that the vcf is annotated in. note needs to match the name of the genome as HOMER calls it # out.dir -- path where output files will be written # input.type -- either 'peak' for 'bed' (traditional bed-formatted file), peak are 1-based in the genome; bed are 0-based. This parameter tells the script how to use the start and end coordinates # vcf.file -- full filename and path to the vcf file used to annotate. # gen.path -- path to HOMER genome files ### note regarding vcf.file: as in the example input vcf, this is written to take in two 'indivdiuals', each representing a haplotype. Haplotypes to be compared need to be pre-formatted into VCFs. # required R packages: library("seqinr") #### OUTPUT FILE OF INTEREST # "annot.wvar.and.seq.txt" has each peak with the genomic sequences for the two haplotypes in two columns to the right. Note that the number of SNPs refers to the total SNPs from the VCF, not necessarily only those that vary between the two haplotypes. ## after the function motMutAnalysis() is defined (see below), this command will run it: motMutAnalysis( peak.file = , gen = , out.dir = , input.type = , vcf.file = , gen.path = ) #### this script defines the function. motMutAnalysis <- function(peak.file, gen, out.dir, input.type, vcf.file, gen.path) { require("seqinr") setwd(out.dir) factor = "" dirs.tagcounts = "" wout.variants.output = paste(out.dir, factor, "peaksWITHOUTvariants.txt", sep = "") wvar.output.and.seq = paste(out.dir, factor, "peaksWITHvariants.incl.seq.txt", sep = "") bed.file = "/path/to/0-based/bed/file.txt" # will append .fasta and .fasta.motif.locations to the out.dir below output.summary = paste(out.dir, factor, "peaksWITHvariantsSSmotifs.summary.txt", sep = "") annot.counts.file = paste(out.dir, factor, "TFcounts.txt", sep="") # if beginning with peak file (1-based), this converts to a bed file and will proceed with that if ( input.type == "peak") { peaks = read.delim(peak.file, as.is = T, stringsAsFactors = F, header = T) as.bedfile = data.frame(peaks[,2], peaks[,3] -1, peaks[,4] -1 , peaks[, c(1, 7, 5)]) bet = unlist(strsplit(peak.file,split="/")) bet = bet[length(bet)] bed_peak.file = sub(".txt", ".bed.txt", bet) write.table(as.bedfile, paste(out.dir, bed_peak.file, sep = ""), col.names = F, quote = F, sep = "\t", row.names = F) } # if beginning with bed file (0-based), will use the path to file at "bed.file" as input if ( input.type == "bed") { bed_peak.file = peak.file # if only has 3 columns, add another to work with script } # Create command for Homer that annotates regions using -vcf # NOTE: if you say " -size given " it annotates vcf column as absolute position within peak instead of relative to peak centers!!!!! string=paste("annotatePeaks.pl ", paste( bed_peak.file, sep = ""), " ", gen, " -vcf ", vcf.file, " -size given -nogene -noann > ", paste( sub(".txt", "", bed_peak.file),sep=""), ".vcf", sep="") system(string) annot= read.delim(paste(sub(".txt", "", paste( bed_peak.file, sep = "")), ".vcf", sep=""), header=T, sep="\t", stringsAsFactors=F) # Split peaks into ones with variants (annot.wvar) and those without (annot.woutvar) annot.wvar = annot[annot[,8]!="",] annot.woutvar = annot[annot[,8]=="",] # Writes out set of peaks with no variants listed in vcf.file write.table(annot.woutvar, wout.variants.output, quote=F, row.names=F, sep = "\t") rm(annot) annot.wvar[["first.seq"]]="NA" annot.wvar[["second.seq"]]="NA" # split up vcf annotations to get positions and alleles for (i in 1:nrow(annot.wvar)) { #get # variants in region aaa=strsplit(annot.wvar[i,8], split="()")[[1]] no.variants=length(which(unlist(aaa)=="(")) #go thru variants to make a table for (j in 1:no.variants) { #get positions of variants first=which(unlist(aaa)=="(")[j] first.close=which(unlist(aaa)==")")[j] if (j > 1) div=which(unlist(aaa)==",") if (j == 1) bbb = as.numeric(paste(aaa[seq(1,first-1)], collapse="")) if (j > 1) bbb = as.numeric(paste(aaa[seq(div[j-1]+1,first-1)], collapse="")) pos.in.seq=bbb-1 #initiate temp table if ( j ==1 ) { temp=as.data.frame(matrix(ncol=7, nrow=no.variants+1)) names(temp)=c("chr", "start", "end", "pos.in.region", "length.ref.allele", "first.allele", "second.allele") } #fill in temp table temp[,1]=annot.wvar[i,2] temp[1,2]=annot.wvar[i,3] # This begins the start coordinates in table: temp[j,4]=pos.in.seq temp[no.variants+1, 6:7]="" # this sets the ref and alt alleles of last row of table to "" temp[j,3]=temp[1,2]+pos.in.seq # this is the starting coordiantes of region + the position of the variant -1 so that it will document the last coordinate before the varinat of the row #get allele in reg/alt pos=seq(first+1, first.close-1) #get length of variant in reference len.var=length(pos) #extract expected genotype from annot.wvar[i,8] exp.geno=paste(aaa[seq(first+1,first.close-1,1)], collapse="") temp[j,5]=len.var pull1=seq(which(aaa=="=")[j]+1, which(aaa=="=")[j]+len.var, 1) pull2=seq(which(aaa==";")[j]+1, which(aaa=="/")[j*2]-1, 1) #two = ifelse(pos.in.seq-1>0, paste(aaa[pull], collapse=""), "") ### this seems to break it if variant in in "0" position one = paste(aaa[pull1], collapse="") two = paste(aaa[pull2], collapse="") temp[j,6]=one temp[j,7]=two #fill in start of sequences to get temp[j+1,2]=temp[1,2]+as.numeric(temp[j,4]) + as.numeric(temp[j,5]) # this is the starting of the following row: it is the ending of the previus row plus the length of the ref variant #this completes last row of "temp" if (j == no.variants) { temp[no.variants+1,3]= annot.wvar[i,4] # this sets last row's end coord. temp[no.variants+1,4:5]="" temp[1,2]=temp[1,2] } } # completes iterations thru j - temp table loop # make bed file of regions to extract & write bed file # Note that this bed file is indexed the same as the temp file was created. Hence, it was created to be used as a bed file (0-based) bed = data.frame(temp[,1:3], paste(annot.wvar[i,1], seq(1:(no.variants+1)), sep="-"), "+") write.table(bed, paste(out.dir, "bed.txt", sep=""), sep="\t", row.names=F,quote=F) # extract regions using homer and save to .fa file string = paste("homerTools extract ", out.dir, "bed.txt", " ", gen.path, "/", gen, "/ -fa > ", out.dir, "bed.txt.fa " , sep="") system(string) #read in sequence library("seqinr") sequence=read.fasta(paste(out.dir, "bed.txt.fa", sep=""), seqonly=T, as.string=T) sequence=unlist(sequence) #add sequence to temp temp=data.frame(temp, sequence, stringsAsFactors=F) #add complete ref and alt sequences for region to annot table first.seq=paste(temp[,8],temp[,6], sep="", collapse="") second.seq=paste(temp[,8],temp[,7], sep="", collapse="") annot.wvar[["first.seq"]][i]=first.seq annot.wvar[["second.seq"]][i]=second.seq annot.wvar[["num.snps"]][i]=no.variants print( paste(i, " of ", nrow(annot.wvar))) gc() } write.table(annot.wvar, paste(out.dir, "annot.wvar.and.seq.txt",sep=""), row.names=F, quote=F, sep="\t") #################### fasta.names = c() for (i in 1:nrow(annot.wvar)){ interm = paste(annot.wvar[i,1:8], sep="", collapse=",") fasta.names[i] = paste( ">" , interm, sep="") print(paste(i, "of ", nrow(annot.wvar)), sep="") gc() } # Get names for tab-delimited sequence file for.names = annot.wvar[,1:8] names.tog = apply(for.names, 1, paste, collapse = ",") # Paste strain-of-origin together with peak and variant info first.names = paste(names.tog, ",FIRST", sep = "") second.names = paste(names.tog, ",SECOND", sep = "") # Paste names with sequence into tab-delimited file for homer first.fasta = data.frame(first.names, annot.wvar$first.seq) second.fasta = data.frame(second.names, annot.wvar$second.seq) # write fasta files write.table(first.fasta, paste(out.dir, "first.tabDelimSeq.txt", sep=""), row.names=F, col.names=F, quote=F, sep="\t") write.table(second.fasta, paste(out.dir, "second.tabDelimSeq.txt", sep=""), row.names=F, col.names=F, quote=F, sep="\t") return(print("done")) }