Supplemental_Code_1 – R code used to generate the haplotypes for oligos input into the STARR-seq library


## required input information
# peak.file   --   a HOMER-formatted peak file containing 7 columns. 
# gen         --   genome to pull sequences from (e.g., hg38), and that the vcf is annotated in. note needs to match the name of the genome as HOMER calls it
# out.dir     --   path where output files will be written
# input.type  --   either 'peak' for 'bed' (traditional bed-formatted file), peak are 1-based in the genome; bed are 0-based. This parameter tells the script how to use the start and end coordinates
# vcf.file    --   full filename and path to the vcf file used to annotate. 
# gen.path    --   path to HOMER genome files

### note regarding vcf.file: as in the example input vcf, this is written to take in two 'indivdiuals', each representing a haplotype. Haplotypes to be compared need to be pre-formatted into VCFs.
# required R packages: library("seqinr")

#### OUTPUT FILE OF INTEREST

# "annot.wvar.and.seq.txt" has each peak with the genomic sequences for the two haplotypes in two columns to the right. Note that the number of SNPs refers to the total SNPs from the VCF, not necessarily only those that vary between the two haplotypes.

## after the function motMutAnalysis() is defined (see below), this command will run it:
motMutAnalysis(
  peak.file = <path and filename of HOMER peak file or bed file>,
  gen = <genome name as HOMER calls it>,
  out.dir = <path to output directory>,
  input.type = <either 'peak' or 'bed' depending on input file format>,
  vcf.file = <path and filename of .vcf file>,
  gen.path = <path to HOMER genome files>
)

#### this script defines the function. 
motMutAnalysis <- function(peak.file, gen, out.dir, input.type, vcf.file, gen.path) {
    require("seqinr")
        setwd(out.dir)
        factor = ""
        dirs.tagcounts = ""
        wout.variants.output = paste(out.dir, factor, "peaksWITHOUTvariants.txt", sep = "")
        wvar.output.and.seq = paste(out.dir, factor, "peaksWITHvariants.incl.seq.txt", sep = "")
        
        bed.file = "/path/to/0-based/bed/file.txt"
        
        # will append <strain>.fasta and <strain>.fasta.motif.locations to the out.dir below
        
        output.summary = paste(out.dir, factor, "peaksWITHvariantsSSmotifs.summary.txt", sep = "")
        annot.counts.file = paste(out.dir, factor, "TFcounts.txt", sep="")
        
        # if beginning with peak file (1-based), this converts to a bed file and will proceed with that
        if ( input.type == "peak") {
            
            peaks = read.delim(peak.file, as.is = T, stringsAsFactors = F, header = T)
            as.bedfile = data.frame(peaks[,2], peaks[,3] -1, peaks[,4] -1 , peaks[, c(1, 7, 5)])
            bet = unlist(strsplit(peak.file,split="/"))
            bet = bet[length(bet)]
            bed_peak.file = sub(".txt", ".bed.txt", bet)
            write.table(as.bedfile, paste(out.dir, bed_peak.file, sep = ""), col.names = F, quote = F, sep = "\t", row.names = F)
        }
        
        # if beginning with bed file (0-based), will use the path to file at "bed.file" as input
        if ( input.type == "bed") {
            
            bed_peak.file = peak.file
            
            # if only has 3 columns, add another to work with script
            
        }
        
        # Create command for Homer that annotates regions using -vcf
        # NOTE: if you say " -size given " it annotates vcf column as absolute position within peak instead of relative to peak centers!!!!!
        
        string=paste("annotatePeaks.pl ", paste( bed_peak.file, sep = ""), " ", gen, " -vcf ", vcf.file, " -size given -nogene -noann > ", paste( sub(".txt", "", bed_peak.file),sep=""), ".vcf", sep="")
        
        system(string)
        
        annot= read.delim(paste(sub(".txt", "", paste( bed_peak.file, sep = "")), ".vcf", sep=""), header=T, sep="\t", stringsAsFactors=F)

        
        # Split peaks into ones with variants (annot.wvar) and those without (annot.woutvar)
        annot.wvar = annot[annot[,8]!="",]
        annot.woutvar = annot[annot[,8]=="",]
        
        # Writes out set of peaks with no variants listed in vcf.file
        write.table(annot.woutvar, wout.variants.output, quote=F, row.names=F, sep = "\t")
        
        rm(annot)
        
        annot.wvar[["first.seq"]]="NA"
        annot.wvar[["second.seq"]]="NA"
        
        # split up vcf annotations to get positions and alleles
        
        for (i in 1:nrow(annot.wvar)) {

            #get # variants in region
            aaa=strsplit(annot.wvar[i,8], split="()")[[1]]
            no.variants=length(which(unlist(aaa)=="("))
            
            #go thru variants to make a table
            for (j in 1:no.variants) {
                
                #get positions of variants 
                first=which(unlist(aaa)=="(")[j]
                first.close=which(unlist(aaa)==")")[j]
                if (j > 1) div=which(unlist(aaa)==",")
                if (j == 1) bbb = as.numeric(paste(aaa[seq(1,first-1)], collapse=""))
                if (j > 1) bbb = as.numeric(paste(aaa[seq(div[j-1]+1,first-1)], collapse=""))
                pos.in.seq=bbb-1  
                
                #initiate temp table
                if ( j ==1 ) {
                    temp=as.data.frame(matrix(ncol=7, nrow=no.variants+1))
                    names(temp)=c("chr", "start", "end", "pos.in.region", "length.ref.allele", "first.allele", "second.allele")
                }
                
                #fill in temp table    
                temp[,1]=annot.wvar[i,2]
                temp[1,2]=annot.wvar[i,3] # This begins the start coordinates in table: 
                temp[j,4]=pos.in.seq 
                temp[no.variants+1, 6:7]="" # this sets the ref and alt alleles of last row of table to ""
                temp[j,3]=temp[1,2]+pos.in.seq   # this is the starting coordiantes of region + the position of the variant -1 so that it will document the last coordinate before the varinat of the row
                #get allele in reg/alt
                pos=seq(first+1, first.close-1)
                #get length of variant in reference
                len.var=length(pos)
                #extract expected genotype from annot.wvar[i,8]
                exp.geno=paste(aaa[seq(first+1,first.close-1,1)], collapse="")
                temp[j,5]=len.var
                pull1=seq(which(aaa=="=")[j]+1, which(aaa=="=")[j]+len.var, 1) 
                pull2=seq(which(aaa==";")[j]+1, which(aaa=="/")[j*2]-1, 1) 
                #two = ifelse(pos.in.seq-1>0, paste(aaa[pull], collapse=""), "") ### this seems to break it if variant in in "0" position
                one = paste(aaa[pull1], collapse="")
                two = paste(aaa[pull2], collapse="")
                temp[j,6]=one
                temp[j,7]=two
                #fill in start of sequences to get
                temp[j+1,2]=temp[1,2]+as.numeric(temp[j,4]) + as.numeric(temp[j,5]) # this is the starting of the following row: it is the ending of the previus row plus the length of the ref variant   
                
                #this completes last row of "temp"
                if (j == no.variants) { 
                    temp[no.variants+1,3]= annot.wvar[i,4]   # this sets last row's end coord.
                    temp[no.variants+1,4:5]=""
                    temp[1,2]=temp[1,2]   
                }
                
            } # completes iterations thru j - temp table loop
            
            # make bed file of regions to extract & write bed file
            # Note that this bed file is indexed the same as the temp file was created. Hence, it was created to be used as a bed file (0-based)
            bed = data.frame(temp[,1:3], paste(annot.wvar[i,1], seq(1:(no.variants+1)), sep="-"), "+")
            write.table(bed, paste(out.dir, "bed.txt", sep=""), sep="\t", row.names=F,quote=F)
            
            # extract regions using homer and save to .fa file
            string = paste("homerTools extract ", out.dir, "bed.txt", " ", gen.path, "/",
                           gen, "/ -fa > ", out.dir, "bed.txt.fa " , sep="")
            system(string)
            
            #read in sequence
            library("seqinr")
            sequence=read.fasta(paste(out.dir, "bed.txt.fa", sep=""), seqonly=T, as.string=T)
            sequence=unlist(sequence)
            
            #add sequence to temp
            temp=data.frame(temp, sequence, stringsAsFactors=F)
            
            #add complete ref and alt sequences for region to annot table
            first.seq=paste(temp[,8],temp[,6], sep="", collapse="")
            second.seq=paste(temp[,8],temp[,7], sep="", collapse="")
            
            annot.wvar[["first.seq"]][i]=first.seq
            annot.wvar[["second.seq"]][i]=second.seq
            annot.wvar[["num.snps"]][i]=no.variants
            
            print( paste(i, " of ", nrow(annot.wvar)))
            gc()
            
        }
        
        write.table(annot.wvar, paste(out.dir, "annot.wvar.and.seq.txt",sep=""), row.names=F, quote=F, sep="\t")
        
        ####################

        fasta.names = c()
        for (i in 1:nrow(annot.wvar)){
            interm = paste(annot.wvar[i,1:8], sep="", collapse=",")
            fasta.names[i] = paste( ">" , interm, sep="")
            print(paste(i, "of ", nrow(annot.wvar)), sep="")
            gc()
        }
        
        # Get names for tab-delimited sequence file
        for.names = annot.wvar[,1:8]
        names.tog = apply(for.names, 1, paste, collapse = ",")
        
        # Paste strain-of-origin together with peak and variant info
        first.names = paste(names.tog, ",FIRST", sep = "")
        second.names = paste(names.tog, ",SECOND", sep = "")
        
        # Paste names with sequence into tab-delimited file for homer
        first.fasta = data.frame(first.names, annot.wvar$first.seq)
        second.fasta = data.frame(second.names, annot.wvar$second.seq)
        
        # write fasta files
        write.table(first.fasta, paste(out.dir, "first.tabDelimSeq.txt", sep=""), row.names=F, col.names=F, quote=F, sep="\t")
        write.table(second.fasta, paste(out.dir, "second.tabDelimSeq.txt", sep=""), row.names=F, col.names=F, quote=F, sep="\t")
        return(print("done"))
}