Using SelexGLM to Fit a Feature-Based Protein-DNA Recognition model to SELEX-seq Data

Gabriella D. Martini and Harmen J. Bussemaker

2017-11-21

# The Java heap size has to be requested before the JVM is started, which
# happens when SELEX (and with it rJava) is loaded; setting the option only
# afterwards has no effect on the running JVM.
options(java.parameters = "-Xmx4000M")
library(SELEX)
## Loading required package: rJava
## Warning: package 'rJava' was built under R version 3.2.5
## Loading required package: Biostrings
## Warning: package 'Biostrings' was built under R version 3.2.2
## Loading required package: BiocGenerics
## Loading required package: parallel
## 
## Attaching package: 'BiocGenerics'
## The following objects are masked from 'package:parallel':
## 
##     clusterApply, clusterApplyLB, clusterCall, clusterEvalQ,
##     clusterExport, clusterMap, parApply, parCapply, parLapply,
##     parLapplyLB, parRapply, parSapply, parSapplyLB
## The following objects are masked from 'package:rJava':
## 
##     anyDuplicated, duplicated, sort, unique
## The following object is masked from 'package:stats':
## 
##     xtabs
## The following objects are masked from 'package:base':
## 
##     Filter, Find, Map, Position, Reduce, anyDuplicated, append,
##     as.data.frame, as.vector, cbind, colnames, do.call,
##     duplicated, eval, evalq, get, intersect, is.unsorted, lapply,
##     mapply, match, mget, order, paste, pmax, pmax.int, pmin,
##     pmin.int, rank, rbind, rep.int, rownames, sapply, setdiff,
##     sort, table, tapply, union, unique, unlist, unsplit
## Loading required package: S4Vectors
## Warning: package 'S4Vectors' was built under R version 3.2.2
## Loading required package: stats4
## Creating a generic function for 'nchar' from package 'base' in package 'S4Vectors'
## Loading required package: IRanges
## Warning: package 'IRanges' was built under R version 3.2.2
## Warning in .recacheSubclasses(def@className, def, doSubclasses, env):
## undefined subclass "externalRefMethod" of class "expressionORfunction";
## definition not updated
## Warning in .recacheSubclasses(def@className, def, doSubclasses, env):
## undefined subclass "externalRefMethod" of class "functionORNULL";
## definition not updated
## Loading required package: XVector
library(stringi)
## Warning: package 'stringi' was built under R version 3.2.5
library(Biostrings)
library(SelexGLM)
## Loading required package: RColorBrewer
library(devtools)
library(reshape2)
## Warning: package 'reshape2' was built under R version 3.2.5
library(ggplot2)
## Warning: package 'ggplot2' was built under R version 3.2.5
library(Rmisc)
## Loading required package: lattice
## Warning: package 'lattice' was built under R version 3.2.5
## Loading required package: plyr
## Warning: package 'plyr' was built under R version 3.2.5
## 
## Attaching package: 'plyr'
## The following object is masked from 'package:XVector':
## 
##     compact
## The following object is masked from 'package:IRanges':
## 
##     desc
## The following object is masked from 'package:S4Vectors':
## 
##     rename

We start with some initialization related to the SELEX package:

# Ask for a 4000 MB Java heap.
# NOTE(review): this option is presumably only honoured when set before the
# JVM starts -- confirm whether it has to precede library(SELEX).
options(java.parameters = "-Xmx4000M")

# Keep SELEX's intermediate files in the session's temporary directory and
# allow it to use up to four threads.
workDir <- tempdir()
selex.config(
  workingDir = workDir,
  maxThreadNumber = 4
)

Next, we will define the SELEX samples that we want to analyze. We will use the example data from the SELEX package:

# Register the example samples described in the config.xml that ships with
# the SELEX package, then list them.
selex.loadAnnotation(
  system.file("extdata", "config.xml", package = "SELEX")
)
selex.sampleSummary()
##        seqName   sampleName rounds leftBarcode rightBarcode
## 3 R0.libraries R0.barcodeCG      0         TGG      CCACGTC
## 2 R0.libraries R0.barcodeGC      0         TGG      CCAGCTG
## 1 R2.libraries    ExdHox.R2      2         TGG      CCAGCTG
##                       leftFlank                   rightFlank
## 3 GTTCAGAGTTCTACAGTCCGACGATCTGG CCACGTCTCGTATGCCGTCTTCTGCTTG
## 2 GTTCAGAGTTCTACAGTCCGACGATCTGG CCAGCTGTCGTATGCCGTCTTCTGCTTG
## 1 GTTCAGAGTTCTACAGTCCGACGATCTGG CCAGCTGTCGTATGCCGTCTTCTGCTTG
##                                                                                    seqFile
## 3 /Library/Frameworks/R.framework/Versions/3.2/Resources/library/SELEX/extdata/R0.fastq.gz
## 2 /Library/Frameworks/R.framework/Versions/3.2/Resources/library/SELEX/extdata/R0.fastq.gz
## 1 /Library/Frameworks/R.framework/Versions/3.2/Resources/library/SELEX/extdata/R2.fastq.gz
# Round 0 libraries: one barcode for training, the other for cross-validation.
r0.train <- selex.sample(
  seqName = "R0.libraries",
  sampleName = "R0.barcodeGC",
  round = 0
)
r0.test <- selex.sample(
  seqName = "R0.libraries",
  sampleName = "R0.barcodeCG",
  round = 0
)
# Round 2 selected library: the data the model is fitted to.
dataSample <- selex.sample(
  seqName = "R2.libraries",
  sampleName = "ExdHox.R2",
  round = 2
)

A Markov model is built, information gain is used to identify the k-mer length of the binding site, k-mer tables are built, and probes are counted in a way that corrects for the zero-deflated nature of the data.

# MARKOV MODEL BUILT
# Determine Kmax from the Round 0 cross-validation sample.
kmax <- selex.kmax(sample = r0.test)
## Counting [R0.libraries.R0.barcodeCG.0][ K = 1 ]
## Counting [R0.libraries.R0.barcodeCG.0][ K = 2 ]
## Counting [R0.libraries.R0.barcodeCG.0][ K = 3 ]
## Counting [R0.libraries.R0.barcodeCG.0][ K = 4 ]
## Counting [R0.libraries.R0.barcodeCG.0][ K = 5 ]
## Counting [R0.libraries.R0.barcodeCG.0][ K = 6 ]
## [ sample id : R0.libraries.R0.barcodeCG.0,  filter:  variableRegionIncludeRegex:null,variableRegionExcludeRegex:null,variableRegionGroupRegex:null ]
## [ R0.libraries.R0.barcodeCG.0.kmax = 5 ]
# Train the Markov model on the Round 0 training library, cross-validating
# against the Round 0 test library; order = NA leaves the order unspecified.
mm <- selex.mm(
  sample = r0.train,
  order = NA,
  crossValidationSample = r0.test,
  Kmax = kmax,
  mmMethod = "TRANSITION"
)
## Overwriting Kmax = 5
## Counting [R0.libraries.R0.barcodeGC.0][ K = 1 ]
## Counting [R0.libraries.R0.barcodeGC.0][ K = 2 ]
## Counting [R0.libraries.R0.barcodeGC.0][ K = 3 ]
## Counting [R0.libraries.R0.barcodeGC.0][ K = 4 ]
## Counting [R0.libraries.R0.barcodeGC.0][ K = 5 ]
## [ markovLength = 3 ]
## [ maxR = 0.989094 ]
## [ Model = MarkovModelInfo [markovLength=3, markovLengthTotalCount=483784, markovR2=0.9890939798281818, markovCountsPath=/var/folders/6r/52dcl0sj1yg0z69w89t6fjyr0000gp/T/Rtmp5eCqOw//R0.libraries.R0.barcodeGC.0.3.dat_A7FE7F4E2E78A43F892C7F3227FFA520, markovObjPath=/var/folders/6r/52dcl0sj1yg0z69w89t6fjyr0000gp/T/Rtmp5eCqOw//R0.libraries.R0.barcodeGC.0.3.dat_A7FE7F4E2E78A43F892C7F3227FFA520.prob.obj, sample=config.ExperimentReference@133c3b45, markovModelMethod=TRANSITION, crossValidationSample=config.ExperimentReference@f5bfdbd, filter=variableRegionIncludeRegex:null,variableRegionExcludeRegex:null,variableRegionGroupRegex:null,kmerIncludeRegex:null,kmerExcludeRegex:null,kmerIncludeOnly:null] ]
# Summarise the Markov model fits and keep the order with the highest R.
mmscores <- selex.mmSummary(sample = r0.train)
# which.max() always yields exactly one index (the first maximum), so
# mm.order stays a scalar even when two orders tie; it is used later as the
# start of a `:` sequence, which needs a single value.
ido <- which.max(mmscores$R)
mm.order <- mmscores$Order[ido]

More preliminaries:

# INFOGAIN USED TO CALCULATE KLEN
# Length of the variable region, converted to a plain number via character.
libLen <- as.numeric(
  as.character(selex.getAttributes(dataSample)$VariableRegionLength)
)
# Compute information gain for every k from (Markov order + 1) up to the
# full variable-region length.
selex.infogain(
  sample = dataSample,
  k = seq(mm.order + 1, libLen),
  markovModel = mm
)
## Counting [InfoGain][ K = 3 ]
## Counting [InfoGain][ K = 4 ]
## Counting [InfoGain][ K = 5 ]
## Counting [InfoGain][ K = 6 ]
## Counting [InfoGain][ K = 7 ]
## Counting [InfoGain][ K = 8 ]
## Counting [InfoGain][ K = 9 ]
## Counting [InfoGain][ K = 10 ]
## Counting [InfoGain][ K = 11 ]
## Counting [InfoGain][ K = 12 ]
## Counting [InfoGain][ K = 13 ]
## Counting [InfoGain][ K = 14 ]
## Counting [InfoGain][ K = 15 ]
## Counting [InfoGain][ K = 16 ]
## [1] 2.420417
infoscores <- selex.infogainSummary(sample = dataSample)

# Information gain barplot: the k with the highest information gain is drawn
# in red, all others in blue. which.max() returns a single index (the first
# maximum), so kLen below is always a scalar even if two lengths tie.
idx <- which.max(infoscores$InformationGain)
colstring <- rep("BLUE", nrow(infoscores))
colstring[idx] <- "RED"
barplot(height = infoscores$InformationGain, names.arg = infoscores$K,
        col = colstring,
        xlab = "Oligonucleotide Length (bp)", ylab = "Information Gain (bits)")

kLen <- infoscores$K[idx]
# The k-mer length selected by the information gain analysis (kLen = 9 for
# this example data set) is used for the affinity table and, further down, as
# the seed length of the model.
data.kmerTable <- selex.affinities(sample = dataSample, k = kLen,
                                   markovModel = mm)
## Counting [R2.libraries.ExdHox.R2.2][ K = 9 ]
## [ Lowest Count =  1 ]
# Put the highest-affinity k-mers first and discard the old row labels so the
# rows are numbered 1..n in the new order.
data.kmerTable <- data.kmerTable[order(-data.kmerTable$Affinity), ]
rownames(data.kmerTable) <- NULL

# Count full-length probes in the Round 2 sample, using the Markov model.
data.probeCounts <- getProbeCounts(
  dataSample,
  markovModel = mm
)
## Counting [R2.libraries.split.1.ExdHox.R2.split.1.2][ K = 16 ]
## [ Lowest Count =  1 ]
## Counting [R2.libraries.split.2.ExdHox.R2.split.2.2][ K = 16 ]
## [ Lowest Count =  1 ]
# Column-wise summary of the probe count table.
summary(data.probeCounts)
##     Probe           ObservedCount     Probability            Round  
##  Length:24504       Min.   :0.0000   Min.   :3.036e-11   Min.   :2  
##  Class :character   1st Qu.:0.0000   1st Qu.:2.537e-10   1st Qu.:2  
##  Mode  :character   Median :0.0000   Median :3.882e-10   Median :2  
##                     Mean   :0.0375   Mean   :4.493e-10   Mean   :2  
##                     3rd Qu.:0.0000   3rd Qu.:5.725e-10   3rd Qu.:2  
##                     Max.   :3.0000   Max.   :3.307e-09   Max.   :2
# Show the first ten rows of the probe count table.
print(data.probeCounts[seq_len(10), ])
##               Probe ObservedCount  Probability Round
## 1  GTTGATTGATGGGTTT             3 1.198516e-09     2
## 2  GATGATTGATTGTTAT             3 1.081091e-09     2
## 3  GATGATTGATCGATGT             3 5.086886e-10     2
## 4  GAGAATGATTGATTAC             3 3.691869e-10     2
## 5  GAATGATTGATTACAT             3 5.432474e-10     2
## 6  ATGTTTGATTGATTAT             3 1.425483e-09     2
## 7  ATGATTGATGAGTCTA             3 4.712116e-10     2
## 8  AATGATTGATTATTGT             3 1.051582e-09     2
## 9  AAATGATTGATTAGCT             3 4.940620e-10     2
## 10 AAATGATTGATTACTT             3 6.086128e-10     2
# Inputs about the library are data specific: the fixed sequences are the
# left/right flanks listed for ExdHox.R2 in the sample summary, and the seed
# length is the k-mer length chosen by the information gain analysis.
model <- new(
  "model",
  varRegLen = libLen,
  leftFixedSeq = "GTTCAGAGTTCTACAGTCCGACGATCTGG",
  rightFixedSeq = "CCAGCTGTCGTATGCCGTCTTCTGCTTG",
  seedLen = kLen,
  leftFixedSeqOverlap = 4,
  initialAffinityCutoff = 0,
  missingValueSuppression = 1,
  minSeedValue = 0.001,
  upFootprintExtend = 2,
  includeWindowFactor = FALSE,
  confidenceLevel = 0.95,
  verbose = FALSE,
  useFixedValuesOffset.N = FALSE,
  rounds = list(c(2)),
  rcSymmetric = FALSE,
  minAffinity = 0.01
)

Inspect current state of model object:

# Print the nucleotide (N) feature object of the freshly constructed model.
model@features@N
## An object of class 'N'
## 
## Slot "seedLen":  9 
## 
## Slot "N.upFootprintExtend":  2 
## 
## Slot "N.downFootprintExtend":  2 
## 
## Slot "fS.upFootprintExtend":  2 
## 
## Slot "fS.downFootprintExtend":  2 
## 
## Slot "fpLen":  13 
## 
## Slot "N.set":  1 2 3 4 5 6 7 8 9 10 11 12 13 
## 
## Slot "N.equivMat":
## 13  x  13  null equivalence matrix
## 
## Slot "N.values":
##     1 2 3 4 5 6 7 8 9 10 11 12 13
## N.A 0 0 0 0 0 0 0 0 0  0  0  0  0
## N.C 0 0 0 0 0 0 0 0 0  0  0  0  0
## N.G 0 0 0 0 0 0 0 0 0  0  0  0  0
## N.T 0 0 0 0 0 0 0 0 0  0  0  0  0
## 
## 
## Slot "N.errors":
##     1 2 3 4 5 6 7 8 9 10 11 12 13
## N.A 0 0 0 0 0 0 0 0 0  0  0  0  0
## N.C 0 0 0 0 0 0 0 0 0  0  0  0  0
## N.G 0 0 0 0 0 0 0 0 0  0  0  0  0
## N.T 0 0 0 0 0 0 0 0 0  0  0  0  0
## 
## 
## Slot "N.z":
##     1 2 3 4 5 6 7 8 9 10 11 12 13
## N.A 0 0 0 0 0 0 0 0 0  0  0  0  0
## N.C 0 0 0 0 0 0 0 0 0  0  0  0  0
## N.G 0 0 0 0 0 0 0 0 0  0  0  0  0
## N.T 0 0 0 0 0 0 0 0 0  0  0  0  0
## 
## 
## Slot "N.sig":
##     1 2 3 4 5 6 7 8 9 10 11 12 13
## N.A 0 0 0 0 0 0 0 0 0  0  0  0  0
## N.C 0 0 0 0 0 0 0 0 0  0  0  0  0
## N.G 0 0 0 0 0 0 0 0 0  0  0  0  0
## N.T 0 0 0 0 0 0 0 0 0  0  0  0  0
## 
## 
## Slot "N.oldValues":
## <4 x 13 x 0 array of double>
## 
## Slot "N.oldErrors":
## <4 x 13 x 0 array of double>
## 
## Slot "N.oldZ":
## <4 x 13 x 0 array of double>
## 
## Slot "N.oldSig":
## <4 x 13 x 0 array of double>
# Up to here the nucleotide betas are all zero. Build a seed PSAM from the
# sorted k-mer affinity table and store it in the model.
addSeedPsam(model) <- seedTable2psam(model, data.kmerTable)
# Model nucleotide betas after the seed PSAM has been added
model@features@N
## An object of class 'N'
## 
## Slot "seedLen":  9 
## 
## Slot "N.upFootprintExtend":  2 
## 
## Slot "N.downFootprintExtend":  2 
## 
## Slot "fS.upFootprintExtend":  2 
## 
## Slot "fS.downFootprintExtend":  2 
## 
## Slot "fpLen":  13 
## 
## Slot "N.set":  1 2 3 4 5 6 7 8 9 10 11 12 13 
## 
## Slot "N.equivMat":
## 13  x  13  null equivalence matrix
## 
## Slot "N.values":
##     1 2          3         4         5         6         7         8
## N.A 0 0  0.0000000 -1.100719 -1.748610  0.000000 -3.042044 -1.808151
## N.C 0 0 -1.5065161 -3.042044 -3.042044 -3.042044 -1.841388 -1.016157
## N.G 0 0 -0.5525097 -3.042044  0.000000 -3.042044 -3.042044 -1.012792
## N.T 0 0 -0.4682265  0.000000 -3.042044 -3.042044  0.000000  0.000000
##             9        10        11 12 13
## N.A -1.293313  0.000000 -3.042044  0  0
## N.C -3.042044 -3.042044 -3.042044  0  0
## N.G  0.000000 -3.042044 -3.042044  0  0
## N.T -2.042044 -3.042044  0.000000  0  0
## 
## 
## Slot "N.errors":
##     1 2 3 4 5 6 7 8 9 10 11 12 13
## N.A 0 0 0 0 0 0 0 0 0  0  0  0  0
## N.C 0 0 0 0 0 0 0 0 0  0  0  0  0
## N.G 0 0 0 0 0 0 0 0 0  0  0  0  0
## N.T 0 0 0 0 0 0 0 0 0  0  0  0  0
## 
## 
## Slot "N.z":
##     1 2 3 4 5 6 7 8 9 10 11 12 13
## N.A 0 0 0 0 0 0 0 0 0  0  0  0  0
## N.C 0 0 0 0 0 0 0 0 0  0  0  0  0
## N.G 0 0 0 0 0 0 0 0 0  0  0  0  0
## N.T 0 0 0 0 0 0 0 0 0  0  0  0  0
## 
## 
## Slot "N.sig":
##     1 2 3 4 5 6 7 8 9 10 11 12 13
## N.A 0 0 0 0 0 0 0 0 0  0  0  0  0
## N.C 0 0 0 0 0 0 0 0 0  0  0  0  0
## N.G 0 0 0 0 0 0 0 0 0  0  0  0  0
## N.T 0 0 0 0 0 0 0 0 0  0  0  0  0
## 
## 
## Slot "N.oldValues":
## <4 x 13 x 0 array of double>
## 
## Slot "N.oldErrors":
## <4 x 13 x 0 array of double>
## 
## Slot "N.oldZ":
## <4 x 13 x 0 array of double>
## 
## Slot "N.oldSig":
## <4 x 13 x 0 array of double>
# Use the full probe count table for the complete analysis.
data <- data.probeCounts

# Match the probes against the current model (presumably aligning each probe
# to its best-scoring position -- the design matrix below is built from the
# aligned probes).
data <- topModelMatch(data, model)
# Build the design matrix from the aligned probes.
data <- addDesignMatrix(data, model)
# Regression expression over the independent features in the design matrix.
regressionFormula <- updatedRegressionFormula(data, model)
# Poisson regression with a log link on the probe counts.
fit <- glm(
  regressionFormula,
  data = data,
  family = poisson(link = "log")
)

# Copy the fitted coefficients back into the model object.
model <- addNewBetas(model, data, fit)
# Nucleotide Features after first round of fitting

# GABRIELLA: this plotting command is not working, can you fix it?
#plot(model, Nplot.ddG = TRUE, verticalPlots = TRUE)

# Start again from the full probe count table and refit with the updated
# model for up to two further rounds.
data <- data.probeCounts

data.nrow <- nrow(data)
for (i in 2:3) {
  # Re-match the probes against the current model and rebuild the design matrix.
  data <- topModelMatch(data, model)
  data <- addDesignMatrix(data, model)

  # Stop as soon as the number of rows no longer changes between rounds.
  if (data.nrow == nrow(data)) {
    print("Stability Reached")
    break
  }
  data.nrow <- nrow(data)

  # Refit the Poisson GLM on the new design matrix.
  regressionFormula <- updatedRegressionFormula(data, model)
  fit <- glm(
    regressionFormula,
    data = data,
    family = poisson(link = "log")
  )

  # Nucleotide features after the i'th round of fitting.
  model <- addNewBetas(model, data, fit)
}
## Warning: glm.fit: fitted rates numerically 0 occurred
# Print the fitted nucleotide coefficient matrix (rows N.A/N.C/N.G/N.T, one
# column per footprint position).
model@features@N@N.values
##               1           2          3         4         5          6
## N.A  0.04866333  0.05268261  0.0000000 -8.007723 -1.141010  0.0000000
## N.C -0.29778959 -0.22239446 -0.9323824 -7.813690 -7.686484 -7.3486044
## N.G  0.00000000  0.00000000 -0.5159343 -8.431065  0.000000  0.2000812
## N.T -0.25621848 -0.08685875 -0.3569731  0.000000 -8.066953 -8.0201639
##             7          8          9        10         11          12
## N.A -8.382165 -8.0709673 -0.8518232  0.000000 -1.0000000 -0.73702293
## N.C -7.601662 -1.3972185 -8.3405496 -1.000000 -1.0000000 -0.14461245
## N.G -7.793026 -0.6016221  0.0000000 -7.628075 -0.8703377 -0.07539009
## N.T  0.000000  0.0000000 -1.4875563 -7.653693  0.0000000  0.00000000
##             13
## N.A  0.0000000
## N.C -0.9846406
## N.G -0.1673405
## N.T -0.9563936

Save model object for future reference:

# Write the fitted model object to an .RData file in the working directory.
save(
  list = "model",
  file = "HowToFitMononucleotideModel.Result.RData"
)