# Processing HT-SELEX data
options(java.parameters = "-Xmx100000M")
library(SELEX)
library(SelexGLM)
library(grid)
# Directory handed to selex.config() as the SELEX working directory; created
# up front (silently if it already exists).
workDir <- "/bigjoe/liyang/SELEXseq/HT-SELEX/Ar_ESP_TCTAAT20NCG_4/"
dir.create(file.path(workDir), recursive = TRUE, showWarnings = FALSE)
selex.config(workingDir = workDir, maxThreadNumber = 800)

# Root of the HT-SELEX project and the folder holding the raw read files.
selexDir <- "/bigjoe/liyang/SELEXseq/HT-SELEX"
processedDataDir <- "/bigjoe/liyang/SELEXseq/HT-SELEX/Raw_read"

# Output folder, relative to selexDir. The leading "/" matters: later paths
# are built by plain concatenation of selexDir, saveDir and a file name.
saveDir <- "/Result/Ar_ESP_TCTAAT20NCG_4/Final"
dir.create(file.path(selexDir, saveDir), recursive = TRUE, showWarnings = FALSE)
# Register the round-0 (input library) sample with SELEX.
# NOTE(review): the trailing positional arguments (0, 20, '', '') look like
# round, variable-region length and two empty barcodes -- confirm against the
# selex.defineSample() signature.
selex.defineSample(
  'ZeroCycle_ES0_TCTAAT20NCG_0',
  paste0(processedDataDir, "/R0/ZeroCycle_ES0_TCTAAT20NCG_0.txt"),
  'ZeroCycle_ES0_TCTAAT20NCG_0',
  0, 20, '', ''
)

# Register the round-4 selected sample.
selex.defineSample(
  'Ar_ESP_TCTAAT20NCG_4',
  paste0(processedDataDir, "/AR/Ar_ESP_TCTAAT20NCG_4.txt"),
  'Ar_ESP_TCTAAT20NCG_4',
  4, 20, '', ''
)

# Handle on the round-0 sample, split into the train/test parts used for
# building and cross-validating the Markov model further down.
r0 <- selex.sample(
  seqName = 'ZeroCycle_ES0_TCTAAT20NCG_0',
  sampleName = 'ZeroCycle_ES0_TCTAAT20NCG_0',
  round = 0
)
r0.split <- selex.split(r0)
r0.train <- r0.split[["train"]]
r0.test <- r0.split[["test"]]

# Handle on the round-4 sample that the rest of the script analyses.
dataSample <- selex.sample(
  seqName = 'Ar_ESP_TCTAAT20NCG_4',
  sampleName = 'Ar_ESP_TCTAAT20NCG_4',
  round = 4
)

# MARKOV MODEL BUILT
# kmax is computed on the test split and passed as Kmax to both selex.mm()
# calls below.
kmax <- selex.kmax(sample = r0.test, threshold = 20)

# First pass with order = NA.
# NOTE(review): presumably this makes SELEX evaluate every candidate order so
# that selex.mmSummary() can compare them -- confirm against the SELEX docs.
mm <- selex.mm(sample = r0.train, order = NA, crossValidationSample = r0.test,
               Kmax = kmax, mmMethod = "DIVISION")
mmscores <- selex.mmSummary(sample = r0.train)
ido <- which(mmscores$R == max(mmscores$R))
mm.order <- mmscores$Order[ido]

# Show the summary table, then keep a copy for plotting.
selex.mmSummary()
mm.r2 <- selex.mmSummary()

# Bar plot of R per model order; the best-scoring bar(s) are drawn in red.
idx <- which(mm.r2$R == max(mm.r2$R))
colstring <- rep('BLUE', nrow(mm.r2))
colstring[idx] <- 'RED'
jpeg(filename = paste(selexDir, saveDir, "/Markov_model.jpeg", sep = ""))
barplot(height = mm.r2$R, names.arg = (mm.r2$Order), ylim = c(.90, 1),
        xpd = FALSE, col = colstring,
        xlab = "Markov Model Order", ylab = expression(Markov ~ Model ~ R^{2}))
dev.off()

# Rebuild the model at the best-scoring order. The order lives in the `Order`
# column of the summary (the same column used for the bar labels above and for
# mmscores); the previous `mm.r2$K` lookup did not refer to that column.
mm.order <- mm.r2$Order[idx]
mm <- selex.mm(sample = r0.train, order = mm.order,
               crossValidationSample = r0.test, Kmax = kmax,
               seqfilter = NULL, mmMethod = "DIVISION", mmWithLeftFlank = FALSE)

#build Table 1
# Observed vs. Markov-model-expected counts for k-mers of length kmax in the
# round-0 training split (k-mers with fewer than 50 counts are excluded via
# minCount).
table1 <- selex.counts(sample = r0.train, k = kmax, minCount = 50,
                       numSort = TRUE, markovModel = mm)
table1$Probability <- NULL
names(table1) <- c("Kmer", "R0Counts", "R0ExpectedCounts")
head(table1)
tail(table1)

#build markov model scatterplot
# Linear fit of observed on expected counts. The R^2 shown on the plot is
# taken from this fit rather than hard-coded, so the label always matches the
# data that was actually plotted.
fit <- lm(R0Counts ~ R0ExpectedCounts, data = table1)
mm.fit.r2 <- summary(fit)$r.squared
jpeg(filename = paste(selexDir, saveDir, "/Markov_model_k_mer_regression.jpeg", sep = ""))
plot(y = table1$R0Counts, x = table1$R0ExpectedCounts,
     xlab = "Expected Counts", ylab = "Observed Counts", col = 'BLUE')
abline(fit, col = 'RED', lwd = 2)
# legend() positions the label relative to the plot region, so it stays
# visible whatever the count range is (fixed data coordinates could fall
# outside the axes).
legend("bottomright", bty = "n", cex = 1.5,
       legend = as.expression(bquote(R^{2} ~ "=" ~ .(round(mm.fit.r2, 3)))))
dev.off()

#calculate information gain
# Evaluate information gain of the round-4 sample for k = 7..20 against the
# Markov model, print the summary table, and keep a copy for plotting.
selex.infogain(sample = dataSample, k = 7:20, markovModel = mm)
selex.infogainSummary()
infoscores <- selex.infogainSummary()

#information gain barplot
# One bar per k; the bar(s) with the highest information gain are red, all
# others blue.
idx <- which(infoscores$InformationGain == max(infoscores$InformationGain))
colstring <- replace(rep('BLUE', nrow(infoscores)), idx, 'RED')
jpeg(filename = paste0(selexDir, saveDir, "/Info_gain.jpeg"))
barplot(
  height = infoscores$InformationGain,
  names.arg = infoscores$K,
  col = colstring,
  xlab = "Oligonucleotide Length (bp)",
  ylab = "Information Gain (bits)"
)
dev.off()

# Variable-region length of the round-4 library, read from the sample
# attributes and coerced to a plain number.
libLen <- as.numeric(
  as.character(selex.getAttributes(dataSample)$VariableRegionLength)
)
# k-mer length used for the affinity table below.
kLen <- 15


# Probe counts for the round-4 sample; written to disk and immediately read
# back from the saved file.
data.probeCounts <- getProbeCounts(dataSample, markovModel = mm)
save(data.probeCounts, file = paste0(selexDir, saveDir, "/data.probeCounts.RData"))
load(file = paste0(selexDir, saveDir, "/data.probeCounts.RData"))

# k-mer count/affinity table (no minimum-count filter); same save-then-load
# round trip as above.
data.kmerTable <- getKmerCountAffinities(
  dataSample,
  k = kLen,
  minCount = 0,
  markovModel = mm
)
save(data.kmerTable, file = paste0(selexDir, saveDir, "/data.kmerTable.RData"))
load(file = paste0(selexDir, saveDir, "/data.kmerTable.RData"))

# Model definition. The flanking sequences, consensus and round are specific
# to this library and must be adapted when the script is reused for another
# data set.
ModelTest <- model(
  name = "Ar_ESP_TCTAAT20NCG_4_sym",
  varRegLen = libLen,
  leftFixedSeq = "CGACGCTCTTCCGATCTCTAAT",
  rightFixedSeq = "CGATCGTATGCCGTCTTCTGCTTGCCGACTCCG",
  consensusSeq = "RGWACANNNTGTWCY",
  affinityType = "AffinitySym",
  leftFixedSeqOverlap = 10,
  minAffinity = 0.01,
  missingValueSuppression = 0.5,
  minSeedValue = 0.01,
  upFootprintExtend = 4,
  confidenceLevel = 0.95,
  rounds = list(c(4)),
  rcSymmetric = TRUE,
  verbose = FALSE
)

summary(ModelTest)

# Nucleotide betas of the freshly constructed model (no seed PSAM yet).
print(getValues(getN(ModelTest)))

# Seed the model with a PSAM derived from the k-mer affinity table.
addSeedPsam(ModelTest) <- seedTable2psam(ModelTest, data.kmerTable)

# Nucleotide betas once the seed PSAM has been added.
print(getValues(getN(ModelTest)))
# NOTE(review): the title below names a different library than the sample
# analysed in this script -- confirm whether it is a copy-paste leftover.
plot(ModelTest@features@N, title = "AR_ESAD_TCTTCT20NCTG_R4_sym_Seed", ddG = FALSE)

# First fitting round: match the probe counts against the seeded model.
# NOTE(review): topModelMatch() presumably selects the best-scoring
# alignment per probe -- confirm against the SelexGLM documentation.
data <- topModelMatch(data.probeCounts, ModelTest)


# Uses aligned probes to build design matrix
data <- addDesignMatrix(data, ModelTest)
designMatrixSummary <- getDesignMatrix(ModelTest, data)
print("Round summary: ")
print(designMatrixSummary$Round)
print("View/strand orientation summary: ")
print(designMatrixSummary$Intercept)
print("Mono-nucleotide summary: ")
print(designMatrixSummary$N)

# Constructs regression expression with independent features using design matrix
regressionFormula <- updatedRegressionFormula(data, ModelTest)
print("Regression Formula: ")
print(regressionFormula)

# Poisson regression (log link) of the formula above on the design matrix.
fit <- glm(regressionFormula,
           data = data,
           family = poisson(link = "log"))
# Printed explicitly so the summary appears whether the script is run with
# Rscript or through source().
print(summary(fit))
ModelTest <- addNewBetas(ModelTest, data, fit)

# Nucleotide features after first round of fitting
summary(ModelTest)

pM <- plot(ModelTest, design = data, plotTitle = "Ar_ESP_TCTAAT20NCG_4_sym",
           Nplot.ddG = FALSE, verticalPlots = TRUE)

# The plot is written twice: modelPlot.pdf (latest state) and a numbered copy
# for this round. Arguments are named in full; the earlier form relied on
# `file` partially matching `filename` and on the plot falling through to the
# second positional parameter.
ggplot2::ggsave(filename = paste(selexDir, saveDir, "/modelPlot.pdf", sep = ""),
                plot = pM, height = 6, width = 6)

ggplot2::ggsave(filename = paste(selexDir, saveDir, "/modelPlot.", 1, ".pdf", sep = ""),
                plot = pM, height = 6, width = 6)

# Iterative refinement: start again from the full probe table, re-match it
# against the model fitted so far and rebuild the design matrix.
data <- data.probeCounts
data.nrow <- nrow(data)
data <- topModelMatch(data, ModelTest)
data <- addDesignMatrix(data, ModelTest)
designMatrixSummary.v2 <- getDesignMatrix(ModelTest, data)

# TRUE when two design-matrix summaries agree in their N, Round and Intercept
# components. Uses scalar `&&` because each all() already yields one value.
designSummariesMatch <- function(current, previous) {
  all(current$N == previous$N) &&
    all(current$Round == previous$Round) &&
    all(current$Intercept == previous$Intercept)
}

if (designSummariesMatch(designMatrixSummary.v2, designMatrixSummary)) {
  print("Stability Reached")
}

# Up to 19 further fitting rounds (numbered 2..20 to follow the first round).
for (i in 2:20) {
  # Stop as soon as re-matching no longer changes the number of rows.
  if (data.nrow == nrow(data)) {
    break
  }
  data.nrow <- nrow(data)
  print(paste("i =", i))

  designMatrixSummary <- getDesignMatrix(ModelTest, data)
  # cat() emits a real blank line; print("\n") would show the escaped string.
  cat("\n")
  print("Round summary: ")
  print(designMatrixSummary$Round)
  cat("\n")
  print("Mono-nucleotide summary: ")
  print(designMatrixSummary$N)
  cat("\n")
  print("View/strand orientation summary: ")
  print(designMatrixSummary$Intercept)

  # Constructs regression expression with independent features using design matrix
  regressionFormula <- updatedRegressionFormula(data, ModelTest)
  cat("\n")
  print("Regression Formula: ")
  print(regressionFormula)
  fit <- glm(regressionFormula,
             data = data,
             family = poisson(link = "log"))
  # Values are not auto-printed inside a loop, so print the summary explicitly.
  print(summary(fit))
  ModelTest <- addNewBetas(ModelTest, data, fit)

  # Nucleotide features after this round of fitting.
  # NOTE(review): left unwrapped on the assumption that the SelexGLM summary
  # method prints by itself -- confirm; if not, wrap it in print() as well.
  summary(ModelTest)

  pM <- plot(ModelTest, design = data, plotTitle = "Ar_ESP_TCTAAT20NCG_4_sym",
             Nplot.ddG = TRUE, verticalPlots = TRUE)
  # Numbered copy for this round plus modelPlot.pdf holding the latest state.
  ggplot2::ggsave(filename = paste(selexDir, saveDir, "/modelPlot.", i, ".pdf", sep = ""),
                  plot = pM, height = 6, width = 6)
  ggplot2::ggsave(filename = paste(selexDir, saveDir, "/modelPlot.pdf", sep = ""),
                  plot = pM, height = 6, width = 6)

  # Re-match with the updated model and compare against the previous summary.
  data <- topModelMatch(data, ModelTest)
  data <- addDesignMatrix(data, ModelTest)
  designMatrixSummary.v2 <- getDesignMatrix(ModelTest, data)
  print(paste("Number of Observations in Design Matrix: ", nrow(data), sep = ""))
  if (designSummariesMatch(designMatrixSummary.v2, designMatrixSummary)) {
    print(paste("Stability Reached after ", i, " iterations.", sep = ""))
    break
  } else if (nrow(data) == 0) {
    # Build the message with paste0(); passing the pieces straight to print()
    # would hand them to its `digits`/`quote` parameters and raise an error.
    print(paste0("Algorithm failed to converge: No probes meet the confidence level requirement (Confidence Level: ",
                 ModelTest@confidenceLevel, ")"))
    # Nothing left to fit on, so leave the loop instead of running glm() on an
    # empty design matrix in the next iteration.
    break
  }
}

# Persist the fitted model object next to the other results.
save(ModelTest, file = paste0(selexDir, saveDir, "/model.RData"))

# Export the transposed PSAM as a bare text table (no header, no row names,
# no quoting).
PSAM <- t(getPSAM(ModelTest))
write.table(
  PSAM,
  file = paste0(selexDir, saveDir, "/Ar_ESP_TCTAAT20NCG_4_sym_PSAM.txt"),
  quote = FALSE,
  row.names = FALSE,
  col.names = FALSE
)