# Processing AR R4 SELEX-seq data
options(java.parameters = "-Xmx70000M")
library(SELEX)
library(SelexGLM)
library(grid)
# Scratch directory handed to the SELEX package as its working directory;
# it is created first in case it does not exist yet.
workDir <- "/bigjoe/liyang/SELEXseq/SELEX_package_processed/AR_GR_final/AR_R4_processing/"
dir.create(workDir, showWarnings = FALSE, recursive = TRUE)
selex.config(workingDir = workDir, maxThreadNumber = 800)

# Project root and the folder holding the FASTQ files.
selexDir <- "/bigjoe/liyang/SELEXseq/SELEX_package_processed/AR_GR_final"
processedDataDir <- "/bigjoe/liyang/SELEXseq/SELEX_package_processed/AR_GR_final/Raw_reads"

# Output folder, expressed relative to selexDir. The leading "/" matters:
# later output paths are built by plain string concatenation.
saveDir <- "/Result/AR_R4_final"
dir.create(file.path(selexDir, saveDir), showWarnings = FALSE, recursive = TRUE)
# Register the round-0 library and the round-4 AR sample with SELEX.
# The positional arguments after the FASTQ path are presumably sample name,
# round, variable-region length, left barcode and right barcode -- confirm
# against the SELEX package documentation.
# NOTE(review): the R0 entry reads a file named GR_DBD_R0.fastq; presumably
# the AR and GR experiments share one initial library -- confirm.
selex.defineSample(
  "R0",
  paste0(processedDataDir, "/GR_DBD_R0.fastq"),
  "R0",
  0, 23, "", "TGGAA"
)

selex.defineSample(
  "AR_R4",
  paste0(processedDataDir, "/AR_DBD_R4.fastq"),
  "AR_R4",
  4, 23, "", "TGGAA"
)

# Sample handles: R0 is split into train/test parts for the Markov model,
# and the round-4 sample is the data that gets modelled.
r0 <- selex.sample(seqName = "R0", sampleName = "R0", round = 0)
r0.split <- selex.split(r0)
r0.train <- r0.split$train
r0.test <- r0.split$test
dataSample <- selex.sample(seqName = "AR_R4", sampleName = "AR_R4", round = 4)

# Markov model of the round-0 library ----
# kmax is derived from the R0 test split with a count threshold of 100, then
# passed to selex.mm(), which trains on the train split and cross-validates
# on the test split.
kmax <- selex.kmax(sample = r0.test, threshold = 100)
mm <- selex.mm(
  sample = r0.train,
  order = NA,
  crossValidationSample = r0.test,
  Kmax = kmax,
  mmMethod = "TRANSITION"
)

# Row(s) of the summary with the largest R value, and the matching order.
mmscores <- selex.mmSummary(sample = r0.train)
ido <- which(mmscores$R == max(mmscores$R))
mm.order <- mmscores$Order[ido]

# Bare call: at top level this echoes the summary table to the console.
selex.mmSummary()
mm.r2 <- selex.mmSummary()

idx <- which(mm.r2$R == max(mm.r2$R))

# Table 1: observed R0 k-mer counts (k = kmax, at least 100 counts) next to
# the counts expected under the Markov model.
table1 <- selex.counts(
  sample = r0.train,
  k = kmax,
  minCount = 100,
  numSort = TRUE,
  markovModel = mm
)
# Drop the probability column, then relabel the three remaining columns.
table1$Probability <- NULL
names(table1) <- c("Kmer", "R0Counts", "R0ExpectedCounts")
head(table1)
tail(table1)

# Information gain of the round-4 sample for k = 7..23
selex.infogain(sample = dataSample, k = 7:23, markovModel = mm)
selex.infogainSummary()
infoscores <- selex.infogainSummary()

# Row(s) of the summary with the highest information gain
idx <- which(infoscores$InformationGain == max(infoscores$InformationGain))

# Variable-region length read back from the sample attributes (converted via
# character so a factor value becomes its label, not its level code), and the
# k-mer length used when the k-mer affinity table is built.
libLen <- as.numeric(
  as.character(selex.getAttributes(dataSample)$VariableRegionLength)
)
kLen <- 15

# Probe counts and the k-mer affinity table for the round-4 sample.
# Each object is written to the output folder and immediately read back;
# presumably this lets the compute step be skipped on a re-run by commenting
# it out and keeping only the load() -- confirm intent.
data.probeCounts <- getProbeCounts(dataSample, markovModel = mm)
save(data.probeCounts, file = paste0(selexDir, saveDir, "/data.probeCounts.RData"))
load(file = paste0(selexDir, saveDir, "/data.probeCounts.RData"))

data.kmerTable <- getKmerCountAffinities(
  dataSample,
  k = kLen,
  minCount = 0,
  markovModel = mm
)
save(data.kmerTable, file = paste0(selexDir, saveDir, "/data.kmerTable.RData"))
load(file = paste0(selexDir, saveDir, "/data.kmerTable.RData"))

# Model definition ----
# The library-related inputs (variable-region length, fixed flanking
# sequences) are specific to this dataset and must be changed for others.
ModelTest <- model(
  name = "AR_DBD_R4_sym",
  varRegLen = libLen,
  leftFixedSeq = "GTTCAGAGTTCTACAGTCCGACGATC",
  rightFixedSeq = "TGGAATTCTCGGGTGCCAAGG",
  consensusSeq = "RGWACANNNTGTWCY",
  affinityType = "AffinitySym",
  leftFixedSeqOverlap = 10,
  minAffinity = 0.01,
  missingValueSuppression = 0.5,
  minSeedValue = 0.01,
  upFootprintExtend = 8,
  confidenceLevel = 0.95,
  rounds = list(c(4)),
  rcSymmetric = TRUE,
  verbose = FALSE
)

summary(ModelTest)

# Nucleotide betas before the seed PSAM is added
print(getValues(getN(ModelTest)))

# Seed the model with a PSAM derived from the k-mer affinity table
addSeedPsam(ModelTest) <- seedTable2psam(ModelTest, data.kmerTable)

# Nucleotide betas after the seed PSAM is added
print(getValues(getN(ModelTest)))
# First fitting pass ----
# Score the probes against the seeded model, then attach the design matrix
# built from those matches.
data <- topModelMatch(data.probeCounts, ModelTest)
data <- addDesignMatrix(data, ModelTest)

designMatrixSummary <- getDesignMatrix(ModelTest, data)
print("Round summary: ")
print(designMatrixSummary$Round)
print("View/strand orientation summary: ")
print(designMatrixSummary$Intercept)
print("Mono-nucleotide summary: ")
print(designMatrixSummary$N)

# Regression formula restricted to the independent features of the design
regressionFormula <- updatedRegressionFormula(data, ModelTest)
print("Regression Formula: ")
print(regressionFormula)

# Poisson regression with log link on the design matrix
fit <- glm(regressionFormula,
           data = data,
           family = poisson(link = "log"))
summary(fit)

# Write the fitted coefficients back into the model object
ModelTest <- addNewBetas(ModelTest, data, fit)
# Nucleotide features after the first round of fitting
summary(ModelTest)

pM <- plot(
  ModelTest,
  design = data,
  plotTitle = "AR_DBD_R4_sym",
  Nplot.ddG = FALSE,
  verticalPlots = TRUE
)

# The same plot is written twice: a "latest" file and a numbered copy for
# iteration 1.
ggplot2::ggsave(
  filename = paste0(selexDir, saveDir, "/modelPlot.pdf"),
  plot = pM, height = 6, width = 6
)

ggplot2::ggsave(
  filename = paste0(selexDir, saveDir, "/modelPlot.", 1, ".pdf"),
  plot = pM, height = 6, width = 6
)

# Iterative refinement ----
# Re-score all probes with the fitted model, rebuild the design matrix and
# refit until the design-matrix summaries stop changing (at most iterations
# 2..20; iteration 1 is the initial fit above).
data <- data.probeCounts
data.nrow <- nrow(data)
data <- topModelMatch(data, ModelTest)
data <- addDesignMatrix(data, ModelTest)
designMatrixSummary.v2 <- getDesignMatrix(ModelTest, data)

# Each all() yields a single logical, so the scalar && is the right operator.
if (all(designMatrixSummary.v2$N == designMatrixSummary$N) &&
    all(designMatrixSummary.v2$Round == designMatrixSummary$Round) &&
    all(designMatrixSummary.v2$Intercept == designMatrixSummary$Intercept)) {
  print("Stability Reached")
}

for (i in 2:20) {
  # Stop as soon as re-scoring leaves the number of rows in `data` unchanged.
  if (data.nrow == nrow(data)) {
    break
  }
  data.nrow <- nrow(data)
  print(paste("i =", i))

  designMatrixSummary <- getDesignMatrix(ModelTest, data)
  # cat() emits a real blank line; print("\n") would show the escaped string.
  cat("\n")
  print("Round summary: ")
  print(designMatrixSummary$Round)
  cat("\n")
  print("Mono-nucleotide summary: ")
  print(designMatrixSummary$N)
  cat("\n")
  print("View/strand orientation summary: ")
  print(designMatrixSummary$Intercept)

  # Regression formula restricted to the independent features of the design
  regressionFormula <- updatedRegressionFormula(data, ModelTest)
  cat("\n")
  print("Regression Formula: ")
  print(regressionFormula)
  fit <- glm(regressionFormula,
             data = data,
             family = poisson(link = "log"))
  # Values are not auto-printed inside a loop body, so print explicitly.
  print(summary(fit))
  ModelTest <- addNewBetas(ModelTest, data, fit)
  # Nucleotide features after this round of fitting.
  # NOTE(review): left unwrapped because it is unclear from this file whether
  # the summary method for the model class prints on its own -- confirm.
  summary(ModelTest)

  pM <- plot(
    ModelTest,
    design = data,
    plotTitle = "AR_DBD_R4_sym",
    Nplot.ddG = TRUE,
    verticalPlots = TRUE
  )
  # Numbered copy for this iteration, plus the "latest" file.
  ggplot2::ggsave(
    filename = paste0(selexDir, saveDir, "/modelPlot.", i, ".pdf"),
    plot = pM, height = 6, width = 6
  )
  ggplot2::ggsave(
    filename = paste0(selexDir, saveDir, "/modelPlot.pdf"),
    plot = pM, height = 6, width = 6
  )

  # Re-score and rebuild the design with the updated model.
  data <- topModelMatch(data, ModelTest)
  data <- addDesignMatrix(data, ModelTest)
  designMatrixSummary.v2 <- getDesignMatrix(ModelTest, data)
  print(paste0("Number of Observations in Design Matrix: ", nrow(data)))

  if (all(designMatrixSummary.v2$N == designMatrixSummary$N) &&
      all(designMatrixSummary.v2$Round == designMatrixSummary$Round) &&
      all(designMatrixSummary.v2$Intercept == designMatrixSummary$Intercept)) {
    print(paste0("Stability Reached after ", i, " iterations."))
    break
  } else if (nrow(data) == 0) {
    # Build the message with paste0(): passing the pieces straight to print()
    # binds them to its digits/quote arguments and raises an error.
    print(paste0(
      "Algorithm failed to converge: No probes meet the confidence level requirement (Confidence Level:",
      ModelTest@confidenceLevel, ")"
    ))
    # Nothing is left to fit; another pass would call glm() on empty data.
    break
  }
}

# Persist the final model object
save(ModelTest, file = paste0(selexDir, saveDir, "/model.RData"))

# Export the transposed PSAM as a plain text table (no header, no row names,
# no quoting).
PSAM <- getPSAM(ModelTest)
PSAM <- t(PSAM)
write.table(
  PSAM,
  file = paste0(selexDir, saveDir, "/AR_DBD_R4_sym_PSAM.txt"),
  col.names = FALSE,
  row.names = FALSE,
  quote = FALSE
)