
#
# Copyright (C) 2018-2019 Mario Abdelmessih
#
# This Source Code Form is subject to the terms of the Mozilla Public
# License, v. 2.0. If a copy of the MPL was not distributed with this
# file, You can obtain one at http://mozilla.org/MPL/2.0/.
#


##################################################
##################################################
FeatureGenerator <- function(TheSequences,SelectedFeat){
  # Builds the k-mer frequency feature matrix for a set of DNA sequences.
  #
  # Args:
  #   TheSequences: two-column table (data frame or matrix), one row per
  #                 sequence. Column 1 is an identifier (not used here),
  #                 column 2 is the DNA sequence.
  #   SelectedFeat: character vector of k-mers, each 1 to 8 letters long.
  #
  # Returns:
  #   A character matrix with one row per sequence and one named column per
  #   selected feature. Each cell holds the relative frequency (as.prob=TRUE)
  #   of that k-mer in the sequence, stored as text. Columns are grouped by
  #   k-mer length (1..8); within one length they keep their order from
  #   SelectedFeat. This is generally NOT the order of SelectedFeat itself,
  #   so callers should address columns by name.
  #
  # Biostrings is called through its namespace, so it must be installed but
  # is not attached to the search path.

  SelectedFeat=as.character(SelectedFeat)
  FeatLen=nchar(SelectedFeat)
  # Only k = 1..8 is tabulated below; reject anything else up front instead of
  # failing later with an obscure "replacement length" error.
  if(any(is.na(SelectedFeat) | is.na(FeatLen) | FeatLen<1 | FeatLen>8)){
    stop("SelectedFeat must only contain k-mers of length 1 to 8",call.=FALSE)
  }

  # Tabulate only the k-mer lengths that are actually requested (the 8-mer
  # table alone has 4^8 entries per sequence).
  UsedLen=sort(unique(FeatLen))
  FeatByLen=lapply(UsedLen,function(Len) SelectedFeat[FeatLen==Len])

  NumSeq=nrow(TheSequences)
  TrainData=matrix("",NumSeq,length(SelectedFeat))
  colnames(TrainData)=as.character(unlist(FeatByLen))

  # seq_len() makes an empty input produce an empty (but correctly named)
  # matrix instead of iterating over c(1,0).
  for(k in seq_len(NumSeq)){
    # as.character() guards against the sequence column being a factor.
    WindowSeq=Biostrings::DNAString(as.character(TheSequences[k,2]))
    RowFeat=lapply(seq_along(UsedLen),function(i){
      Freq=Biostrings::oligonucleotideFrequency(WindowSeq,UsedLen[i],step=1,as.prob=TRUE)
      Freq[FeatByLen[[i]]]
    })
    if(length(RowFeat)>0){
      TrainData[k,]=unlist(RowFeat)
    }
  }
  return(TrainData)
}
##################################################
##################################################
ModelCrossValidation <-function(SelectedFeat,DataFile="WindowSequences_Demo.csv",CV=5,NTree=500){
  # Cross-validates a random forest that predicts the WindowRESA column from
  # k-mer frequency features of the window sequences.
  #
  # Args:
  #   SelectedFeat: character vector of k-mer features to use.
  #   DataFile:     tab-separated file with (at least) the columns WindID,
  #                 WindowSeq and WindowRESA.
  #   CV:           number of folds (at least 2).
  #   NTree:        number of trees per forest.
  #
  # Returns:
  #   A list with
  #     $prediction        - matrix with one row per held-out observation;
  #                          column 1 is the observed value, column
  #                          "RFPredicted" the forest prediction.
  #     $FeatureImportance - named numeric vector (names = SelectedFeat) with
  #                          the first column of RF$importance averaged over
  #                          the folds.
  #
  # Each fold holds out floor(n/CV) observations not used as test data before;
  # when n is not a multiple of CV the leftover observations are never tested.

  # library() fails loudly on a missing package, where require() only warned.
  library("randomForest")
  library("doMC")
  library("foreach")
  library("doParallel")
  # NOTE(review): randomForest() is called directly below, not through
  # foreach, so this backend does not appear to be used here - confirm.
  registerDoMC()

  SlidingWindows=read.csv(file=DataFile,sep="\t")
  DataFeat=FeatureGenerator(SlidingWindows[,c("WindID","WindowSeq")],SelectedFeat)

  # Convert explicitly so a single-row input stays a matrix, and keep the
  # column names supplied by FeatureGenerator: its columns are grouped by
  # k-mer length, so relabelling them with SelectedFeat would attach the wrong
  # name to the values whenever SelectedFeat is not sorted by length.
  FeatMatrix=matrix(as.numeric(DataFeat),nrow=nrow(DataFeat),ncol=ncol(DataFeat),
                    dimnames=list(NULL,colnames(DataFeat)))
  if(!all(SelectedFeat %in% colnames(FeatMatrix))){
    stop("FeatureGenerator did not return a column for every selected feature",call.=FALSE)
  }
  FullData=cbind(RESAStability=as.numeric(SlidingWindows[,"WindowRESA"]),FeatMatrix)
  ModelCols=c("RESAStability",SelectedFeat)

  NumObs=nrow(FullData)
  FoldSize=floor(NumObs/CV)
  if(CV<2 || FoldSize<1){
    stop("Cross-validation needs CV >= 2 and at least CV observations",call.=FALSE)
  }

  TakenInd=integer(0)
  FoldPredictions=vector("list",CV)
  FeatImportance=matrix(0,CV,length(SelectedFeat))
  for(Fold in seq_len(CV)){
    # Index into the pool rather than calling sample(Pool, ...): sample()
    # treats a single number x as 1:x, which would re-draw used indices when
    # only one candidate is left.
    Pool=setdiff(seq_len(NumObs),TakenInd)
    TestInd=Pool[sample.int(length(Pool),FoldSize)]
    TrainInd=setdiff(seq_len(NumObs),TestInd)
    TakenInd=c(TakenInd,TestInd)

    # drop=FALSE keeps matrices intact for single-row folds / single features.
    TrainData=FullData[TrainInd,ModelCols,drop=FALSE]
    TestData=FullData[TestInd,ModelCols,drop=FALSE]
    ###########
    RF <- randomForest(TrainData[,-1,drop=FALSE], TrainData[,1], ntree=NTree,importance=TRUE)
    FeatImportance[Fold,]=RF$importance[,1]
    RFPredicted=predict(RF, TestData[,-1,drop=FALSE])
    FoldPredictions[[Fold]]=cbind(unname(TestData[,1]),RFPredicted)
    ############
  }

  CrossValidation=list()
  CrossValidation$prediction=do.call(rbind,FoldPredictions)
  CrossValidation$FeatureImportance=colMeans(FeatImportance)
  names(CrossValidation$FeatureImportance)=SelectedFeat
  return(CrossValidation)

}
##################################################
##################################################
ModelTraining <-function(SelectedFeat,DataFile="WindowSequences_Demo.csv",ModelFile="Trained_RFmodel_RESATotal_WT_.RData",NTree=500){
  # Trains a random forest on all window sequences and saves it to disk.
  #
  # Args:
  #   SelectedFeat: character vector of k-mer features to use.
  #   DataFile:     tab-separated file with (at least) the columns WindID,
  #                 WindowSeq and WindowRESA.
  #   ModelFile:    .RData file the fitted forest is saved to, under the
  #                 object name TrainedRF.
  #   NTree:        number of trees in the forest.
  #
  # Returns:
  #   NULL, invisibly. The function is called for its side effect of writing
  #   ModelFile.

  # library() fails loudly on a missing package, where require() only warned.
  library("randomForest")
  library("doMC")
  library("foreach")
  library("doParallel")
  # NOTE(review): randomForest() is called directly below, not through
  # foreach, so this backend does not appear to be used here - confirm.
  registerDoMC()

  SlidingWindows=read.csv(file=DataFile,sep="\t")
  DataFeat=FeatureGenerator(SlidingWindows[,c("WindID","WindowSeq")],SelectedFeat)

  # Convert explicitly so a single-row input stays a matrix, and keep the
  # column names supplied by FeatureGenerator: its columns are grouped by
  # k-mer length, so relabelling them with SelectedFeat would store wrong
  # variable names in the model whenever SelectedFeat is not sorted by length.
  FeatMatrix=matrix(as.numeric(DataFeat),nrow=nrow(DataFeat),ncol=ncol(DataFeat),
                    dimnames=list(NULL,colnames(DataFeat)))
  if(!all(SelectedFeat %in% colnames(FeatMatrix))){
    stop("FeatureGenerator did not return a column for every selected feature",call.=FALSE)
  }
  FullData=cbind(RESAStability=as.numeric(SlidingWindows[,"WindowRESA"]),FeatMatrix)

  # Select predictors by name; drop=FALSE keeps a matrix for a single feature.
  TrainedRF <- randomForest(FullData[,SelectedFeat,drop=FALSE], FullData[,"RESAStability"], ntree=NTree)
  save(TrainedRF,file=ModelFile)
  return(invisible(NULL))

}
##################################################
##################################################
######################################################################################################################################################
######################################################################################################################################################
# Load the selected features produced by the feature-selection filter procedure.
load("SelectedFeatures_RESATotal_WT_.RData")

# Cross-validate the model on the selected features.
CrossValidationResults <- ModelCrossValidation(SelectedFeat)

# Train the model on the full data set with the same features.
ModelTraining(SelectedFeat)

######################################################################################################################################################
######################################################################################################################################################
