library(dREG);
library(Rgtsvm);
library(snowfall);
library(parallel);

#source("../../script5/auc-pr.R");
source("/home/zw355/proj/prj10-dreg/script5/auc-pr.R");

eval_reg_svmx <- function(gdm, asvm, positions, bw_plus_path, bw_minus_path, batch_size=50000, ncores=3, use_rgtsvm=FALSE, use_snowfall=FALSE, scale.method="logistic", debug= TRUE) {

  if(batch_size>NROW(positions)) batch_size= NROW(positions)
  n_elem <- NROW(positions)
  n_batches <- floor(n_elem/batch_size)
  interval <- unique(c( seq( 1, n_elem+1, by = batch_size ), n_elem+1))

  pos.ord <- order(positions[,1], positions[,2], positions[,3]);
  pos.sorted <- positions[pos.ord,];

  if (use_rgtsvm)
  {
    if(!requireNamespace("Rgtsvm"))
      stop("Rgtsvm has not been installed fotr GPU computing.");

    predict = Rgtsvm::predict.gtsvm;
  }

  if( class(asvm)=="svm" && use_rgtsvm) class(asvm)<-"gtsvm";
  if( class(asvm)=="gtsvm" && !use_rgtsvm) class(asvm)<-"svm";

  do.predict <- function( mat_features ){
     if(asvm$type == 0) { ## Probabilistic SVM
       batch_pred <- predict( asvm, mat_features, probability=TRUE );
     }
     else { ## epsilon-regression (SVR)
       batch_pred <- predict( asvm, mat_features, probability=FALSE)
     }
     return(batch_pred);
  }

  ## Do elements of each intervals
  if(!use_rgtsvm)
  {
    scores<- unlist(mclapply(c(1:(length(interval)-1)), function(x) {
      ##print(paste(x, "of", length(interval)-1) );
      batch_idx <- c( interval[x]:(interval[x+1]-1) );
      feature <- read_genomic_data(gdm, pos.sorted[batch_idx,,drop=F], bw_plus_path, bw_minus_path, scale.method=scale.method);

      pred <- do.predict( feature );
      gc();
      return( pred );
    }, mc.cores= ncores))
  }
  else
  {
    n.loop <- ceiling((length(interval)-1)/ncores);
    scores <- unlist( lapply(1:n.loop, function(i) {
      n.start = (i-1)*ncores+1;
      n.stop = ifelse( length(interval)-1 <= i*ncores, length(interval)-1, i*ncores );

      feature_list <- list();
      if(!use_snowfall)
      {
         feature_list<- mclapply(n.start:n.stop, function(x) {
            ##print(paste(x, "of", length(interval)-1) );
            batch_indx<- c( interval[x]:(interval[x+1]-1) );
            return(read_genomic_data(gdm, pos.sorted[batch_indx,,drop=F], bw_plus_path, bw_minus_path, scale.method=scale.method));
         }, mc.cores= ncores);
      }
      else
      {
        if(!requireNamespace("snowfall"))
           stop("Snowfall has not been installed fotr big data.");

        pos.list = list();
        for(x in n.start:n.stop)
        {   ##print(paste(x, "of", length(interval)-1) );
            batch_indx<- c( interval[x]:(interval[x+1]-1) );
            pos.list[[x-n.start+1]] <- pos.sorted[batch_indx,,drop=F];
        }

        cpu.fun <- function(pos.bed)
        {
            library("dREG");

            cat("PID=", Sys.getpid(), "\n");
            return(read_genomic_data(gdm, pos.bed, bw_plus_path, bw_minus_path, scale.method=scale.method));
        }

	    if(ncores>1)
	    {
        	sfInit(parallel = TRUE, cpus = ncores, type = "SOCK" )
        	sfExport("gdm", "bw_plus_path", "bw_minus_path", "scale.method");

			fun <- as.function(cpu.fun);
			environment(fun)<-globalenv();

        	feature_list <- sfClusterApplyLB( pos.list, fun=fun);
        	sfStop();
	    }
	    else
        	feature_list <- lapply( pos.list, cpu.fun );
      }

      feature_list <- do.call("rbind", feature_list); gc(verbose=F, reset=T);

      pred <- do.predict( feature_list );

      rm( feature_list );

      return( pred );
     } ));
  }

  ## Test code
  ## all( pos.sorted[ order(pos.ord),  ] == positions );

  ## sort back the genome loci.
  scores <- scores[ order(pos.ord) ];

  return(as.double(scores))
}

get_test_set <- function(positions, positive.bed, negative.bed, n_samp, n_samp_neg) {
  #if(enrich_negative_near_pos < 0 | enrich_negative_near_pos > 1) stop('ERROR: enrich_negative_near_pos must be in the rage [0,1]!')
  #if(is.null(extra_enrich_bed)) {
  #  extra_enrich_frac=0
  #}
  #else {
  #  if(extra_enrich_frac < 0 | extra_enrich_frac > 1) stop('ERROR: extra_enrich_frac must be in the rage [0,1]!')
  #}

  ########################################
  ## Divide into positives and negatives.
  all_feat  <- feat( seqname= positions[,1], start= positions[,2], end= (positions[,3]) )
  positive_feat <- feat(seqname= positive.bed[,1], start= positive.bed[,2], end= positive.bed[,3])
  ol <- overlap.feat(x= all_feat, filter= positive_feat)
  pos_indx <- match(paste(ol$seqname, ol$start, ol$end), paste(all_feat$seqname, all_feat$start, all_feat$end))

  all_feat  <- feat( seqname= positions[,1], start= positions[,2], end= (positions[,3]) )
  negative_feat <- feat(seqname= negative.bed[,1], start= negative.bed[,2], end= negative.bed[,3])
  ol <- overlap.feat(x= all_feat, filter= negative_feat)
  neg_indx <- match(paste(ol$seqname, ol$start, ol$end), paste(all_feat$seqname, all_feat$start, all_feat$end))

  cat("Pos:", NROW(pos_indx), "Neg:", NROW(neg_indx), "Fuzzy:", NROW(positions)-NROW(neg_indx)-NROW(pos_indx), "\n");

  pos_indx_train_test <- sample(pos_indx, n_samp, replace=FALSE)
  neg_indx_train_test <- sample(neg_indx, n_samp_neg, replace=FALSE)

  return(cbind(positions[c(pos_indx_train_test, neg_indx_train_test),], c(rep(1,n_samp), rep(0,n_samp_neg)))) ## Train on the first n_train examples.
}

regulatory_svmx <- function(gdm, bw_plus_path, bw_minus_path, positions, positive.bed, negative.bed,
			allow= NULL,
			n_train=25000,
			n_eval=1000,
			pdf_path= "roc_plot.pdf",
			plot_raw_data=TRUE,
			file_raw_rdata=NULL,
			extra_enrich_bed= NULL,
			extra_enrich_frac= 0.1,
			enrich_negative_near_pos= 0.15,
			use_rgtsvm=FALSE,
			svm_type= "SVR",
			ncores=1,
			scale.method="logistic",
			...,
			debug= TRUE)
{
  ########################################
  ## Divide into positives and negatives.

  batch_size = 10000;

  if (use_rgtsvm)
  {
    if(!requireNamespace("Rgtsvm"))
      stop("Rgtsvm has not been installed fotr GPU computing.");

    predict = Rgtsvm::predict.gtsvm;
    svm = Rgtsvm::svm;
  }

  #if( class(asvm)=="svm" && use_rgtsvm) class(asvm)<-"gtsvm";
  #if( class(asvm)=="gtsvm" && !use_rgtsvm) class(asvm)<-"svm";

  n_train_pos <- round( n_train * 1.0 )
  n_train_neg <- round( n_train * g.neg.amp )
  n_eval_pos  <- round( n_eval * 1.0 )
  n_eval_neg  <- round( n_eval * g.neg.amp )

  indx_train <- c(1:n_train_pos, (n_train_pos + n_eval_pos + 1):(n_train_pos + n_eval_pos + n_train_neg) )
  indx_eval  <- c((n_train_pos+1):(n_train_pos+n_eval_pos), (n_train_pos + n_eval_pos + n_train_neg + 1):(n_train_pos + n_eval_pos + n_train_neg + n_eval_neg) )

  parallel_read_genomic_data <- function( x_train_bed, bw_plus_file, bw_minus_file, scale.method )
  {
      interval <- unique(c( seq( 1, NROW(x_train_bed)+1, by=batch_size ), NROW(x_train_bed)+1))
      feature_list<- mclapply(1:(length(interval)-1), function(x) {
            ##print(paste(x, "of", length(interval)-1) );
            batch_indx<- c( interval[x]:(interval[x+1]-1) );
            return(read_genomic_data(gdm, x_train_bed[batch_indx,,drop=F], bw_plus_file, bw_minus_file, scale.method=scale.method));
      }, mc.cores= ncores);

	  return( do.call("rbind", feature_list) );
  }

  ## Read genomic data.
  if(debug) print("Collecting training data.")

  x_train <- y_train <- x_predict <- y_predict <- NULL;
  x_train_bed <- x_predict_bed <- list();

	if(length(bw_plus_path) == 1) {
	    tset <- get_test_set(positions= positions, positive.bed=positive.bed, negative.bed=negative.bed, n_samp= n_train_pos + n_eval_pos, n_samp_neg= n_train_neg + n_eval_neg )

	    ## Get training indices.
	    x_train_bed[[1]] <- tset[indx_train,c(1:3)]
	    y_train <- tset[indx_train,4]
	    x_predict_bed[[1]] <- tset[indx_eval,c(1:3)]
	    y_predict <- tset[indx_eval,4]

		## Write out a bed of training positions to avoid during test ...
	    if(debug) {
	      write.table(x_train_bed[[1]], "TrainingSet.bed", quote=FALSE, row.names=FALSE, col.names=FALSE, sep="\t")
		  write.table(indx_train, "TrainIndx.Rflat")
	    }
		
		x_train <- c();
		max.sect <- ceiling(NROW(x_train_bed[[1]])/500000);
		for(i in 1:max.sect)
		{
			i.start <- (i-1)* 500000 + 1;
			i.end <-  i* 500000;
			if (i.end > NROW(x_train_bed[[1]]) ) i.end <- NROW(x_train_bed[[1]]);
		    x_train <- rbind( x_train, parallel_read_genomic_data( x_train_bed[[1]][i.start:i.end, ], bw_plus_path, bw_minus_path, ifelse(remove.q>0, "linear",scale.method) ) );
		    gc(reset=TRUE);
	    }

	    x_predict <- parallel_read_genomic_data( x_predict_bed[[1]], bw_plus_path, bw_minus_path, ifelse(remove.q>0, "linear",scale.method) );
	    gc(reset=TRUE);

		scaled <- TRUE;
	    if(remove.q>0)
	    {
			rf <- filter(x_train, y_train, remove.q);

	       	if(NROW(x_train)<=1000)
	       	{
	         	mat <- cbind(x_train, y_train, P=rf$log_pr, index=1:NROW(x_train));
	         	pdf("dreg.train.curve.pdf")
	         	#mat <- mat[-rf$idx.rem,];
	         	mat <- mat[order(mat[,"P"]),];
	         	mat <- mat[,c( 1:20, 41:60, 81:140, 201:240, 281:320, 21:40, 61:80, 141:200, 241:280, 321:363)]

	         	for(i in 1:NROW(mat))
	         	{
	      	   		outlier <-"";
	    	   		if( round(mat[i,"index"]) %in% rf$idx.rem) outlier <-"**";

	    	   		plot(1,1, xlim=c(1, NCOL(mat)/2 + 40), ylim=c(-max(mat[,1:360])/8,max(mat[,1:360])/8), xlab="Feature", ylab="Training data", type="n", main=paste("index=", i, outlier, "log(p)=", round(mat[i,"P"])  ) );
	    	   		lines(1:180, -1*mat[i,1:180], col="blue",  lwd=1);
	    	   		lines(1:180, mat[i,181:360],  col="red",  lwd=1);
	     	   		points( 200, mat[i,361]*20, pch=19, col="green");
	     	   		segments(20, -10, 20, 10, lty="22", lwd=1);
	    	   		segments(40, -10, 40, 10, lty="22", lwd=1);
	    	   		segments(100, -10, 100, 10, lty="22", lwd=1);
	    	   		segments(140, -10, 140, 10, lty="22", lwd=1);
	    	   		segments(180, -10, 180, 10, lty="22", lwd=1);
	         	}
	         	dev.off();
	       	}

	     	x_train <- x_train[-rf$idx.rem,];
	     	y_train <- y_train[-rf$idx.rem];

	     	if( scale.method != "linear" )
	     	{
	    		cat("scale linear -> logistic\n");
	     	   	x_train_bed[[1]] <- x_train_bed[[1]][-rf$idx.rem,];
	     	 	x_train <- parallel_read_genomic_data( x_train_bed[[1]], bw_plus_path, bw_minus_path, scale.method  );
	     	}
	    }

	}
	else
	{
    	stopifnot(NROW(bw_plus_path) == NROW(bw_minus_path) & NROW(bw_plus_path) == NROW(positive.bed));

		if(file.exists(file_raw_rdata))
		{
			cat("RAW data is extisting:", file_raw_rdata, "\n");
			load(file_raw_rdata);
			x_train <- x_predict <- c();
			for(x in 1:length(bw_plus_path)){
				cat("Training data set:",  bw_plus_path[[x]], "\n");
				x_train0 <- parallel_read_genomic_data( x_train_bed[[x]], bw_plus_path[[x]], bw_minus_path[[x]], ifelse(remove.q>0, "linear",scale.method) ) ;
				x_predict0 <- parallel_read_genomic_data( x_predict_bed[[x]], bw_plus_path[[x]], bw_minus_path[[x]], ifelse(remove.q>0, "linear",scale.method));
				x_train <- rbind( x_train,  x_train0 );
				x_predict <- rbind( x_predict, x_predict0);
			}
		}
		else
		{
			for(x in 1:length(bw_plus_path)){
				tset_x <- get_test_set(positions= positions[[x]], positive.bed= positive.bed[[x]], negative.bed=negative.bed[[x]], n_samp= n_train_pos + n_eval_pos , n_samp_neg= n_train_neg + n_eval_neg )

				x_train_bed[[x]] <- tset_x[ indx_train, c(1:3)]
				x_predict_bed[[x]] <- tset_x[ indx_eval, c(1:3)]
				y_train0 <- tset_x[indx_train,4];
				y_predict0 <- tset_x[indx_eval,4];
				x_train0 <- parallel_read_genomic_data( x_train_bed[[x]], bw_plus_path[[x]], bw_minus_path[[x]], ifelse(remove.q>0, "linear",scale.method) ) ;
				x_predict0 <- parallel_read_genomic_data( x_predict_bed[[x]], bw_plus_path[[x]], bw_minus_path[[x]], ifelse(remove.q>0, "linear",scale.method));

				if(remove.q>0)
				{
					rf <- filter(x_train0, y_train0, remove.q);
					x_train0 <- x_train0[-rf$idx.rem,];
					y_train0 <- y_train0[-rf$idx.rem];
					if( scale.method != "linear" )
					{
						x_train_bed[[x]] <- x_train_bed[[x]][-rf$idx.rem,];
						x_train0   <- parallel_read_genomic_data( x_train_bed[[x]], bw_plus_path[[x]], bw_minus_path[[x]], , ifelse(remove.q>0, "linear",scale.method) );
					}
				}

				x_train <- rbind( x_train,  x_train0 );
				x_predict <- rbind( x_predict, x_predict0);
				y_train <- c(y_train, y_train0);
				y_predict <- c(y_predict, y_predict0);
			}
		}

   		if(scale.method=="linear")
   			scaled <- TRUE
   		else
   			scaled <- FALSE;
  		}

  		if(!is.null(file_raw_rdata))
  		{
  			save(x_train_bed, x_predict_bed, x_train, y_train, x_predict, y_predict, file=file_raw_rdata);
  		}

  		gc();

  		#######################################
  		## Train the model.
  		if(debug) print("Fitting SVM.")
  		if (svm_type == "SVR")
  		{
  		  	if (g.svmtuning)
    		{
			   	tune.params <- svm_tuning(x_train, y_train, x_predict, y_predict, paste(file_raw_rdata, ".tuning.rdata", sep="") );
				cost <- tune.params$cost;
				gamma<- tune.params$gamma;
				epsilon<- tune.params$epsilon;
			}
			else
			{
				cost <- 1;
				gamma<- 1/NCOL(x_train);
				epsilon<- 0.1;
			}

    		if(debug) print("Training a epsilon-regression SVR.")

    		asvm <- Rgtsvm::svm( x_train, y_train, type="eps-regression", cost=cost, gamma=gamma, epsilon=epsilon, scale=scaled )
  		}

  		if (svm_type == "P_SVM")
  		{
    		if(debug) print("Training a probabilistic SVM.")

    		asvm <- Rgtsvm::svm( x_train, as.factor(y_train), probability=TRUE, scale=scaled )
  		}

  		gc(verbose=F, reset=T);

  		########################################
  		## If a PDF file is specified, test performance, and write ROC plots to a PDF file.
  		## Currently *NOT* supported when training with >1 dataset.
  		if(!is.null(pdf_path) && !is.na(pdf_path) && n_eval>0) {

  	  	pdf(pdf_path);
  	  	# Plot raw data, if desired.
	    if(plot_raw_data) {
      		plot_x_curve( x_train, y_train, ylab="Training data", title="All data", ...)
      		## Plot raw prediction data, if desired.
      		plot_x_curve( x_predict, y_predict, ylab="Prediction data", title="All data", ...)

      		if(length(bw_plus_path)>1)
      			for(i in 2:length(bw_plus_path))
      			{
      				plot_x_curve( scale(x_train)[(c(1:n_train)*2)+(i-1)*n_train*2, ], y_train[(c(1:n_train)*2)+(i-1)*n_train*2], ylab="Training data", title=bw_plus_path[1], ...)

       				## Plot raw prediction data, if desired.
       				plot_x_curve( scale(x_predict)[(c(1:n_eval)*2)+(i-1)*n_eval*2, ], y_predict[(c(1:n_eval)*2)+(i-1)*n_eval*2], ylab="Prediction data", title=bw_plus_path[1], ...)
      			}
    	}

    	## Predict on a randomly chosen set of sequences.
    	if(debug) print("Collecting predicted data.");
    	pred <- predict( asvm, x_predict );

    	## Write ROC plots.
    	roc_values <- logreg.roc.calc(y_predict, pred);
    	AUC<- roc.auc(roc_values);
    	roc.plot(roc_values, main=AUC, ...);
    	print(paste("Model ROC: ",AUC));
    	remove(roc_values);

		AUC <- round( auc_pr ( y_predict, pred ),3);
    	print(paste("Model PR: ",AUC));

		dev.off();

 	}

 	remove(x_train);
 	remove(x_predict);

  	return(asvm);
}

svm_tuning<-function(x_train, y_train, x_predict, y_predict, file.auc.rdata)
{
	ranges <- list(cost = c(0.1, 10, 100),
	gamma = c( 36/360, 6/360, 1/360, 1/360*1/4 ),
	epsilon = c(0.01, 0.1) )
	parameters <- expand.grid(ranges);
	pr.auc <- roc.auc<- c();
	for ( i in 1:NROW(parameters))
	{
		auc.i <- tryCatch(
		{
			asvm <- do.call( svm,  c(list( x_train, y=y_train), lapply(parameters[i,,drop = FALSE], unlist), type="eps-regression", scale=TRUE)  )
			y.pred <- predict.gtsvm(asvm, x_predict);
			roc.i <- roc.auc( logreg.roc.calc( y_predict, y.pred ) )
			pr.i <- auc_pr ( y_predict, y.pred );
			cat("Tuning param:cost/gamma/epsilon=", unlist(parameters[i,,drop = FALSE]), "AUC=",pr.i, roc.i,"\n");

			c( pr.i, roc.i );
		},
		error=function(cond) {
			 message(cond);
			 return(c(NA,NA));

		});

		pr.auc <- c(pr.auc, auc.i[1]);
		roc.auc <- c(roc.auc, auc.i[2]);
	}

	auc.svm <- cbind(parameters, PR=pr.auc, ROC=roc.auc);

	svm.def <- svm( x_train, y_train, type="eps-regression", scale=TRUE )
	y.pred <- predict(svm.def, x_predict);
	roc.def <- roc.auc( logreg.roc.calc( y_predict, y.pred ) )
	pr.def <- auc_pr ( y_predict, y.pred );

	auc.svm <- rbind(auc.svm ,c(1, 1/NCOL(x_predict), 0.1, PR=pr.def, ROC=roc.def));
	save(auc.svm, file=file.auc.rdata);

	idx.max <- which.max( auc.svm[,"PR"]+auc.svm[,"ROC"] );

	return( list(cost=auc.svm[idx.max,1], gamma=auc.svm[idx.max,2], epsilon=auc.svm[idx.max,3]) );

}

filter <- function(x_train, y_train, q=0.05)
{
	est_pr1 <- function(x)
	{
		x<-x*1e9;
		ppois( round(rowSums(x)),  median(rowSums(x)), log.p=T);
	}

	log_pr <- rep(NA, NROW(x_train));

	y0.idx <- which(y_train==0);
	log_pr0 <- est_pr1(  x_train[y0.idx,1:20] ) +
	   est_pr1( x_train[y0.idx, 21:40]) +
	   est_pr1( x_train[y0.idx, 41:40]) +
	   est_pr1( x_train[y0.idx, 61:40]) +
	   est_pr1( x_train[y0.idx, 81:140]) +
	   est_pr1( x_train[y0.idx, 141:200]) +
	   est_pr1( x_train[y0.idx, 201:240]) +
	   est_pr1( x_train[y0.idx, 241:280]) +
	   est_pr1( x_train[y0.idx, 281:320]) +
	   est_pr1( x_train[y0.idx, 321:360])

	y1.idx <- which(y_train==1);
	log_pr1 <- est_pr1( x_train[y1.idx,1:20] ) +
	   est_pr1( x_train[y1.idx, 21:40]) +
	   est_pr1( x_train[y1.idx, 41:40]) +
	   est_pr1( x_train[y1.idx, 61:40]) +
	   est_pr1( x_train[y1.idx, 81:140]) +
	   est_pr1( x_train[y1.idx, 141:200]) +
	   est_pr1( x_train[y1.idx, 201:240]) +
	   est_pr1( x_train[y1.idx, 241:280]) +
	   est_pr1( x_train[y1.idx, 281:320]) +
	   est_pr1( x_train[y1.idx, 321:360])

	log_pr[y1.idx] <- log_pr1;
	log_pr[y0.idx] <- log_pr0;

	q105 <- quantile(log_pr1, q);
	q095 <- quantile(log_pr0, 1-q);
cat("q=", q, "q .05 for positive=", q105, "q .95 for negative=", q095, "\n");
	idx.rem0 <- y0.idx[ which( log_pr0 > q095 ) ]
	idx.rem1 <- y1.idx[ which( log_pr1 < q105 ) ]

	return( list( idx.rem= c(idx.rem0,idx.rem1), log_pr=log_pr, q105=q105, q095=q095) );
}

plot_x_curve <- function(x_train0, y_train0, ylab, title, ...)
{
  colQuantile <- function(mat, probs)
  {
    unlist(apply(mat, 2, function(v){quantile(v, probs)}));
  }

  colMedian <- function(mat)
  {
    unlist(apply(mat, 2, function(v){median(v)}));
  }

  plot(1 ,1, xlim=c(1, NCOL(x_train0)), ylim=c(min(x_train0), max(x_train0)), ylab=ylab, main=title, type="n", ...);

  x <- c( seq(1, NCOL(x_train0)), seq(NCOL(x_train0),1,-1))
  y <- c( colQuantile(x_train0[ y_train0 == 1,], 0.45),
	             colQuantile(x_train0[ y_train0 == 1,], 0.55)[seq(NCOL(x_train0),1,-1)]);

  polygon(x, y, density = NULL, angle = 45,  border = rgb(1,0.6,0.6,0.2), col = rgb(1,0.6,0.6,0.2) );

  x <- c( seq(1, NCOL(x_train0)), seq(NCOL(x_train0),1,-1))
  y <- c( colQuantile(x_train0[ y_train0 == 0,], 0.45),
	             colQuantile(x_train0[ y_train0 == 0,], 0.55)[seq(NCOL(x_train0),1,-1)]);
  polygon(x, y, density = NULL, angle = 45,  border = rgb(0.6,0.6,1,0.2), col = rgb(0.6,0.6,1,0.2) );

  lines(1:NCOL(x_train0), colMedian(x_train0[ y_train0 == 1,]), col="red", lwd=1, lty=3, ...);
  lines(1:NCOL(x_train0), colMedian(x_train0[ y_train0 == 0,]), col="blue", lwd=1, lty=3, ...);

  lines(1:NCOL(x_train0), colMeans(x_train0[ y_train0 == 1,]), col="red", lwd=2, ...);
  lines(1:NCOL(x_train0), colMeans(x_train0[ y_train0 == 0,]), col="blue", lwd=2, ...);
}


dreg.train<-function( gdm, train.bigwig.plus, train.bigwig.minus, positive.bed, negative.bed, allow.bed, extra.enrich.bed, n.train, n.eval, file.pdf.data, file.raw.rdata, inf_positions=NULL, scale.method="logistic", ncores=15, use_rgtsvm=TRUE)
{
	## Train the SVM, Get informative positions.
	if(is.null(inf_positions))
		inf_positions <- lapply( 1:length(train.bigwig.plus), function(i){
			inf <- get_informative_positions( train.bigwig.plus[i], train.bigwig.minus[i], depth= 0, step=50, use_ANDOR=TRUE, use_OR=FALSE)
			print(paste("Number of inf. positions: ", NROW(inf)));
			return(inf);});

	n_bigwig <- length(train.bigwig.plus);

	if (n_bigwig==1)
		asvm <- regulatory_svmx( gdm,
				train.bigwig.plus,
				train.bigwig.minus,
				inf_positions[[1]],
				positive.bed,
				negative.bed,
				NULL, #allow.bed,
				pdf_path= file.pdf.data,
				n_train = n.train,
				n_eval = n.eval,
				file_raw_rdata = file.raw.rdata,
				extra_enrich_bed= NULL, #extra.enrich.bed,
				use_rgtsvm = use_rgtsvm,
				scale.method=scale.method,
				ncores = ncores,
				debug=TRUE)
	else
		asvm <- regulatory_svmx( gdm,
				train.bigwig.plus,
				train.bigwig.minus,
				inf_positions,
				positive.bed,
				negative.bed,
				allow = NULL, #lapply(1:n_bigwig, function(i){return(allow.bed)}),
				pdf_path= file.pdf.data,
				n_train = n.train,
				n_eval = n.eval,
				file_raw_rdata = file.raw.rdata,
				extra_enrich_bed = NULL, #lapply(1:n_bigwig, function(i){return(extra.enrich.bed)}),
				use_rgtsvm = use_rgtsvm,
				scale.method=scale.method,
				ncores = ncores,
				debug=TRUE)

	return(asvm);
}

pred_holdout <- function( positive.bed, tb.negative, asvm, gdm, infp_list, sample.ratio=0.1, ncores=16, scale.method="logistic", return.only.label=T)
{
	inf_positions <- infp_list$infp;
	if( sample.ratio != 1)
	   inf_positions <- inf_positions[ sample(1:NROW(inf_positions))[1:round(NROW(inf_positions)*sample.ratio)], ]

	pred <- eval_reg_svmx(gdm, asvm, inf_positions, infp_list$plus, infp_list$minus, batch_size=20000, ncores=ncores, use_rgtsvm=use_rgtsvm, use_snowfall=TRUE, scale.method=scale.method, debug= TRUE )
	pred_bed <- cbind( inf_positions, PRED=pred);

	all_feat <- feat( seqname= inf_positions[,1], start= inf_positions[,2], end= (inf_positions[,3]) )
	positive_feat <- feat(seqname= positive.bed[,1], start= positive.bed[,2], end= positive.bed[,3])
	ol <- overlap.feat(x= all_feat, filter = positive_feat)
	pos_indx <- match(paste(ol$seqname, ol$start, ol$end), paste(all_feat$seqname, all_feat$start, all_feat$end))

	all_feat <- feat( seqname= inf_positions[,1], start= inf_positions[,2], end= (inf_positions[,3]) )
	negative_feat <- feat(seqname= tb.negative[,1], start= tb.negative[,2], end= tb.negative[,3])
	ol <- overlap.feat(x= all_feat, filter = negative_feat)
	neg_indx <- match(paste(ol$seqname, ol$start, ol$end), paste(all_feat$seqname, all_feat$start, all_feat$end))

	y_true <- rep(NA, NROW(inf_positions));
	y_true[pos_indx] <- 1;
	y_true[neg_indx] <- 0;
	pred_bed <- cbind( pred_bed, "TRUE"=y_true);

	if(return.only.label)
		return( pred_bed [!is.na(pred_bed$"TRUE"),] )
	else
		return( pred_bed  )
}


plot_all_models <- function( pred_bed_list, file.png, labels, plot.type="pr", title )
{
	png( file.png, width=600, height=600)
	plot(1, 1, xlim=c(0,1), ylim=c(0,1), type="n");

	cols6 <- c("black", "red", "green", "blue", "orange", "pink", "darkgreen");

	AUC <- c();
	for(i in 1:length(pred_bed_list))
	{
		if(plot.type=="pr")
		{
			xy<- rocdf( pred_bed_list[[i]][,4], pred_bed_list[[i]][,5], type="pr") ;
			points(xy[,1], xy[,2], cex=0.5, col=cols6[i]);
			AUC <- c(AUC, round( auc_pr ( pred_bed_list[[i]][,5], pred_bed_list[[i]][,4]),3) );
		}
		else
		{
			xy<- rocdf( pred_bed_list[[i]][,4], pred_bed_list[[i]][,5], type="roc") ;
			points(xy[,1], xy[,2], cex=0.5, col=cols6[i]);

			AUC <- c(AUC, round( roc.auc( logreg.roc.calc(pred_bed_list[[i]][,5], pred_bed_list[[i]][,4]) ),3) );
		}
	}

	if(plot.type=="pr")
	{
		cat("PR AUC=", AUC, "\n");
		legend("topright",  legend=paste(labels, "AUC =", AUC), col = cols6, text.col = cols6 )
		title( main=paste(toupper(plot.type), ":", title), xlab="Recall", ylab="Precision");
	}
	else
	{
		cat("ROC AUC=", AUC, "\n");
		legend("bottomright", legend=paste(labels, "AUC =", AUC), col = cols6, text.col = cols6 )
		title( main=paste(toupper(plot.type), ":", title), xlab="False Positive Rate", ylab="True Positive Rate");
	}

	dev.off();

	return(AUC);
}

dregpipe<-function( gdm, train.bigwig.plus, train.bigwig.minus, n.train, n.eval, infp_list, infp_cur=NULL, prefix.fig="", pdf.title="", scale.method=NULL, positive.bed=NULL, negative.bed=NULL, ncores=5)
{
	if (is.null(positive.bed)) 
	{
	    load("k562.positive.bed.rdata");
	    positive.bed <- lapply(1:NROW(train.bigwig.plus), function(i){return(positive_bed);}) 
	}
	
	if (is.null(positive.bed)) 
	{
	    load("k562.negative.bed.rdata");
	    negative.bed <- lapply(1:NROW(train.bigwig.plus), function(i){return(negative_bed);}) 
    }
    
	extra_enrich_bed <- read.table( file.genecode.merge );
	allow_bed <- read.table( file.enh.prom.bed );

	file_pdf_data <- paste(prefix.fig, ".asvm.pdf", sep="");
	file_raw_rdata <- paste(prefix.fig, ".raw.rdata", sep="");

	if(is.null(scale.method)) scale.method <-"logistic";
    cat("scale.method=", scale.method, "\n");
    cat("remove.q=", remove.q, "\n");

	asvm <- dreg.train( gdm, train.bigwig.plus, train.bigwig.minus, positive.bed, negative.bed, allow_bed,  extra_enrich_bed, n.train, n.eval, file_pdf_data, file_raw_rdata, inf_positions=infp_cur, scale.method=scale.method, ncores=ncores)
	save(asvm, file=paste(prefix.fig, ".rdata", sep=""));

	pred_bed_list<-list();
	labels <- c();
	for(i in 1:length(infp_list) )
	{
		pred_bed_list[[i]] <- pred_holdout( positive.bed[[i]], negative.bed[[i]], asvm, gdm, infp_list[[i]],scale.method=scale.method);
		labels <- c(labels, paste("Holdout", i, sep=""));
	}

	save(asvm, pred_bed_list, file=paste(prefix.fig, ".rdata", sep=""));

	file_png <- paste(prefix.fig, ".hold.pr.png", sep="");
	AUC.pr <- plot_all_models ( pred_bed_list, file_png, labels=labels, plot.type="pr", title=pdf.title)
	file_png <- paste(prefix.fig, ".hold.roc.png", sep="");
	AUC.roc <- plot_all_models ( pred_bed_list, file_png, labels=labels, plot.type="roc", title=pdf.title)

	return(list(bigwig=train.bigwig.plus, prefix.fig=prefix.fig, sample=n.train, pr=AUC.pr, roc=AUC.roc))

}

# !!!important
make_positive_bed<-function()
{
	#using the peak calling by MACS and get the region containing at least one GRO-CAP sites on Dec 26
	positive_bed0 <- read.table( pipe(paste("bedtools intersect -a GSM646567_hg19_wgEncodeUwDgfK562Pk.macs2.narrowPeak -b hg19.k562.new_hmm2b.post2.bed -wa ", sep="")) );
	positive_bed1 <- read.table( pipe(paste("bedtools intersect -a wgEncodeOpenChromDnaseK562PkV2.narrowPeak -b GSM646567_hg19_wgEncodeUwDgfK562Pk.narrowPeak.txt ", sep="")) );

	tmp.file0 = tempfile(fileext = ".bed" );
	tmp.file1 = tempfile(fileext = ".bed" );
	write.table( positive_bed0, file=tmp.file0, quote=F, row.name=F, col.names=F, sep="\t" );
	write.table( positive_bed1, file=tmp.file1, quote=F, row.name=F, col.names=F, sep="\t" );

	positive_bed <- read.table( pipe(paste("bedtools intersect -a ", tmp.file0, " -b ", tmp.file1, sep="")) );
	save(positive_bed, file="k562.positive.bed.rdata");

	tmp.file2 = tempfile(fileext = ".bed" );
	system( paste("cat wgEncodeOpenChromDnaseK562PkV2.narrowPeak GSM646567_hg19_wgEncodeUwDgfK562Pk.narrowPeak.txt GSM646567_hg19_wgEncodeUwDgfK562Pk.macs2.narrowPeak > ", tmp.file2), intern=TRUE );
	negative_bed <- read.table( pipe(paste("sort-bed ", tmp.file2," | bedtools merge -i - -d 100 ", sep="" ) ) );
	negative_bed[,2] <- negative_bed[,2] - 100
	idx.mis <- which(negative_bed[,2]<0);
	if(length(idx.mis)>0) negative_bed[idx.mis,2] <- 0;
	negative_bed[,3] <- negative_bed[,3] + 100
	write.table( negative_bed, file=tmp.file2, quote=F, row.name=F, col.names=F, sep="\t" );
	negative_bed <- read.table( pipe(paste("bedtools complement -i ", tmp.file2, " -g chrom_info_hg19.tab ") ) );

	save(negative_bed, file="k562.negative.bed.rdata");
}

