# Enter df (dataframe) w/o quotes, "score" w quotes. score is continuous predictor. "test" w quotes.  test is categorical true result (often "RH", tp" or true positive in my data frames.) N is total number samples.
# Uses random sampling to get null distribution of AUC

auc_sample_2 <- function(df,score,test,N=100){

	# library(pROC)
	# rm(SUMM)
	SUMM <- data.frame() #initialize data frame
	n <- 0
	
	roc_test_1 <- roc(df[[test]], df[[score]]) #make dataframe w 2 cols, result and predictor. 
	auc <- auc(roc_test_1) #get observed roc
	print(paste0('observed auc is ', auc)) #print observed roc


	repeat {
	    n <- n + 1
	    
		b <- sample(c(1:nrow(df)), size = nrow(df), replace = FALSE) #make vector of shuffled row numbers from df
	
	    df_test <- df #copy df
	    df_test[[test]] <- df[b,][[test]] #replace result column of df copy with shuffled result column
	    # df_test[, colnames(df_test) != score] <- df[, colnames(df) != score]
	        
		roc_test <- roc(df_test[[test]], df_test[[score]],percent=FALSE) #get roc from df copy w shuffled result column
		auc_test <- auc(roc_test) #get shuffled auc
	
	    SUMMX <- data.frame(shuffle = n,  AUC = auc_test) #make dataframe (one row) w latest result
	    SUMM <- rbind(SUMM, SUMMX) #add row to initializer df
	    print(SUMM[n,]) #print latest
	
	
	    if (n == N) {
	        break
	    }
	}

	# return(SUMM) # use to analyze results by hand, if "Error in delongPlacements(roc)" bug occurs. Otherwise, comment out.
	
	dev.new()
	boxplot(SUMM$AUC, ylim = c(0, 1))
	
	sem <- function(x) {sqrt(var(x,na.rm=TRUE)/sum(!is.na(x)))}
	
	mean_auc_null_sample <- mean(SUMM$AUC,na.rm=TRUE)
	sem_auc_null_sample <-  sem(SUMM$AUC)
	sd_auc_null_sample <- sd(SUMM$AUC,na.rm=TRUE)
	
	cat(paste0('observed auc = ', auc))
	cat(paste0('\nmean null sample = ', mean_auc_null_sample))
	cat(paste0('\nsem null sample = ', sem_auc_null_sample))
	cat(paste0('\nsd null sample = ', sd_auc_null_sample))
	cat(paste0('\nn = ', n))


	p_upper <- (1-(length(SUMM[SUMM$AUC-auc < 0,]$AUC)/length(SUMM$AUC)))
		if(length(SUMM[SUMM$AUC-auc < 0,]$AUC) == length(SUMM$AUC)){
		   cat(paste0('\nP upper < ', 1/n))
		} else {
			cat(paste0('\nP upper = ', p_upper))
		}
	
	
	p_lower <- (1-(length(SUMM[SUMM$AUC-auc > 0,]$AUC)/length(SUMM$AUC)))
		if(length(SUMM[SUMM$AUC-auc > 0,]$AUC) == length(SUMM$AUC)){
		   cat(paste0('\nP lower < ', 1/n))
		} else {
			cat(paste0('\nP lower = ', p_lower))
		}

}

# --------------------- Calx -----------------------------


# roc.txt is from "roc_fig_2_ttest.R"

roc <- read.table("roc.txt",header=TRUE,sep="\t",stringsAsFactors=FALSE)

# takes ~ 20 min
HeLa <- auc_sample_2(roc[roc$cell=="HeLa",],score="score",test="RH",N=1e2)

sum(HeLa$AUC > 0.05)
# [1] 100

sum(HeLa$AUC > 0.5)
# [1] 86

# p seems wrong:
1-(86/100)
# [1] 0.14


# Using Wilcox (https://stats.stackexchange.com/questions/75050/in-r-how-to-compute-the-p-value-for-area-under-roc)
# One sided (alternative="less")
wilcox.test(score ~ RH, data=roc[roc$cell=="HeLa",], alternative="less")

	Wilcoxon rank sum test with continuity correction

data:  score by RH
W = 5685300, p-value = 1.334e-10
alternative hypothesis: true location shift is greater than 0


library(verification)
roc.area(obs=as.numeric(as.character(roc[roc$cell=="HeLa",]$RH)), pred=roc[roc$cell=="HeLa",]$score)
# $A
# [1] 0.5922358

# $n.total
# [1] 17648

# $n.events
# [1] 405

# $n.noevents
# [1] 17243

# $p.value
# [1] 1.037361e-10

# p val in roc.area uses wilcox.test, so not surprisingly, p vals agree


library(pROC)
roc(as.numeric(as.character(roc[roc$cell=="HeLa",]$RH)), roc[roc$cell=="HeLa",]$score)$auc
# Area under the curve: 0.5922


var(roc(as.numeric(as.character(roc[roc$cell=="HeLa",]$RH)), roc[roc$cell=="HeLa",]$score))
# [1] 0.0002229199

# bit surprising that pnorm is more conservative than Wilcox. One sided used.
pnorm(q=as.numeric(roc(as.numeric(as.character(roc[roc$cell=="HeLa",]$RH)), roc[roc$cell=="HeLa",]$score)$auc), mean = 0.5, sd = sqrt(var(roc(as.numeric(as.character(roc[roc$cell=="HeLa",]$RH)), roc[roc$cell=="HeLa",]$score))), lower.tail=FALSE)
# [1] 3.252592e-10

# bootstrap gives nearly identical values for variance
var(roc(as.numeric(as.character(roc[roc$cell=="HeLa",]$RH)), roc[roc$cell=="HeLa",]$score),method="bootstrap")
  # |===========================================================================================================================================================| 100%
# [1] 0.0002222616


# In the end, Wilcox seems to give most reliable values:



wilcox.test(score ~ RH, data=roc[roc$cell=="geneLength_cr",], alternative="less")

	# Wilcoxon rank sum test with continuity correction

# data:  score by RH
# W = 2106500, p-value < 2.2e-16
# alternative hypothesis: true location shift is less than 0


wilcox.test(score ~ RH, data=roc[roc$cell=="geneLength_cr",], alternative="less")$p.value
# [1] 3.33976e-76



wilcox.test(score ~ RH, data=roc[roc$cell=="geneLength_nc",], alternative="less")

	# Wilcoxon rank sum test with continuity correction

# data:  score by RH
# W = 5037500, p-value < 2.2e-16
# alternative hypothesis: true location shift is less than 0


wilcox.test(score ~ RH, data=roc[roc$cell=="geneLength_nc",], alternative="less")$p.value
# [1] 1.765718e-43





wilcox.test(score ~ RH, data=roc[roc$cell=="HeLa",], alternative="less")

	# Wilcoxon rank sum test with continuity correction

# data:  score by RH
# W = 2847600, p-value = 1.037e-10
# alternative hypothesis: true location shift is less than 0




wilcox.test(score ~ RH, data=roc[roc$cell=="160",], alternative="less")

	# Wilcoxon rank sum test with continuity correction

# data:  score by RH
# W = 3490200, p-value = 1.792e-08
# alternative hypothesis: true location shift is less than 0



wilcox.test(score ~ RH, data=roc[roc$cell=="995",], alternative="less")

	# Wilcoxon rank sum test with continuity correction <<<<<<<< omit from ROC graph, even though significant by t test

# data:  score by RH
# W = 162710, p-value = 0.1191
# alternative hypothesis: true location shift is less than 0



wilcox.test(score ~ RH, data=roc[roc$cell=="RH_exp",], alternative="less")

	# Wilcoxon rank sum test with continuity correction    <<<<<<<<< use in paper

# data:  score by RH
# W = 2865800, p-value = 0.0004881
# alternative hypothesis: true location shift is less than 0




# bootstrap of weakest p val
 var(roc(as.numeric(as.character(roc[roc$cell=="RH_exp",]$RH)), roc[roc$cell=="RH_exp",]$score),method="bootstrap")
  # |===========================================================================================================================================================| 100%
# [1] 0.000198544

# Similar to Wilcox
pnorm(q=as.numeric(roc(as.numeric(as.character(roc[roc$cell=="RH_exp",]$RH)), roc[roc$cell=="RH_exp",]$score)$auc), mean = 0.5, sd = sqrt(0.000198544), lower.tail=FALSE)
# [1] 0.0002644269




















