## usage ###
## TO BE USED IN SPLUS###
## Input is a dataframe with chromosome and ratio in two columns.
## Example -- 
# output_cancer.seg(df,chrom.col="CHROM",ratio.col="RATIO")
## output will have two objects - one is the list of segments and the second is the decompressed version of the segments to make it easier to plot


cancer.seg_function(df.name, chrom.col = "CHROM", ratio.col = "GEOMEAN.RATIO", stringency1= 10^-5, ratio=1, k=3, nrandoms=3){
		bt_proc.time()
	

	chrom.places_make.chrom.place.1(df.name, col.name = chrom.col)[[1]]
	chrom.places_sort(union(chrom.places[-1],1+chrom.places[-length(chrom.places)]))
	n.chrom_length(chrom.places)/2
	places_c()
	

	for (i in 1:n.chrom){
			int_chrom.places[2*i-1]:chrom.places[2*i]
			df.temp_df.name[int,]

			z_stringent.segment.T.1(df.temp, chrom.col, ratio.col, stringency1, use.chrom=F)
			
			places_c(places,z[[1]][[2]]+int[1]-1)
	
	}
	
	places_sort(union(places[-1],1+places[-length(places)]))
	broad.places_sort(union(places,chrom.places))

	
	n.broad_length(broad.places)/2
	lval_log(df.name[,ratio.col])		
	
	mean.broad_rep(0,n.broad)
	median.broad_rep(0,n.broad)
	std.broad_rep(0,n.broad)
	up.down.broad_rep(0,n.broad)
	length.broad_rep(0,n.broad)
	id.to.broad_rep(0,n.broad)
	
	mean.broad.long_rep(0,length(lval))
	median.broad.long_rep(0,length(lval))
	std.broad.long_rep(0,length(lval))
	up.down.probes_rep(0,length(lval))
	id.to.broad.long_rep(0,length(lval))
	probe.num_rep(0,length(lval))

	for (i in 1:n.broad){
		int_broad.places[2*i-1]:broad.places[2*i]
		mean.broad[i]_mean(lval[int])
		median.broad[i]_median(lval[int])
		length.broad[i]_length(int)
		std.broad[i]_stdev(lval[int])
		up.down.broad[i]_ifelse(mean.broad[i]>0,1,-1)
		id.to.broad[i]_i
		
		mean.broad.long[int]_mean.broad[i]
		median.broad.long[int]_median.broad[i]
		std.broad.long[int]_std.broad[i]
		up.down.probes[int]_up.down.broad[i]
		id.to.broad.long[int]_i
		probe.num[int]_int
	}


	broad.short.table_data.frame("id.to.broad"=id.to.broad,"mean.broad"=mean.broad,"std.broad"=std.broad,"length.broad"=length.broad,"sign.broad"=up.down.broad)
	broad.long_data.frame("probe.num"=probe.num,"id.to.broad"=id.to.broad.long,"lval"=lval,"mean.broad"=mean.broad.long,"std.broad"=std.broad.long,"median.broad"=median.broad.long,"sign.broad"=up.down.probes)
	
	return(broad.short.table,broad.long)
}

#################################New Stringent Segment############################################################

stringent.segment.T.1_function(df.name, chrom.col = "CHROM", ratio.col = "GeoMeanRatio", 
	stringency1= 10^-5,use.chrom = T)
{
	#this version stops at one stringency view
	
	bt_proc.time()
#	cat("starting stringent.segment.T.1\n")
	#meant for tumor data
	
	#probe data must be chromosome position order
	#if it is not, sort your data sheet...
	
	#column containing chromosome will be assumed to be "CHROM"
	#unless you specify otherwise
	#as a column number or a quoted string
	
	#similarly, ratio values will be assumed to be in column named "GeoMeanRatio"
	#unless otherwise specified
	#as a column number or a quoted string
	
	#and the vector of logarithmic ratios
	lval_log(df.name[,ratio.col])
	
	#make place vector for chromosome boundaries
	if (use.chrom == T) {
		ans_make.chrom.place.1(df.name, col.name = chrom.col)
		place_ans[[1]]
		chrom.names_ans[[2]]
	}	
	if (use.chrom == F) {
		place_c(0,length(lval))
		chrom.names_NA}
	#run the program
	z_assemble.4.T.2(lval, place = place, stringency1 = stringency1)
	
	#now make a dataframe with all the relevant values (long stats):
	#broad means and medians
	#and companion short stats:
	#breaks,intervals,maybe positions,means,standard deviations
	
	
	
#	cat("Total Time=... ",proc.time()-bt,"\n")
	return(z)
}





assemble.4.T.2 _ function(lval, place = c(0, length(lval)), seglen = 100, 
	stringency1 = 10^-5, n = 5, minlen = 6)
{
#modified assemble.4.T.1 to do just one stringency	
#assumed for use with tumors
	t1 <- proc.time()
#	cat("starting assemble.4.T.2\n")
	places.1 <- seg.n.merge.2(lval, place = place, seglen = seglen, stringency1 = stringency1, 
		minlen = minlen)[[3]]
#seg.n.merge will require surgery to use dynamic programming to accelerate minimization of variance
	places.2 <- check.spots(lval, places.1, upto = 2, minlen)
#places contains the starts of new segments, by probe index
#if needed, can always break places into immutable chromosome boundaries and the rest
	places.3 _ sort(union(places.2, place))
#cat("places.3=... ",places.3,"\n")
	stats.1 <- make.stats.only.1(lval, places.3)

	tt <- proc.time() - t1
#	cat("total time elapsed ... ", tt, "\n")	
#	df.ret.test_
	
	return(list("places" = list("original.places"=place,
							"broad.places" = places.3), 
		"long.stats"=stats.1[[1]],
		"short.stats"=stats.1[[2]]))
		

}




make.stats.only.1 _ function(lval, place = c(0, length(lval)) )
{
	#just makes stats given place
	
	#NOTE: stats are based on log ratios, and a case can be made
	#that stats should be based on ratios
	
	bt <- proc.time()
#	cat("starting make.stats.only.1\n")
	
	len <- length(lval)
	len.place <- length(place)-1
	
#	cat("len",len,len.place,"\n")
	loc_rep(0,len)
	val.mean <- rep(0, len)
	val.median <- rep(0, len)
	val.stdev <- rep(0, len)
	
	loc.start_rep(0,len.place)
	loc.end_rep(0,len.place)
	short.mean <- rep(0, len.place)
	short.median <- rep(0, len.place)
	short.stdev <- rep(0, len.place)
	
	for(i in 2:(1+len.place)) {	

#i believe this is the needed correction
		int <- (place[i - 1] + 1):place[i]
		
		loc.start[i-1]<-place[i-1]
		loc.end[i-1]<-place[i]
		short.mean[i-1] <- exp(mean(lval[int]))
		short.median[i-1] <- exp(median(lval[int]))
		short.stdev[i-1] <- exp(stdev(lval[int]))
		
#		cat(place[i-1],place[i],short.mean[i-1],short.stdev[i-1],"\n")
		loc
		val.mean[int] <- short.mean[i-1] 
		val.median[int] <- short.median[i-1] 
		val.stdev[int] <- short.stdev[i-1] 
	}
	z1 <- data.frame(val.mean = val.mean, val.stdev = val.stdev, val.median = val.median)
	z2 <- data.frame(loc.start=loc.start,loc.end=loc.end, short.mean = short.mean, short.stdev = short.stdev, short.median = short.median)
	stats <- list(long = z1, short = z2)
#	cat("elapsed time is... ", proc.time() - bt, "\n")
	return(stats)
}	
	


seg.n.merge.2 _ function(lval, place = c(0, length(lval)), 
	seglen = 100, stringency1 = 10^-5, minlen = 6)
{
#altered to accomodate chromosome breaks
#altered to consider single stringency and dynamic programming
	all.place <- segmate.2(lval, place = place, seglen, stringency = stringency1, minlen = minlen)
	merged.place <- merge.segs(lval, all.place, stringency = stringency1, minlen = minlen)
	list(all.place, merged.place[[1]], merged.place[[2]])
}





segmate.2 _ function(lval, place = c(0, length(lval.sim)), 
	stringency = 10^-5, seglen = 100, minlen = 6)
{
#altered to accomodate chromosome breaks
#first, breaks data into seglen chunks and finds breaks
#then removes artificial breaks
#then repeats act with seglen chunks offset by seglen/2
#finally combines all the breaks
#with given breaks (place) to accomodate chromosome segmentation
#and asks finally for any breaks that were missed
#all breaks are right most bound of interval
	make.var.sum.val(lval)  #works with all.tile to give dynamic programming speed to minimization of variance
	len <- length(lval)
	place1 <- place	#note: all.tile.5 will make the default place vector
	seg2 <- seglen * (1:(floor(len/seglen)))
	place2 <- sort(union(place1, seg2))	#adds chromosome boders if any, plus first and last place
	place2 <- all.tile.5(lval, place2, stringency = stringency, minlen = minlen)
	place2 <- setdiff(place2, seg2)
	seg3 <- floor(seglen/2) + seglen * (1:(-1 + floor(len/seglen)))
	place3 <- sort(union(place1, seg3))	#adds chromosome boders if any, plus first and last place
	place3 <- all.tile.5(lval, place3, stringency = stringency, minlen = minlen)
	place3 <- setdiff(place3, seg3)
	all.place <- sort(union(place1, union(place2, place3)))	#adds chromosome boders if any 
	all.place <- all.tile.5(lval, all.place, stringency = stringency, minlen = minlen)
	all.place
}

################################# unchanged accessories #############################################################
make.chrom.place.1_function(df, col.name = "CHROM")
{
	chrom.names <- union(c(),df[, col.name])
	chrom.bounds <- match(chrom.names,df[,col.name])
	place <- c(0,chrom.bounds[-1]-1,nrow(df))
	list(place,chrom.names)
}





check.spots _ function(lval, merged.place, upto = 2, minlen = 6)
{
	bt_proc.time()
#	cat("starting check.spots\n")
	len <- length(merged.place)
	if(len == 2) {
		return(merged.place)
	}
	while(upto < len) {
		if(len == 2) {
			return(merged.place)
		}
		i <- upto
		testval <- lval[(1 + merged.place[i - 1]):merged.place[i + 1]]
		spot <- merged.place[i] - merged.place[i - 1]
		true.spot <- borderwar(testval, spot, minlen = minlen)
		merged.place[i] <- true.spot + merged.place[i - 1]	#  cat(i,true.spot,"\n")
		if(true.spot < (minlen/2) | (true.spot > (1 + length(testval) - (minlen/2)))) {
#then we eliminate that boundary
			upto <- upto - 1
			merged.place <- merged.place[ - i]
			len <- len - 1
		}
		upto <- upto + 1
	}
#	cat("elapsed time is... ",proc.time()-bt,"\n")
	merged.place
}




merge.segs _ function(lval, all.place, stringency = 0.005, minlen = 6)
{
	bt_proc.time()
#	cat("starting merge.seg \n")	#note: this is my first recursive function 
#warning: it is not really.... it is really a while loop
#and this can get us in trouble some day
	len <- length(all.place)
	p <- rep(0, len)
	if(len == 2) {
		return(list(p, all.place))
	}
	for(i in 2:(len - 1)) {
		p[i] <- ks.gof(lval[(all.place[i - 1] + 1):all.place[i]], lval[(all.place[i] + 1):all.place[i + 1]])[[2]]	
	##########
	}
#cat("merge ", "stringency= ", stringency,"\n")
#cat(round(p,6),"\n")
#rule:
#if p[i]>stringency
#delete i if p[i]=max(p[(i-1):(i+1)])
#repeat until all p[i]<=stringency
	count <- 0
	elim <- c()
	max.p <- max(p)
	if(max.p > stringency) {
		count <- 1
		for(i in 2:(len - 1)) {
			if(p[i] > stringency & p[i] == max(p[(i - 1):(i + 1)])) {
				elim <- c(elim, i)
			}
		}
	}
	if(count == 1) {
		all.place <- all.place[ - elim]
		ans <- merge.segs(lval, all.place, stringency = stringency, minlen = minlen)
		all.place <- ans[[2]]
		p <- ans[[1]]
	}
#	cat("elapsed time is... ",proc.time()-bt,"\n")
	list(round(p, 4), all.place)
}




borderwar _ function(lval, spot, dir = 0, minlen = 6, num.o.calls = 0)
{
#	bt_proc.time()
#	cat("starting borderwar \n")
	lim.o.calls <- 200
	while(num.o.calls < lim.o.calls) {
		len <- length(lval)
		if(spot < (minlen/2)) {
#			cat("elapsed time is... ",proc.time()-bt,"\n")
			return(0)
		}
#the break dissappeared
		if(spot > (1 + len - (minlen/2))) {
#			cat("elapsed time is... ",proc.time()-bt,"\n")
			return(len)
		}
#the break dissappeared  
		x <- lval[spot]
		mu.A <- mean(lval[1:(spot - 1)])
		sig.A <- stdev(lval[1:(spot - 1)])
		mu.B <- mean(lval[(spot + 1):len])
		sig.B <- stdev(lval[(spot + 1):len])
		p <- comp.p(x, mu.A, sig.A, mu.B, sig.B)
#cat("lval, spot=...",lval,spot,"\n" )
#cat("mu.A, sig.A, mu.B, sig.B=... ",mu.A, sig.A, mu.B, sig.B,"\n")
#cat("p=... ",p,"\n")
		if(x <= mu.A & mu.A <= mu.B) {
			p <- 1
		}
		if(x >= mu.A & mu.A >= mu.B) {
			p <- 1
		}
		if(x <= mu.B & mu.B <= mu.A) {
			p <- 0
		}
		if(x >= mu.B & mu.B >= mu.A) {
			p <- 0
		}
		if(p < 0.5) {
#ie spot belongs to B, we had it wrong
			if(dir < 1) {
#and if moving backwards, keep on going
				dir <- -1	
				num.o.calls <- num.o.calls + 1
				spot <- spot - 1
				next
			}
			if(dir == 1) {
#if we were moving forward
#then we have grabbed a probe too many
#and the previous probe was the end point
#of the A segment
#				cat("elapsed time is... ",proc.time()-bt,"\n")
				return(spot - 1)
			}
		}
		if(p >= 0.5) {
#spot does belong to A
			if(dir > -1) {
#if we are moving forward
#lets grab the next spot
				dir <- 1
				num.o.calls <- num.o.calls + 1
				spot <- spot + 1
				next
			}
			if(dir == -1) {
#if we were backing up
#we can finally stop
#and return spot
#				cat("elapsed time is... ",proc.time()-bt,"\n")
				return(spot)
			}
		}
	}
#	cat("limit on border wars breached\n")
#	cat("elapsed time is... ",proc.time()-bt,"\n")
	return(spot)
}




comp.p _ function(x, mu.A, sig.A, mu.B, sig.B)
{
	if (sig.A*sig.B==0){return(0)}
	pA <- (1/sig.A) * exp(-0.5 * ((x - mu.A)/sig.A)^2)
	pB <- (1/sig.B) * exp(-0.5 * ((x - mu.B)/sig.B)^2)
	p <- pA/(pA + pB)
	p
}

########################################### all.tile.5 ###########################################################


all.tile.5	 _ function(lval, place = c(0, length(lval)), upto = 1, stringency = 0.005, minlen = 6)
{
#mods that can be made
#1.	instead of calling make.var.sum.val once per all.tile, make it once per overall run
#2.	instead of computing cumsum for all lval, do so in overlapping chunks of lval
#	because accuracy will increase
#3.	dont compute minvar for every  position, do so for every third or forth, then track inward	

#	bt <- proc.time()
#	cat("starting all.tile.5\n")
	len <- length(lval)
#	make.var.sum.val(lval)  	#moved to higher level "segmate.2"
#	cat((place[upto] < (len - minlen)),"\n")
	while(place[upto] < (len - minlen)) {
#  cat(upto,"\n")
		spot <- find.left.break.5(lval, place, upto, stringency = stringency, minlen = minlen)	
#		  cat(spot,"\n")
		if(spot == place[upto + 1]) {
			upto <- upto + 1
			if(upto == length(place)) {
				break
			}
			next
		}
		place <- sort(c(place, spot))
	}
#	cat("time elapsed ", et <- proc.time() - bt, "\n")
	place
}



find.left.break.5 _ function(lval, place, upto, stringency, minlen)
{
#		cat("starting find.left.break.5\n")

#assumes function crack
#that provides a valid break...
#with place[upto] the index of leftmost tamed border of lval
#returns new place if there is a new break or else place[upto+1]
#will not try to break a tile shorter than minlen
#and will judge significance after ntries
	if((place[upto+1] - place[upto]) <= 2*ceiling(minlen/2)){
#wont break a piece smaller than 2*ceiling(minlen/2) 
		return(place[upto + 1])
	}
#	ans <- crack.5(lval[(place[upto] + 1):place[upto + 1]], stringency, minlen)
	ans _ crack.5(lval,place[upto]+1,place[upto+1],stringency,minlen)
	if(ans == 0) {
		return(place[upto + 1])
	}
	else {
		return(ans)
	}
}


crack.5 _ function(lval, I, J, stringency, minlen)
{
#	cat("starting crack.5\n")
	ans <- apply.divide.5(lval, I, J, minlen)[[1]]
	pval <- ks.gof(lval[(I+1):ans], lval[(1 + ans):J])[[2]]
	if(pval > stringency) {
		return(0)
	}
	return(ans)
}



apply.divide.5 _ function(lval, I, J, minlen)
{
#	cat ("starting apply.divide.5\n")
	#no segment smaller than ceiling(minlen/2) can be made
	minsum <- maxsum <- var.sum(lval,(I+1),J)	
	for(i in (I + ceiling(minlen/2)):(J - ceiling(minlen/2))) {
		sum.i <- var.sum(lval,(I+1),i) + var.sum(lval,(i+1),J)
		if(sum.i <= minsum) {
			index <- i
			minsum <- sum.i
		}
	}
	list(index, maxsum, minsum)
}

make.var.sum.val _ function(lval){
	#makes values that var.sum uses
#	cat ("starting make.var.sum.val\n")
	assign("cumsum.val",cumsum(lval),0)
	assign("cumsum.sq.val",cumsum(lval^2),0)
}

var.sum _ function(lval, I, J){
#	cat ("starting var.sum\n")

	#lval is not needed of course
	sum.I.J _ cumsum.val[J] - cumsum.val[I-1]
	sum.sq.I.J _ cumsum.sq.val[J] - cumsum.sq.val[I-1]
	return(sum.sq.I.J - sum.I.J^2/(J-I+1))
}


############################################# OUTLIERS #########################################################
outlier.up_function(lval,nval,bval,places=c(1,length(nval)),ratio=1,k=3){
#	cat ("starting outlier.up\n")
#	bt_proc.time()

	#first converts nval to binary string bval
	#where 0 is less than or equal to mean plus ratio*stdev 
	#and 1 is  more
	#to save coding time and separate tasks
	#nval is assumed to be in standard mode
	#each segment with mean of 0 and standard deviation of 1
	#modified on 10/23/2003
	#outliers are defined as top (1-q)% of data based on ranks
	# both nval and bval comes as an input ---- both nval&bval are computed in bridge.4
	
#	bval_nval>ratio
	
	bval_convert.new(bval,places)	####note convert replaced with convert.new
	
	#assumes places is properly parsed, as all-inclusive pairs of segment boundaries
	
	#now make string outliers of indices into all runs 
	#with breaks at places
	#and no run less than k	
	outliers_c()
	i_1
	plen_length(places)
	while (i <= plen) {
		int _ places[i]:places[i+1]
		outliers_c(outliers,sliced(bval[int],places[i]-1,k))
		i_i+2
	}
#	cat("outliers are ... ",outliers,"\n")
	#now make short stats
	df.outliers_make.outlier.stats(lval,nval,outliers)
#	cat("elapsed time is ... ",proc.time()-bt,"\n")
	return(df.outliers)

}

sliced_function(bval,index,k){
#	cat ("starting sliced\n")
	
	#parses runs of 1's of length k, and, if any,
	#parses plus index

	#if there are no outliers, return null
	if (sum(bval)==0){
		return()
	}
	
	#parses runs of 1's
	transitions_which(bval[-length(bval)]!=bval[-1])
	if(bval[1]==1){
		transitions_c(0,transitions)
	}
	if (length(transitions)%%2==1){
		transitions_c(transitions,length(bval))
	}
	transitions_transitions+rep(c(1,0),length(transitions)/2)
	
	#eliminate runs of less than k
	trans.mat_index+matrix(transitions,nrow=2)
	runs_trans.mat[2,]-trans.mat[1,]+1
	keep_which(runs>=k)
	outlier.parse_as.vector(trans.mat[,keep])
}

make.outlier.stats_function(lval,nval,outliers){
#	cat ("starting make.outlier.stats\n")
#	bt_proc.time()
	#given nval and parsing "outliers",
	#yields p-vals of KS-test and T-tests
	#in the form of a dataframe 
	#(4 columns: start, end index, ks-test,T-test)
	#(n rows of parsed events)
	len_length(outliers)
	if (len==0){return()}
#	nulls_rep(0,len/2)
#	df_data.frame(start=nulls,end=nulls,KS.test=nulls,T.test=nulls)
	df_matrix(0,nrow=len/2,ncol=6)
	int_1:(len/2)
	df[,1]_outliers[2*int-1]
	df[,2]_outliers[2*int]
	for (i in int){
#		df[i,1]_outliers[2*i-1]
#		df[i,2]_outliers[2*i]
		df[i,3]_exp(mean(lval[outliers[2*i-1]:outliers[2*i]]))
		df[i,4]_exp(stdev(lval[outliers[2*i-1]:outliers[2*i]]))
		df[i,5]_100*ks.gof(nval[outliers[2*i-1]:outliers[2*i]],sample(nval,10^3))[[2]]
		df[i,6]_100*t.test(nval[outliers[2*i-1]:outliers[2*i]],sample(nval,10^3))[[3]]
	}
	df_data.frame(df)
	names(df)_c("start","end","seg.mean","seg.stdev","KS.test","T.test")
#	cat("elapsed time equals ... ",proc.time()-bt,"\n")
	return(df)
}

convert.new_function(bval,places=c(1,length(bval))){
	cat ("starting convert.new\n")
#	bt_proc.time()
	#assumes places are inclusive indices into bval segments
	
	#whenever 1,1,0,0,1,1 occurs converts to 1,1,1,1,1,1
	#and thereafter, iteratively, converts 1,1,0,1 to 1,1,1,1
	#until string stabilizes
	#uses functions close.big.gap() and close.little.gap()
	#which in turn uses function near.val()
	
	#return as converted binary values
	
	plen_length(places)
#	cat("plen is ... ",plen,"\n")
	i_1
	while (i <= plen) {
		int _ places[i]:places[i+1]
																		if(length(int)>=4){
#		cat("length of bval... ",length(int),"\n")																			
		bval[int]_bval[int]+close.big.gap(bval[int])	
		#converts 110011 to 111111 
		cycle_1
	
		while (cycle==1){		
			guide_close.little.gap(bval[int])				
			cycle_0
			if (sum(guide)>0){
				#only if sum(guide)>0 is there a 0-to-1 switch to be made
				cycle_1
				bval[int]_bval[int]+guide
			}
		}
																		}
		i_i+2
	}
#	cat("elapsed time equals ... ",proc.time()-bt,"\n")
	return(bval)
}

near.val_function(bval,k){
	if (sign(k)==-1){
#		cat("length of bval... ",length(bval),"\n")
		nval_c(rep(0,-k),bval[1:( length(bval)+k )])
		return(nval)
	}	
	nval_c(bval[-(1:k)],rep(0,k))
}

close.big.gap_function(bval){
	
	#converts 110011 to 111111 
	
	bval.m1_near.val(bval,-1)
	bval.m2_near.val(bval,-2)
	bval.m3_near.val(bval,-3)
	bval.p1_near.val(bval,1)
	bval.p2_near.val(bval,2)
	bval.p3_near.val(bval,3)
	
	guide_bval==0 & ((bval.m1==1 & bval.m2==1 & bval.p2==1 & bval.p3==1) 
									| (bval.m3==1 & bval.m2==1 & bval.p1==1 & bval.p2==1))
	return(guide)
}

close.little.gap_function(bval){
	
	#converts 1101 to 1111
	
	bval.m1_near.val(bval,-1)
	bval.m2_near.val(bval,-2)
	
	bval.p1_near.val(bval,1)
	bval.p2_near.val(bval,2)
	
	
	guide_bval==0 & ((bval.m1==1 & bval.m2==1 & bval.p1==1) 
									| (bval.m1==1 & bval.p1==1 & bval.p2==1))
	return(guide)
}