## usage ### ## TO BE USED IN SPLUS### ## Input is a dataframe with chromosome and ratio in two columns. ## Example -- # output_cancer.seg(df,chrom.col="CHROM",ratio.col="RATIO") ## output will have two objects - one is the list of segments and the second is the decompressed version of the segments to make it easier to plot cancer.seg_function(df.name, chrom.col = "CHROM", ratio.col = "GEOMEAN.RATIO", stringency1= 10^-5, ratio=1, k=3, nrandoms=3){ bt_proc.time() chrom.places_make.chrom.place.1(df.name, col.name = chrom.col)[[1]] chrom.places_sort(union(chrom.places[-1],1+chrom.places[-length(chrom.places)])) n.chrom_length(chrom.places)/2 places_c() for (i in 1:n.chrom){ int_chrom.places[2*i-1]:chrom.places[2*i] df.temp_df.name[int,] z_stringent.segment.T.1(df.temp, chrom.col, ratio.col, stringency1, use.chrom=F) places_c(places,z[[1]][[2]]+int[1]-1) } places_sort(union(places[-1],1+places[-length(places)])) broad.places_sort(union(places,chrom.places)) n.broad_length(broad.places)/2 lval_log(df.name[,ratio.col]) mean.broad_rep(0,n.broad) median.broad_rep(0,n.broad) std.broad_rep(0,n.broad) up.down.broad_rep(0,n.broad) length.broad_rep(0,n.broad) id.to.broad_rep(0,n.broad) mean.broad.long_rep(0,length(lval)) median.broad.long_rep(0,length(lval)) std.broad.long_rep(0,length(lval)) up.down.probes_rep(0,length(lval)) id.to.broad.long_rep(0,length(lval)) probe.num_rep(0,length(lval)) for (i in 1:n.broad){ int_broad.places[2*i-1]:broad.places[2*i] mean.broad[i]_mean(lval[int]) median.broad[i]_median(lval[int]) length.broad[i]_length(int) std.broad[i]_stdev(lval[int]) up.down.broad[i]_ifelse(mean.broad[i]>0,1,-1) id.to.broad[i]_i mean.broad.long[int]_mean.broad[i] median.broad.long[int]_median.broad[i] std.broad.long[int]_std.broad[i] up.down.probes[int]_up.down.broad[i] id.to.broad.long[int]_i probe.num[int]_int } broad.short.table_data.frame("id.to.broad"=id.to.broad,"mean.broad"=mean.broad,"std.broad"=std.broad,"length.broad"=length.broad,"sign.broad"=up.down.broad) broad.long_data.frame("probe.num"=probe.num,"id.to.broad"=id.to.broad.long,"lval"=lval,"mean.broad"=mean.broad.long,"std.broad"=std.broad.long,"median.broad"=median.broad.long,"sign.broad"=up.down.probes) return(broad.short.table,broad.long) } #################################New Stringent Segment############################################################ stringent.segment.T.1_function(df.name, chrom.col = "CHROM", ratio.col = "GeoMeanRatio", stringency1= 10^-5,use.chrom = T) { #this version stops at one stringency view bt_proc.time() # cat("starting stringent.segment.T.1\n") #meant for tumor data #probe data must be chromosome position order #if it is not, sort your data sheet... #column containing chromosome will be assumed to be "CHROM" #unless you specify otherwise #as a column number or a quoted string #similarly, ratio values will be assumed to be in column named "GeoMeanRatio" #unless otherwise specified #as a column number or a quoted string #and the vector of logarithmic ratios lval_log(df.name[,ratio.col]) #make place vector for chromosome boundaries if (use.chrom == T) { ans_make.chrom.place.1(df.name, col.name = chrom.col) place_ans[[1]] chrom.names_ans[[2]] } if (use.chrom == F) { place_c(0,length(lval)) chrom.names_NA} #run the program z_assemble.4.T.2(lval, place = place, stringency1 = stringency1) #now make a dataframe with all the relevant values (long stats): #broad means and medians #and companion short stats: #breaks,intervals,maybe positions,means,standard deviations # cat("Total Time=... ",proc.time()-bt,"\n") return(z) } assemble.4.T.2 _ function(lval, place = c(0, length(lval)), seglen = 100, stringency1 = 10^-5, n = 5, minlen = 6) { #modified assemble.4.T.1 to do just one stringency #assumed for use with tumors t1 <- proc.time() # cat("starting assemble.4.T.2\n") places.1 <- seg.n.merge.2(lval, place = place, seglen = seglen, stringency1 = stringency1, minlen = minlen)[[3]] #seg.n.merge will require surgery to use dynamic programming to accelerate minimization of variance places.2 <- check.spots(lval, places.1, upto = 2, minlen) #places contains the starts of new segments, by probe index #if needed, can always break places into immutable chromosome boundaries and the rest places.3 _ sort(union(places.2, place)) #cat("places.3=... ",places.3,"\n") stats.1 <- make.stats.only.1(lval, places.3) tt <- proc.time() - t1 # cat("total time elapsed ... ", tt, "\n") # df.ret.test_ return(list("places" = list("original.places"=place, "broad.places" = places.3), "long.stats"=stats.1[[1]], "short.stats"=stats.1[[2]])) } make.stats.only.1 _ function(lval, place = c(0, length(lval)) ) { #just makes stats given place #NOTE: stats are based on log ratios, and a case can be made #that stats should be based on ratios bt <- proc.time() # cat("starting make.stats.only.1\n") len <- length(lval) len.place <- length(place)-1 # cat("len",len,len.place,"\n") loc_rep(0,len) val.mean <- rep(0, len) val.median <- rep(0, len) val.stdev <- rep(0, len) loc.start_rep(0,len.place) loc.end_rep(0,len.place) short.mean <- rep(0, len.place) short.median <- rep(0, len.place) short.stdev <- rep(0, len.place) for(i in 2:(1+len.place)) { #i believe this is the needed correction int <- (place[i - 1] + 1):place[i] loc.start[i-1]<-place[i-1] loc.end[i-1]<-place[i] short.mean[i-1] <- exp(mean(lval[int])) short.median[i-1] <- exp(median(lval[int])) short.stdev[i-1] <- exp(stdev(lval[int])) # cat(place[i-1],place[i],short.mean[i-1],short.stdev[i-1],"\n") loc val.mean[int] <- short.mean[i-1] val.median[int] <- short.median[i-1] val.stdev[int] <- short.stdev[i-1] } z1 <- data.frame(val.mean = val.mean, val.stdev = val.stdev, val.median = val.median) z2 <- data.frame(loc.start=loc.start,loc.end=loc.end, short.mean = short.mean, short.stdev = short.stdev, short.median = short.median) stats <- list(long = z1, short = z2) # cat("elapsed time is... ", proc.time() - bt, "\n") return(stats) } seg.n.merge.2 _ function(lval, place = c(0, length(lval)), seglen = 100, stringency1 = 10^-5, minlen = 6) { #altered to accomodate chromosome breaks #altered to consider single stringency and dynamic programming all.place <- segmate.2(lval, place = place, seglen, stringency = stringency1, minlen = minlen) merged.place <- merge.segs(lval, all.place, stringency = stringency1, minlen = minlen) list(all.place, merged.place[[1]], merged.place[[2]]) } segmate.2 _ function(lval, place = c(0, length(lval.sim)), stringency = 10^-5, seglen = 100, minlen = 6) { #altered to accomodate chromosome breaks #first, breaks data into seglen chunks and finds breaks #then removes artificial breaks #then repeats act with seglen chunks offset by seglen/2 #finally combines all the breaks #with given breaks (place) to accomodate chromosome segmentation #and asks finally for any breaks that were missed #all breaks are right most bound of interval make.var.sum.val(lval) #works with all.tile to give dynamic programming speed to minimization of variance len <- length(lval) place1 <- place #note: all.tile.5 will make the default place vector seg2 <- seglen * (1:(floor(len/seglen))) place2 <- sort(union(place1, seg2)) #adds chromosome boders if any, plus first and last place place2 <- all.tile.5(lval, place2, stringency = stringency, minlen = minlen) place2 <- setdiff(place2, seg2) seg3 <- floor(seglen/2) + seglen * (1:(-1 + floor(len/seglen))) place3 <- sort(union(place1, seg3)) #adds chromosome boders if any, plus first and last place place3 <- all.tile.5(lval, place3, stringency = stringency, minlen = minlen) place3 <- setdiff(place3, seg3) all.place <- sort(union(place1, union(place2, place3))) #adds chromosome boders if any all.place <- all.tile.5(lval, all.place, stringency = stringency, minlen = minlen) all.place } ################################# unchanged accessories ############################################################# make.chrom.place.1_function(df, col.name = "CHROM") { chrom.names <- union(c(),df[, col.name]) chrom.bounds <- match(chrom.names,df[,col.name]) place <- c(0,chrom.bounds[-1]-1,nrow(df)) list(place,chrom.names) } check.spots _ function(lval, merged.place, upto = 2, minlen = 6) { bt_proc.time() # cat("starting check.spots\n") len <- length(merged.place) if(len == 2) { return(merged.place) } while(upto < len) { if(len == 2) { return(merged.place) } i <- upto testval <- lval[(1 + merged.place[i - 1]):merged.place[i + 1]] spot <- merged.place[i] - merged.place[i - 1] true.spot <- borderwar(testval, spot, minlen = minlen) merged.place[i] <- true.spot + merged.place[i - 1] # cat(i,true.spot,"\n") if(true.spot < (minlen/2) | (true.spot > (1 + length(testval) - (minlen/2)))) { #then we eliminate that boundary upto <- upto - 1 merged.place <- merged.place[ - i] len <- len - 1 } upto <- upto + 1 } # cat("elapsed time is... ",proc.time()-bt,"\n") merged.place } merge.segs _ function(lval, all.place, stringency = 0.005, minlen = 6) { bt_proc.time() # cat("starting merge.seg \n") #note: this is my first recursive function #warning: it is not really.... it is really a while loop #and this can get us in trouble some day len <- length(all.place) p <- rep(0, len) if(len == 2) { return(list(p, all.place)) } for(i in 2:(len - 1)) { p[i] <- ks.gof(lval[(all.place[i - 1] + 1):all.place[i]], lval[(all.place[i] + 1):all.place[i + 1]])[[2]] ########## } #cat("merge ", "stringency= ", stringency,"\n") #cat(round(p,6),"\n") #rule: #if p[i]>stringency #delete i if p[i]=max(p[(i-1):(i+1)]) #repeat until all p[i]<=stringency count <- 0 elim <- c() max.p <- max(p) if(max.p > stringency) { count <- 1 for(i in 2:(len - 1)) { if(p[i] > stringency & p[i] == max(p[(i - 1):(i + 1)])) { elim <- c(elim, i) } } } if(count == 1) { all.place <- all.place[ - elim] ans <- merge.segs(lval, all.place, stringency = stringency, minlen = minlen) all.place <- ans[[2]] p <- ans[[1]] } # cat("elapsed time is... ",proc.time()-bt,"\n") list(round(p, 4), all.place) } borderwar _ function(lval, spot, dir = 0, minlen = 6, num.o.calls = 0) { # bt_proc.time() # cat("starting borderwar \n") lim.o.calls <- 200 while(num.o.calls < lim.o.calls) { len <- length(lval) if(spot < (minlen/2)) { # cat("elapsed time is... ",proc.time()-bt,"\n") return(0) } #the break dissappeared if(spot > (1 + len - (minlen/2))) { # cat("elapsed time is... ",proc.time()-bt,"\n") return(len) } #the break dissappeared x <- lval[spot] mu.A <- mean(lval[1:(spot - 1)]) sig.A <- stdev(lval[1:(spot - 1)]) mu.B <- mean(lval[(spot + 1):len]) sig.B <- stdev(lval[(spot + 1):len]) p <- comp.p(x, mu.A, sig.A, mu.B, sig.B) #cat("lval, spot=...",lval,spot,"\n" ) #cat("mu.A, sig.A, mu.B, sig.B=... ",mu.A, sig.A, mu.B, sig.B,"\n") #cat("p=... ",p,"\n") if(x <= mu.A & mu.A <= mu.B) { p <- 1 } if(x >= mu.A & mu.A >= mu.B) { p <- 1 } if(x <= mu.B & mu.B <= mu.A) { p <- 0 } if(x >= mu.B & mu.B >= mu.A) { p <- 0 } if(p < 0.5) { #ie spot belongs to B, we had it wrong if(dir < 1) { #and if moving backwards, keep on going dir <- -1 num.o.calls <- num.o.calls + 1 spot <- spot - 1 next } if(dir == 1) { #if we were moving forward #then we have grabbed a probe too many #and the previous probe was the end point #of the A segment # cat("elapsed time is... ",proc.time()-bt,"\n") return(spot - 1) } } if(p >= 0.5) { #spot does belong to A if(dir > -1) { #if we are moving forward #lets grab the next spot dir <- 1 num.o.calls <- num.o.calls + 1 spot <- spot + 1 next } if(dir == -1) { #if we were backing up #we can finally stop #and return spot # cat("elapsed time is... ",proc.time()-bt,"\n") return(spot) } } } # cat("limit on border wars breached\n") # cat("elapsed time is... ",proc.time()-bt,"\n") return(spot) } comp.p _ function(x, mu.A, sig.A, mu.B, sig.B) { if (sig.A*sig.B==0){return(0)} pA <- (1/sig.A) * exp(-0.5 * ((x - mu.A)/sig.A)^2) pB <- (1/sig.B) * exp(-0.5 * ((x - mu.B)/sig.B)^2) p <- pA/(pA + pB) p } ########################################### all.tile.5 ########################################################### all.tile.5 _ function(lval, place = c(0, length(lval)), upto = 1, stringency = 0.005, minlen = 6) { #mods that can be made #1. instead of calling make.var.sum.val once per all.tile, make it once per overall run #2. instead of computing cumsum for all lval, do so in overlapping chunks of lval # because accuracy will increase #3. dont compute minvar for every position, do so for every third or forth, then track inward # bt <- proc.time() # cat("starting all.tile.5\n") len <- length(lval) # make.var.sum.val(lval) #moved to higher level "segmate.2" # cat((place[upto] < (len - minlen)),"\n") while(place[upto] < (len - minlen)) { # cat(upto,"\n") spot <- find.left.break.5(lval, place, upto, stringency = stringency, minlen = minlen) # cat(spot,"\n") if(spot == place[upto + 1]) { upto <- upto + 1 if(upto == length(place)) { break } next } place <- sort(c(place, spot)) } # cat("time elapsed ", et <- proc.time() - bt, "\n") place } find.left.break.5 _ function(lval, place, upto, stringency, minlen) { # cat("starting find.left.break.5\n") #assumes function crack #that provides a valid break... #with place[upto] the index of leftmost tamed border of lval #returns new place if there is a new break or else place[upto+1] #will not try to break a tile shorter than minlen #and will judge significance after ntries if((place[upto+1] - place[upto]) <= 2*ceiling(minlen/2)){ #wont break a piece smaller than 2*ceiling(minlen/2) return(place[upto + 1]) } # ans <- crack.5(lval[(place[upto] + 1):place[upto + 1]], stringency, minlen) ans _ crack.5(lval,place[upto]+1,place[upto+1],stringency,minlen) if(ans == 0) { return(place[upto + 1]) } else { return(ans) } } crack.5 _ function(lval, I, J, stringency, minlen) { # cat("starting crack.5\n") ans <- apply.divide.5(lval, I, J, minlen)[[1]] pval <- ks.gof(lval[(I+1):ans], lval[(1 + ans):J])[[2]] if(pval > stringency) { return(0) } return(ans) } apply.divide.5 _ function(lval, I, J, minlen) { # cat ("starting apply.divide.5\n") #no segment smaller than ceiling(minlen/2) can be made minsum <- maxsum <- var.sum(lval,(I+1),J) for(i in (I + ceiling(minlen/2)):(J - ceiling(minlen/2))) { sum.i <- var.sum(lval,(I+1),i) + var.sum(lval,(i+1),J) if(sum.i <= minsum) { index <- i minsum <- sum.i } } list(index, maxsum, minsum) } make.var.sum.val _ function(lval){ #makes values that var.sum uses # cat ("starting make.var.sum.val\n") assign("cumsum.val",cumsum(lval),0) assign("cumsum.sq.val",cumsum(lval^2),0) } var.sum _ function(lval, I, J){ # cat ("starting var.sum\n") #lval is not needed of course sum.I.J _ cumsum.val[J] - cumsum.val[I-1] sum.sq.I.J _ cumsum.sq.val[J] - cumsum.sq.val[I-1] return(sum.sq.I.J - sum.I.J^2/(J-I+1)) } ############################################# OUTLIERS ######################################################### outlier.up_function(lval,nval,bval,places=c(1,length(nval)),ratio=1,k=3){ # cat ("starting outlier.up\n") # bt_proc.time() #first converts nval to binary string bval #where 0 is less than or equal to mean plus ratio*stdev #and 1 is more #to save coding time and separate tasks #nval is assumed to be in standard mode #each segment with mean of 0 and standard deviation of 1 #modified on 10/23/2003 #outliers are defined as top (1-q)% of data based on ranks # both nval and bval comes as an input ---- both nval&bval are computed in bridge.4 # bval_nval>ratio bval_convert.new(bval,places) ####note convert replaced with convert.new #assumes places is properly parsed, as all-inclusive pairs of segment boundaries #now make string outliers of indices into all runs #with breaks at places #and no run less than k outliers_c() i_1 plen_length(places) while (i <= plen) { int _ places[i]:places[i+1] outliers_c(outliers,sliced(bval[int],places[i]-1,k)) i_i+2 } # cat("outliers are ... ",outliers,"\n") #now make short stats df.outliers_make.outlier.stats(lval,nval,outliers) # cat("elapsed time is ... ",proc.time()-bt,"\n") return(df.outliers) } sliced_function(bval,index,k){ # cat ("starting sliced\n") #parses runs of 1's of length k, and, if any, #parses plus index #if there are no outliers, return null if (sum(bval)==0){ return() } #parses runs of 1's transitions_which(bval[-length(bval)]!=bval[-1]) if(bval[1]==1){ transitions_c(0,transitions) } if (length(transitions)%%2==1){ transitions_c(transitions,length(bval)) } transitions_transitions+rep(c(1,0),length(transitions)/2) #eliminate runs of less than k trans.mat_index+matrix(transitions,nrow=2) runs_trans.mat[2,]-trans.mat[1,]+1 keep_which(runs>=k) outlier.parse_as.vector(trans.mat[,keep]) } make.outlier.stats_function(lval,nval,outliers){ # cat ("starting make.outlier.stats\n") # bt_proc.time() #given nval and parsing "outliers", #yields p-vals of KS-test and T-tests #in the form of a dataframe #(4 columns: start, end index, ks-test,T-test) #(n rows of parsed events) len_length(outliers) if (len==0){return()} # nulls_rep(0,len/2) # df_data.frame(start=nulls,end=nulls,KS.test=nulls,T.test=nulls) df_matrix(0,nrow=len/2,ncol=6) int_1:(len/2) df[,1]_outliers[2*int-1] df[,2]_outliers[2*int] for (i in int){ # df[i,1]_outliers[2*i-1] # df[i,2]_outliers[2*i] df[i,3]_exp(mean(lval[outliers[2*i-1]:outliers[2*i]])) df[i,4]_exp(stdev(lval[outliers[2*i-1]:outliers[2*i]])) df[i,5]_100*ks.gof(nval[outliers[2*i-1]:outliers[2*i]],sample(nval,10^3))[[2]] df[i,6]_100*t.test(nval[outliers[2*i-1]:outliers[2*i]],sample(nval,10^3))[[3]] } df_data.frame(df) names(df)_c("start","end","seg.mean","seg.stdev","KS.test","T.test") # cat("elapsed time equals ... ",proc.time()-bt,"\n") return(df) } convert.new_function(bval,places=c(1,length(bval))){ cat ("starting convert.new\n") # bt_proc.time() #assumes places are inclusive indices into bval segments #whenever 1,1,0,0,1,1 occurs converts to 1,1,1,1,1,1 #and thereafter, iteratively, converts 1,1,0,1 to 1,1,1,1 #until string stabilizes #uses functions close.big.gap() and close.little.gap() #which in turn uses function near.val() #return as converted binary values plen_length(places) # cat("plen is ... ",plen,"\n") i_1 while (i <= plen) { int _ places[i]:places[i+1] if(length(int)>=4){ # cat("length of bval... ",length(int),"\n") bval[int]_bval[int]+close.big.gap(bval[int]) #converts 110011 to 111111 cycle_1 while (cycle==1){ guide_close.little.gap(bval[int]) cycle_0 if (sum(guide)>0){ #only if sum(guide)>0 is there a 0-to-1 switch to be made cycle_1 bval[int]_bval[int]+guide } } } i_i+2 } # cat("elapsed time equals ... ",proc.time()-bt,"\n") return(bval) } near.val_function(bval,k){ if (sign(k)==-1){ # cat("length of bval... ",length(bval),"\n") nval_c(rep(0,-k),bval[1:( length(bval)+k )]) return(nval) } nval_c(bval[-(1:k)],rep(0,k)) } close.big.gap_function(bval){ #converts 110011 to 111111 bval.m1_near.val(bval,-1) bval.m2_near.val(bval,-2) bval.m3_near.val(bval,-3) bval.p1_near.val(bval,1) bval.p2_near.val(bval,2) bval.p3_near.val(bval,3) guide_bval==0 & ((bval.m1==1 & bval.m2==1 & bval.p2==1 & bval.p3==1) | (bval.m3==1 & bval.m2==1 & bval.p1==1 & bval.p2==1)) return(guide) } close.little.gap_function(bval){ #converts 1101 to 1111 bval.m1_near.val(bval,-1) bval.m2_near.val(bval,-2) bval.p1_near.val(bval,1) bval.p2_near.val(bval,2) guide_bval==0 & ((bval.m1==1 & bval.m2==1 & bval.p1==1) | (bval.m1==1 & bval.p1==1 & bval.p2==1)) return(guide) }