library(stringr)

# Run "write.seg.R" in whatever directory the script itself is contained in
rundir <- str_match(commandArgs(), '^--file=(.*/).*?$')[,2]
rundir <- rundir[!is.na(rundir)]
source(sprintf('%swrite.seg.R', ifelse(length(rundir>0), rundir, '')))

indir <- commandArgs(trailingOnly=TRUE)[1]
samplename <- commandArgs(trailingOnly=TRUE)[2]
min.cells <- as.integer(commandArgs(trailingOnly=TRUE)[3])
pval <- as.numeric(commandArgs(trailingOnly=TRUE)[4])
abs.diff <- as.numeric(commandArgs(trailingOnly=TRUE)[5])

segdata <- read.table(sprintf('%s/%s.rle.txt', indir, samplename), header=TRUE)
rawdata <- read.table(sprintf('%s/%s.raw.txt', indir, samplename), header=TRUE)

# List of original changepoints, as bin numbers
# Throughout this code, lists of changepoints will include the first and last bins
# as "changepoints"
chpts <- c(1, cumsum(segdata$n.probes)+1)
# "Natural" changepoints are those at chromosome boundaries or chromosome arm boundaries
# Boolean vector indicating which changepoints are natural:
chpt.naturalness <- c(TRUE, diff(as.integer(segdata$arm))!=0, TRUE)

# Create list of "separations"
# A "separation" is the information required to determine whether an individual changepoint
# is real.
# Intuitively, it is a vertical pattern in IGV.
# Technically, it is a list of length equal to the number of cells, containing lists of
# size two. Each element of the outer list corresponds to a cell, and the two elements of the
# inner list are the raw data in the segments to either side of the changepoint in that cell.
segnums <- rep(1:nrow(segdata), segdata$n.probes)
seglists <- lapply(rawdata[,-(1:2)], split, f=segnums)
separations <- lapply(1:(nrow(segdata)-1), function(i) lapply(seglists, function(seglist) list(seglist[[i]], seglist[[i+1]])))

# Test "clonality" of change points: whether they're in the minimum
# number of cells
# Function to test whether a changepoint is present greater than the minimum number of cells
# Current assumption is that the changepoint will be detectable in the cells that possess it looking
# at that cell individually
# This fails in the following situation: imagine the p value for the changepoint in each cell is <.3.
# This clearly isn't a uniform distribution, so the changepoint must be present in several cells;
# however, in no INDIVIDUAL cell is the null hypothesis rejected, so the number of cells is counted as 0.
test.separation <- Vectorize(function(separation)
{
  pvals <- sapply(separation, function(cell.sep) wilcox.test(cell.sep[[1]], cell.sep[[2]])$p.value)
  adjusted.pvals <- p.adjust(pvals, method='hommel')
  abs.diffs <- sapply(separation, function(cell.sep) abs(median(cell.sep[[1]])-median(cell.sep[[2]])))
  number.of.cells.with.changepoint <- sum(adjusted.pvals<pval & abs.diffs > abs.diff)
  if (number.of.cells.with.changepoint >= min.cells) { return(TRUE)
  } else return(FALSE)
})
chpt.clonality <- c(TRUE, test.separation(separations), TRUE)

# Filter changepoints, keeping the clonal and natural ones
keep.chpts <- chpt.naturalness | chpt.clonality
new.chpts <- chpts[keep.chpts]

# Make new segmentation, mimicking the format of copynumber::multipcf output
new.segnums <- c(1,as.integer(cut(2:nrow(rawdata), new.chpts)))
newlengths <- rle(new.segnums)$lengths
newmeans <- apply(rawdata[,-(1:2)], 2, function(cell) tapply(cell, new.segnums, mean))
newchrom <- rawdata$chrom[new.chpts[-length(new.chpts)]]
newarm <- segdata$arm[keep.chpts[-length(keep.chpts)]]
newstarts <- rawdata$pos[new.chpts[-length(new.chpts)]]
newends <- rawdata$pos[new.chpts[-1]-1]
newsegdata <- data.frame(cbind(newchrom, newarm, newstarts, newends, newlengths, newmeans))
colnames(newsegdata) <- colnames(segdata)

# Make new segmentation in redundant (long) encoding
longmeans <- apply(newmeans, 2, rep.int, times=newlengths)
newlongsegdata <- data.frame(cbind(rawdata$chrom, rawdata$pos, longmeans))
colnames(newlongsegdata) <- c('chrom', 'pos', colnames(longmeans))

# Output
write.table(newsegdata, sprintf('%s/%s.%dormore.rle.txt', indir, samplename, min.cells), quote=FALSE, sep='\t', row.names=FALSE)
write.table(newlongsegdata, sprintf('%s/%s.%dormore.long.txt', indir, samplename, min.cells), quote=FALSE, sep='\t', row.names=FALSE)
write.seg(newsegdata, sprintf('%s/%s.%dormore.seg', indir, samplename, min.cells))