library(stringr)
library(copynumber)

# Source "write.seg.R" from the directory that contains this script.
# The regex looks for a "--file=<path>" entry in commandArgs() and captures
# everything up to and including the last "/" of that path.
rundir <- str_match(commandArgs(), '^--file=(.*/).*?$')[,2]
rundir <- rundir[!is.na(rundir)]
# With no match (no "--file=" entry, or a path without a directory part) the
# prefix is empty, so "write.seg.R" is resolved against the working directory.
# A plain scalar if/else is used: the decision is about the whole vector, and
# only the first match is ever needed.
source(sprintf('%swrite.seg.R', if (length(rundir) > 0) rundir[1] else ''))

# Positional (trailing) command line arguments, in order:
#   1. gc.ratio.file  path of input gc ratios
#   2. bincount.file  path of input bincounts
#   3. sample.name    sample name
#   4. outdir         output directory
#   5. gamma.param    regularization parameter "gamma" of segmentation
# A position that was not supplied yields NA.
cli.args <- commandArgs(trailingOnly=TRUE)
gc.ratio.file <- cli.args[1]
bincount.file <- cli.args[2]
sample.name <- cli.args[3]
outdir <- cli.args[4]
gamma.param <- as.numeric(cli.args[5])

# Uncomment for testing
#gc.ratio.file <- '/Volumes/sci/users/unruh/DCIS_Paper_Data/CN/MDC13/uber-matrix/uber.gc.MDC13.ratio.txt'
#bincount.file <- '/Volumes/sci/users/unruh/DCIS_Paper_Data/CN/MDC13/uber-matrix/uber.MDC13.bin.txt'
#sample.name <- str_match(gc.ratio.file, '.*/uber.gc.(.*).ratio.txt')[,2]
#outdir <- 'test'
#gamma.param <- 40


# Extract data from file: column 1 is used as the chromosome and column 2 as
# the position of each bin; every column after the third is treated as a cell.
uber <- read.table(gc.ratio.file, header=TRUE)
chrom <- uber[,1]
chrompos <- uber[,2]
# drop=FALSE keeps a data frame even when only a single cell column remains
cells <- uber[,-(1:3), drop=FALSE]

# Filter by median bincounts using bincount file: a cell is kept only when the
# median of its bincount column exceeds the threshold below.
min.median.bincount <- 20
bincounts <- read.table(bincount.file, header=TRUE)[,-(1:3), drop=FALSE]
median.bincounts <- vapply(bincounts, median, numeric(1))
# The mask is applied by column position, so both files must provide the same
# number of cell columns; a shorter mask would otherwise be recycled silently.
if (length(median.bincounts) != ncol(cells)) {
  stop(sprintf('%s has %d cell columns but %s has %d',
               gc.ratio.file, ncol(cells), bincount.file, length(median.bincounts)),
       call.=FALSE)
}
cells <- cells[median.bincounts > min.median.bincount]
# Fail here with a clear message instead of letting later steps fail obscurely
if (ncol(cells) == 0) {
  stop(sprintf('No cell in %s has a median bincount above %d',
               bincount.file, min.median.bincount),
       call.=FALSE)
}

# Extract metadata (not currently used)
#metadata <- data.frame(str_match(colnames(cells), 'MDC\\d\\d_(([DI])\\d+).bl'))
#colnames(metadata) <- c('fullname', 'cellname', 'subpop.id')

# Move the ratios to the log2 scale. A ratio of zero would turn into -Inf,
# so stop unless the smallest log ratio is strictly greater than -Inf.
logratios <- log2(cells)
stopifnot(min(logratios) > -Inf)

# Put the bin coordinates in front of the log ratios; the position column is
# named 'pos' directly while binding.
annotated.logratios <- cbind(chrom, pos=chrompos, logratios)

# Winsorize the annotated log ratios (package defaults) to smooth outliers
winsorized.logratios <- winsorize(annotated.logratios)
# To skip winsorization, uncomment the assignment below:
#winsorized.logratios <- annotated.logratios

# Segment the winsorized data with multipcf, using the gamma given on the
# command line; return.est=TRUE makes the result carry both $segments and
# $estimates, which are written out afterwards.
seg.output <- multipcf(
  winsorized.logratios,
  gamma=gamma.param,
  return.est=TRUE,
  assembly='hg19',
  fast=TRUE
)

# Write data. Each path is built from a format string whose two '%s' slots are
# filled with the output directory and the sample name.
sample.path <- function(fmt) sprintf(fmt, outdir, sample.name)
# Tab-separated table without quoting and without row names
write.sample.table <- function(x, fmt) {
  write.table(x, sample.path(fmt), quote=FALSE, sep='\t', row.names=FALSE)
}

write.sample.table(seg.output$segments, '%s/%s.rle.txt')
write.sample.table(seg.output$estimates, '%s/%s.long.txt')
write.sample.table(annotated.logratios, '%s/%s.raw.txt')
write.sample.table(winsorized.logratios, '%s/%s.winsorized.txt')
# The segments are additionally written through write.seg (from write.seg.R)
write.seg(seg.output$segments, sample.path('%s/%s.seg'))
