library(sna)
library(stringr)
library(argparse)

# Command-line interface.
# Every option supplies a file path that is read from or written to further
# down, so all four are marked as required: a missing option is then reported
# by the argument parser with a usage message instead of turning into a NULL
# path that fails later inside an unrelated I/O call.
parser <- ArgumentParser()
parser$add_argument("-g", "--graph", required=TRUE,
                    help="Input dot file")
parser$add_argument("-l", "--likelihoods", required=TRUE,
                    help="Input log likelihood matrix")
parser$add_argument("-o", "--output", required=TRUE,
                    help="Output dot file")
parser$add_argument("-m", "--matrix", required=TRUE,
                    help="Master matrix")

# For testing
#args <- parser$parse_args(c('-g','CO8_ml0.renamed.gv', '-l', 'CO8_ml0.conf.names.txt', '-o', 'CO8_ml0.renamed.conf.gv', '-m', '~/results/data/CO8/CO8.master_matrix.txt'))
args <- parser$parse_args(commandArgs(trailingOnly=TRUE))
# Unpack the parsed options into the path variables used by the rest of the
# script.
input.graph.path <- args$graph
input.likelihood.path <- args$likelihoods
output.path <- args$output
mat.path <- args$matrix

# Read log likelihoods (space-separated table; rows are used below as
# mutations/attachment points, columns as cells)
log.likelihood <- read.delim(input.likelihood.path, sep=' ')
# Change "root" to "Root" to match the name used in the tree file
rownames(log.likelihood)[rownames(log.likelihood)=='root'] <- 'Root'
# Convert likelihoods to posterior probabilities, assuming a uniform prior,
# which means: normalize each column so that it sums to 1.
# The column maximum is subtracted before exponentiating (log-sum-exp trick).
# Mathematically this cancels out in the ratio, but it keeps the largest term
# at exp(0) = 1, so a column of very negative log likelihoods no longer
# underflows to all zeros and produces 0/0 = NaN.
probability <- apply(as.matrix(log.likelihood), 2, function(ll) {
  weights <- exp(ll - max(ll))
  weights / sum(weights)
})

# Load the tree: the dot file is kept verbatim in dot.file (it is reused when
# the output is assembled), while a copy with all spaces and semicolons
# removed is what gets parsed into an adjacency matrix.
dot.file <- readLines(input.graph.path)
stripped.dot <- str_replace_all(dot.file, '[ ;]', '')
scite.tree <- read.dot(textConnection(stripped.dot))
# Symmetrize the adjacency matrix: two nodes are linked in the undirected
# tree if there is an edge between them in either direction.
has.edge <- scite.tree == 1
undir.tree <- ifelse(has.edge | t(has.edge), 1, 0)
# Shortest path lengths between every pair of nodes, labelled with the node
# names of the tree
distances <- geodist(undir.tree, count.paths = FALSE)$gdist
dimnames(distances) <- list(rownames(undir.tree), colnames(undir.tree))
# Only the distances among mutations (the rows of the probability matrix)
# are needed from here on
mutation.names <- rownames(probability)
mut.distances <- distances[mutation.names, mutation.names]

# Expected tree distance between every mutation (rows) and every cell
# (columns): the mutation-to-mutation path lengths weighted by the cell's
# attachment probabilities.
# (If a cell is attached to the mutation itself, the distance is 0; if it is
# attached to a neighbour of that mutation, the distance is 1, etc.)
expected.distance <- mut.distances %*% probability
# Geometric median of each cell's placement distribution, i.e. the mutation
# from which the cell has the smallest expected distance
closest.mutation <- apply(expected.distance, 2, which.min)
medians <- setNames(rownames(expected.distance)[closest.mutation],
                    colnames(expected.distance))
# Expected absolute deviation of each cell's placement distribution, i.e. the
# value of that smallest expected distance.
# This serves as a measure of uncertainty: high values mean we don't know
# where to place the cell.
absdev <- apply(expected.distance, 2, min)

# Missing value fraction per cell
master.mat <- read.table(mat.path)
# Keep every column whose name does not contain 'pop' (presumably the
# per-cell columns -- confirm against the master matrix format).
# drop = FALSE keeps a data frame even when only a single column is left;
# without it the result collapses to a bare vector and loses its column name.
cell.mat <- master.mat[, !grepl('pop', colnames(master.mat)), drop = FALSE]
# Fraction of entries equal to 0.5 (the value counted as "missing" here) in
# each column. Iterating over the columns directly avoids apply()'s coercion
# of the whole data frame to a single matrix, which would turn every value
# into formatted text if any column happened to be non-numeric.
frac.missing <- vapply(cell.mat, function(v) mean(v == 0.5), numeric(1))

# The part of the dot file containing mutations, without cells: everything up
# to and including the last line that mentions 'node'.
node.lines <- grep('node', dot.file)
if (length(node.lines) == 0) {
  stop("No line containing 'node' found in the input dot file", call. = FALSE)
}
dot.file.muts <- dot.file[seq_len(max(node.lines))]
# Place cells: one edge from each cell's median mutation to the cell
cell.placements <- sprintf('%s -> %s;', medians, names(medians))
# Indicate confidence in placements with colors.
# The deviation is scaled so that 0 maps to cyan, 1.5 to white and anything
# >= 3 to orange. The ramp is built once and applied to all cells at once.
confidence.ramp <- colorRamp(c('cyan', 'white', 'orange'))
conf.colors <- rgb(confidence.ramp(pmin(absdev / 3, 1)), maxColorValue=255)
names(conf.colors) <- names(absdev)
# Pair every cell with its own label and missing-value fraction.
# The node ids come from the likelihood file and the fractions from the
# master matrix, so the two are aligned by cell name rather than by position;
# otherwise a different column order (or extra cells in the master matrix)
# would attach the wrong label to a node.
label.index <- match(names(conf.colors), names(frac.missing))
if (anyNA(label.index)) {
  # Names do not line up between the two inputs: keep the positional pairing.
  warning("Cell names in the likelihood file and the master matrix do not ",
          "all match; labels are assigned by position", call. = FALSE)
  cell.labels <- colnames(cell.mat)
  cell.frac.missing <- frac.missing
} else {
  cell.labels <- names(frac.missing)[label.index]
  cell.frac.missing <- frac.missing[label.index]
}
cell.confs <- sprintf('%s [fillcolor="%s",label="%s (%.2f)"];', names(conf.colors), conf.colors, cell.labels, cell.frac.missing)
# Put it together in a new dot file
new.dot.file <- c(dot.file.muts, cell.placements, cell.confs, "}")
# Write file
writeLines(new.dot.file, output.path)
