#!/usr/bin/Rscript

# Script from Huang, Yu, et al., 2017, paper on aflatoxin B1 mutagenesis

# Copyright 2017 by Mi Ni Huang, Alvin Wei Tian Ng, and Steven G Rozen

# Released under the GPL-3 license


# Third-party packages: lsa provides cosine(); RColorBrewer provides
# brewer.pal(). library() is used instead of require() so that a missing
# package stops the script here rather than failing later at first use.
library(lsa)
library(RColorBrewer)

# Project-local code. Presumably this loads the NMF package and registers
# the 'supervised_smooth' method used below -- TODO confirm.
source('supervised.smooth.nmf.R')

## Per-tumor mutation counts. The first four columns of the input table are
## dropped (presumably mutation-class annotation -- TODO confirm); every
## remaining column is treated as one tumor's spectrum.
input.mat <- read.table('selected.genomes.txt', sep='\t', header=TRUE)

# Fail with a clear message if the table has no count columns; without this
# guard 5:length(input.mat) would count downwards (5, 4) on a 4-column table.
stopifnot(ncol(input.mat) >= 5)
mut.counts <- as.matrix(input.mat[seq(5, ncol(input.mat))])

# Replace exact zeros with a tiny positive value (presumably so the NMF
# fit never sees a zero count -- TODO confirm).
mut.counts[mut.counts == 0] <- 1e-12

## Reference signatures
COSMIC.sigs <- read.table('COSMIC.signatures.txt', sep='\t', header=TRUE)
ref.sig <- read.table('aflatoxin.signatures.txt', sep='\t', header=TRUE)

## Known liver signatures plus the two aflatoxin (AFB1) signatures.
## The selected COSMIC columns are offset by 4 from the signature numbers
## listed here (presumably four leading annotation columns -- TODO confirm);
## the aflatoxin signatures are columns 5 and 6 of ref.sig.
liver.cosmic.ids <- c(1, 4, 5, 6, 12, 16, 17, 22, 23)
sigs <- as.matrix(cbind(COSMIC.sigs[, 4 + liver.cosmic.ids], ref.sig[, 5:6]))

## "Supervised" NMF: the signature matrix w is completely specified
## up front (it is passed as `w` in the call below).
##
## On a multicore machine, replace `nrun = 1` with
##   nrun = 100, .opt = 'vp50'
super.afla <- nmf(
  mut.counts,
  ncol(sigs),
  'supervised_smooth',
  nrun = 1,
  w = as.matrix(sigs),
  th = 0,
  maxIter = 2000,
  .stop = nmf.stop.stationary
)

## Reconstructed spectra: the signature matrix multiplied by the fitted
## coefficient matrix, giving one reconstructed column per tumor.
recon.mat <- local({
  signature.mat <- as.matrix(sigs)
  coef.mat <- as.matrix(.coef(super.afla))
  signature.mat %*% coef.mat
})

# Calculate per-tumor reconstruction errors: each observed spectrum (a column
# of mut.counts) is compared with its reconstruction (the same column of
# recon.mat).
sample.idx <- seq_len(ncol(mut.counts))

# Cosine similarity (cosine() comes from lsa). as.numeric() flattens the
# result to a plain scalar. The vector is deliberately not named `cos`, so
# that base::cos is not masked.
cos.sim <- vapply(sample.idx,
                  function(i) as.numeric(cosine(recon.mat[, i],
                                                mut.counts[, i])),
                  numeric(1))

# Pearson correlation between reconstructed and observed spectrum
pearson <- vapply(sample.idx,
                  function(i) cor(recon.mat[, i], mut.counts[, i],
                                  method='pearson'),
                  numeric(1))

total.counts <- colSums(mut.counts)

# Euclidean distance between observed and reconstructed spectrum
Eu.dist <- sqrt(colSums((mut.counts - recon.mat)^2))

# Row labels are given explicitly so they do not depend on variable names.
recon.error <- rbind(cos=cos.sim, pearson=pearson, Eu.dist=Eu.dist,
                     total.counts=total.counts)

# Order tumors by decreasing total reconstructed count; drop=FALSE keeps a
# matrix even when there is only one tumor.
recon.error <- recon.error[ , order(colSums(recon.mat), decreasing=TRUE),
                           drop=FALSE]

# Expected reconstruction errors, used as reference values for the sanity
# check below. This is a 4 x 17 matrix: rows are the metrics ("cos",
# "pearson", "Eu.dist", "total.counts") and columns are the tumors.
# NOTE(review): looks like dput() output from a reference run -- confirm.
exp.recon.error <-
  structure(c(0.978226931145552, 0.958021263863586, 731.871343064885,
23966, 0.964691651309808, 0.918825294028685, 432.544269987212,
12076, 0.959769061761071, 0.926363183376219, 506.647284621857,
11797, 0.986270355104994, 0.966900315772254, 236.613316221715,
10863, 0.953032216405431, 0.887553333302668, 360.230615601132,
8795, 0.955957441251485, 0.894043834487311, 331.15542894608,
8352, 0.976019071984237, 0.939719454057152, 228.212618941236,
7938, 0.980479786531796, 0.944032244247277, 176.271145633952,
7079, 0.973951280345047, 0.926903249042528, 143.298277075933,
4961, 0.982408468992167, 0.96265092545176, 120.854008732739,
4630, 0.980489656562225, 0.94400258584913, 110.882070594822,
4465, 0.958899447003336, 0.891538932813736, 143.749259167991,
3872, 0.976160094596342, 0.933266181631521, 103.886426956852,
3740, 0.970595532986828, 0.910420936828936, 111.072225228914,
3680, 0.958775884565915, 0.915005735565751, 137.716648033897,
3403, 0.962365132868054, 0.92248385765634, 50.7667252364943,
1305, 0.829018989121893, 0.59526757015107, 41.9415340166957,
529.000000000004), .Dim = c(4L, 17L), .Dimnames = list(c("cos",
"pearson", "Eu.dist", "total.counts"), c("HK084", "HK177", "HK169",
"HK090", "HK268", "HK106", "HK067", "HK203", "HK079", "HK113",
"HK035", "RK033", "HK154", "HK260", "D023048", "RK213", "RK206"
)))

# Sanity check: stop the script unless the computed reconstruction errors
# match the expected values (all.equal with tolerance 1e-5).
stopifnot(all.equal(exp.recon.error, recon.error, tolerance=1e-5))

# Write the reconstruction errors (tab-separated, unquoted)
write.table(recon.error, 'recon.error.txt', sep='\t', quote=FALSE)

# Write "activities" (the activity of each signature in each tumor).
# Tumors are ordered by decreasing total reconstructed count.
tumor.order <- order(colSums(recon.mat), decreasing=TRUE)

# drop=FALSE keeps a matrix even for a single tumor, so that the row names
# can still be assigned.
activities <- .coef(super.afla)[ , tumor.order, drop=FALSE]
rownames(activities) <- colnames(sigs)
write.table(activities, 'activities.txt', sep='\t', quote=FALSE)

# Expected signature activities, used as reference values for the sanity
# check below. This is an 11 x 17 matrix: rows are the signatures (nine
# COSMIC signatures plus "AFsig2" and "AFB1sig") and columns are the tumors.
# NOTE(review): looks like dput() output from a reference run -- confirm.
exp.activities <-
  structure(c(3.71753940623283e-12, 1432.76594183887, 3.27170500421196e-12,
3.26907055146582e-12, 3414.90378720722, 2903.24779673892, 6.01367087368752e-12,
2984.99985988291, 2.87821667652759e-12, 7918.32628906728, 5801.66123412017,
7.65433705933443e-12, 1269.95909119927, 287.993964878181, 7.17695939102387e-12,
1300.31082777985, 5146.23528416871, 8.03251498653607e-12, 868.376778409653,
5.85130129152424e-12, 2381.42077489144, 1174.348112135, 6.93591259909335e-12,
364.684351670866, 7.70854369747343e-10, 6.31664506540439e-12,
1409.17178767097, 2439.22444173305, 1.11063265462722e-11, 325.82995837625,
5.05556074153819e-12, 5363.02450268366, 2051.03604556564, 75.7716532334164,
1.66980275424784, 870.361175362177, 14.1842668540532, 791.255302517469,
2675.04921546705, 1.10955037614836e-11, 736.617577414679, 5.66583654871821e-12,
4460.83139773449, 928.883120718923, 8.70717063703075e-12, 4.29267534168128e-09,
3040.06724493314, 8.21996730514669e-12, 2019.17377284788, 803.787201090772,
1.128082843196e-11, 46.1233902871648, 6.64893307675879e-12, 2395.19790029285,
624.266150852383, 1.04382246930254e-11, 1011.43463603726, 856.736935481272,
9.94571019447751e-12, 801.28742058176, 2194.40662839415, 1.33141332531847e-11,
503.914840063639, 7.73875878967721e-12, 2517.43678646425, 445.810254925131,
2.14647525498254, 7.05403963033207e-12, 282.890989185018, 1.1216946100152e-11,
666.806078467303, 2191.34574018861, 1.4988527374033e-11, 1342.30702106079,
9.18220348583211e-12, 2237.72921800269, 1105.26851728201, 1.25289666075392e-11,
1.99495743980081e-05, 824.252115842866, 1.19530637660774e-11,
1500.77905478973, 1899.66634643009, 1.40905581072912e-11, 442.976952698163,
9.46490746775278e-12, 1873.83059043585, 419.359881413715, 1.77034486936693e-11,
0.00296592134300811, 1332.46686993322, 1.75358340299073e-11,
8.17967658048345, 1025.76738148023, 2.27996911143064e-11, 899.211767813735,
1.35195375869063e-11, 1332.66094523957, 246.785706769904, 1.8673832605542e-11,
4.98083657837933e-06, 4.20637455012411, 1.71372067544908e-11,
264.634272734269, 1030.55787298686, 233.727855876227, 244.978603763335,
1.39804352104927e-11, 1925.59555680493, 814.673151881719, 1.94177321869004e-11,
8.0397610029669e-05, 614.717960434298, 1.83216251991201e-11,
594.157097076058, 868.111279098053, 217.592827512631, 301.430905777894,
1.49558963784227e-11, 1329.96072879752, 444.974975085307, 257.938596690724,
0.418978395049549, 1440.23192244352, 1.63919100373123e-11, 0.0186522948439557,
1125.64959156935, 2.38864795076977e-11, 46.5352970110345, 1.48258726741219e-11,
804.976626869382, 235.101460920337, 29.7749308384305, 3.90907142709512e-11,
819.026797403388, 2.07917138290666e-11, 231.165480936053, 783.746235233653,
2.9577778028397e-11, 408.784441725956, 58.1999634298194, 1119.81773261423,
255.228614413012, 81.928693890753, 1.73979110402058e-07, 1054.3872998479,
2.13952419232749e-11, 158.904794759278, 853.039985575832, 2.80479767872523e-11,
500.805776143085, 1.81438661640079e-11, 729.296455531442, 266.326362082664,
5.18836578577758e-05, 1.50452309463652e-11, 579.232392662136,
27.2569914671642, 308.743986405425, 141.312664483195, 72.9339965936478,
132.35018305646, 1.80503169674452e-11, 1328.50601412304, 681.480767156685,
44.7733611848425, 0.428333999979393, 6.10193123212526e-11, 5.37754783917029e-11,
242.307888249803, 60.7703169523812, 34.0261381557938, 90.1927604523723,
4.830777409518e-11, 566.157608665612, 235.763185935232, 1.68761947058195,
4.54064726837959e-06, 281.669962873305, 21.9494455478732, 19.6570029025479,
1.42727633966575e-10, 8.35862802174949, 35.2577384083194, 10.018254374886,
96.9197427741752, 37.2910831482872), .Dim = c(11L, 17L), .Dimnames = list(
    c("COSMIC.1", "COSMIC.4", "COSMIC.5", "COSMIC.6",
    "COSMIC.12", "COSMIC.16", "COSMIC.17", "COSMIC.22",
    "COSMIC.23", "AFsig2", "AFB1sig"), c("HK084", "HK177", "HK169",
    "HK090", "HK268", "HK106", "HK067", "HK203", "HK079", "HK113",
    "HK035", "RK033", "HK154", "HK260", "D023048", "RK213", "RK206"
    )))

# Sanity check: stop the script unless the computed activities match the
# expected values. The tolerance (1e-2) is looser than the 1e-5 used for
# the reconstruction-error check.
stopifnot(all.equal(exp.activities, activities, tolerance=1e-2))

# Figure 5D,E, but different color scheme.
# Both panels are stacked bar plots with one bar per tumor and one color per
# signature; they are drawn one above the other on a single A4 PDF page.
pdf('figs.pdf', paper='A4', useDingbats=FALSE)
par(mfrow=c(2,1))

# One 'Set3' color per signature (row of activities); the last two rows are
# overridden with red and black.
tmp.col <- RColorBrewer::brewer.pal(n=nrow(activities), name='Set3')
n.col <- length(tmp.col)
tmp.col[n.col - 1] <- 'red'
tmp.col[n.col] <- 'black'

# 5D: absolute signature activities per tumor
barplot(activities, col=tmp.col, las=2)
# Legend entries are reversed, so the last signature is listed first.
legend(x='topright',
       rev(rownames(activities)),
       fill=rev(tmp.col),
       cex=0.6)

# 5E: activities as a proportion of each tumor's total (columns sum to 1)
a2 <- prop.table(activities, margin=2)
barplot(a2, col=tmp.col, las=2)
dev.off()
