#!/usr/bin/Rscript
# predictions for HGT

# Ancestral presence table: a parsed version of the gainloss reconstruction in
# which species are mapped to the ancestor of their branch rather than to the
# node at the end of the branch (produced by parse_reconstruction_foranc.py).
# Predicting from the ancestor keeps the evaluation from being circular.
allanc <- read.table(
  gzfile('processed_data/021214_MOtree_ancestralrecon.txt.gz.anc_pres'),
  header = TRUE
)

# Per-branch gain probabilities from the gainLoss run.
realgains <- read.table(
  'gainLoss_results/MOtree_GLrun/AncestralReconstructPosterior.txt.gain_probs',
  header = TRUE
)

# Gene-by-gene adjacency table and the presence table tagged ".alpha".
alphadj <- read.table('processed_data/alpha_treduce.txt', header = TRUE)
alphanc <- read.table(
  'processed_data/AncestralReconstructPosterior.txt.pres_probs.alpha',
  header = TRUE
)

# Total of all adjacency entries, reported later as the number of edges.
edgenum <- sum(as.matrix(alphadj))

# Branches listed in allanc whose names do not appear among alphanc's rows
# (described by the original author as the alpha/betaproteobacteria branches).
alphaspecs <- rownames(allanc)[!(rownames(allanc) %in% rownames(alphanc))]

# Relevant genes: those with at least one edge in the network, as a column
# (coled) or as a row (rowed) of the adjacency table.
coled <- colnames(alphadj)[colSums(alphadj) > 0]
rowed <- rownames(alphadj)[rowSums(alphadj) > 0]
both <- sort(union(rowed, coled))

# Keep only genes whose allanc column sums to a non-missing value.
gene_totals <- colSums(allanc[, both])
both <- names(gene_totals)[!is.na(gene_totals)]

alphapres <- allanc[alphaspecs, both]
alphamodel <- as.matrix(alphadj[both, both])
alphagains <- colSums(round(realgains[alphaspecs, ]))

# Check that everything is accounted for - surprisingly irritating!
# Neither the presence table nor the model matrix may contain missing values.
stopifnot(!any(is.na(alphapres)))
stopifnot(!any(is.na(alphamodel)))

# Make predictions: multiply the ancestral presence table (branches x genes)
# by the adjacency matrix, giving one raw score per (branch, gene) pair.
alphapredict <- as.matrix(alphapres) %*% alphamodel

# Restrict to genes that can even be predicted, i.e. genes with at least one
# incoming entry in the adjacency table.
# NOTE(review): this assumes every such gene survived the NA filter applied to
# `both`; otherwise the column selection below stops with a subscript error.
predictable <- names(which(colSums(alphadj) > 0))

# drop = FALSE keeps the two-dimensional shape when only one gene is
# predictable, so later [branch, gene] indexing still works.
alphapredict <- alphapredict[, predictable, drop = FALSE]

# Binarize the observed gain probabilities into 0/1 labels. A probability of
# exactly 0.5 is mapped to 0 (it used to be left as 0.5), which matches how
# round() treats 0.5 when the gains are counted elsewhere in this script.
alphatrue <- realgains[alphaspecs, predictable, drop = FALSE]
alphatrue[alphatrue > .5] <- 1
alphatrue[alphatrue <= .5] <- 0

# Build the ROC input: one row per (branch, gene) pair, skipping pairs where
# the ancestral presence value of the gene on that branch exceeds 0.4.
# Each row is c(normalized score, true label, branch, gene). Mixing numbers
# with names in c() coerces the row to character, so the result is a
# character matrix.
#
# Rows are collected in a preallocated list and bound once at the end;
# calling rbind() inside the loop re-copies the whole matrix for every row.
roc_rows <- vector("list", length(predictable) * length(alphaspecs))
n_rows <- 0L

for (gene in predictable) {
  # Number of incoming entries for this gene; used to scale the raw score.
  max_in <- sum(alphadj[, gene])
  for (branch in alphaspecs) {
    if (alphapres[branch, gene] > .4) {
      next
    }
    comparison <- c(
      alphapredict[branch, gene] / max_in,
      alphatrue[branch, gene],
      branch,
      gene
    )
    n_rows <- n_rows + 1L
    roc_rows[[n_rows]] <- comparison
  }
}

# Trim unused slots. Naming each element "comparison" reproduces the row names
# that repeated rbind(alpha_for_roc, comparison) calls would have produced.
roc_rows <- roc_rows[seq_len(n_rows)]
names(roc_rows) <- rep("comparison", n_rows)

# do.call(rbind, list()) yields NULL, the same value as the former empty c().
alpha_for_roc <- do.call(rbind, roc_rows)

# Load the project's ROC helper and run it on the collected comparisons.
source('code/figure_scripts/roc.R')
print('ROC for alpha/betaproteobacteria partition')
ROC(alpha_for_roc)

# Summary counts for this run, printed one per line as "<label> <value>".
# Missing gain totals are ignored when counting the gained genes.
run_summary <- list(
  'number of edges:' = edgenum,
  'number of predictable genes:' = length(predictable),
  'number of gained genes:' = sum(alphagains >= 1, na.rm = TRUE)
)
for (label in names(run_summary)) {
  cat(label, run_summary[[label]], '\n')
}
