#!/usr/bin/Rscript
# predictions for HGT

# This file is a parsed version of the gainLoss reconstruction in which each species is mapped not to the node at the end of its branch but to that branch's ancestor (done with the parse_reconstruction_foranc.py script).
# The ancestor is used because the predictions must be based on the ancestral state; predicting from the node at the end of the branch would be circular.
# Ancestor-mapped presence table described in the file header.
allanc <- read.table(
  "processed_data/021214_MOtree_ancestralrecon.txt.gz.anc_pres",
  header = TRUE
)

# Gain table from the gainLoss run (presumably per-branch gain probabilities,
# going by the file name -- confirm).
realgains <- read.table(
  "gainLoss_results//MOtree_GLrun/AncestralReconstructPosterior.txt.gain_probs",
  header = TRUE
)

# Gene-by-gene network table and the matching presence table for this partition.
firmadj <- read.table(
  "processed_data/firm_treduced_net_112414.txt",
  header = TRUE
)
firmanc <- read.table(
  "processed_data/AncestralReconstructPosterior.txt.pres_probs.firm",
  header = TRUE
)

# Sum of every entry of the network table; reported later as the edge count.
edgenum <- sum(as.matrix(firmadj))

# Get all the firmicutes branches: the rows of allanc that are absent from
# firmanc.
# NOTE(review): presumably firmanc is the reconstruction with the firmicutes
# clade held out, so the complement is the set of firmicutes branches --
# confirm.
firmspecs <- setdiff(rownames(allanc), rownames(firmanc))

# Per gene: sum over those branches of the rounded gain values.
firmgains <- colSums(round(realgains[firmspecs, ]))

# Get relevant genes - in pgce net: any gene with a positive column sum or a
# positive row sum in firmadj.
coled <- colnames(firmadj)[colSums(firmadj) > 0]
rowed <- rownames(firmadj)[rowSums(firmadj) > 0]
both <- sort(unique(c(rowed, coled)))
# Drop genes whose column in allanc contains an NA (their column sum is NA).
both <- names(na.omit(colSums(allanc[, both])))

# drop = FALSE keeps the matrix shape (and dimnames) even when only a single
# gene survives the filters above.
firmpres <- as.matrix(allanc[firmspecs, both, drop = FALSE])
firmmodel <- as.matrix(firmadj[both, both, drop = FALSE])

# make it so that branches where gene is present will not get predicted
# there are actually better ways
#diag(firmmodel) = -999

# Check that everything is accounted for - surprisingly irritating!
stopifnot(!anyNA(firmpres))
stopifnot(!anyNA(firmmodel))

# Make predictions: for each branch, sum the network weights pointing at a
# gene over the genes present in the ancestor.
firmpredict <- firmpres %*% firmmodel

# Restrict to genes that can even be predicted (positive column sum).
predictable <- names(which(colSums(firmadj) > 0))
# A predictable gene dropped by the NA filter above would otherwise only show
# up as an opaque "subscript out of bounds" error on the next line.
stopifnot(all(predictable %in% colnames(firmpredict)))
firmpredict <- firmpredict[, predictable, drop = FALSE]

# Binarise the true gains. A value of exactly 0.5 becomes 0, which matches
# round() as used for firmgains above; previously such a value was left at 0.5
# and ended up as a label that was neither 0 nor 1. NA entries stay NA.
firmtrue <- as.matrix(realgains[firmspecs, predictable, drop = FALSE])
firmtrue[] <- as.numeric(firmtrue > 0.5)

# Build the table handed to ROC(): one row per (branch, gene) pair whose
# ancestral presence value is not above 0.4, i.e. the gene is treated as not
# already present. Columns, in order: normalised prediction score, true gain
# label, branch name, gene name. All cells are character strings and every row
# is named "comparison", exactly as the former row-by-row rbind() produced.
#
# The table is assembled in one vectorised step instead of growing a matrix
# with rbind() inside a double loop, which copied the whole table on every
# iteration. When no pair qualifies the result is an empty 4-column matrix.
firm_for_roc <- local({
  # Column sum of firmadj for each predictable gene; used to normalise scores.
  col_total <- colSums(firmadj[, predictable, drop = FALSE])

  # which() walks the matrix column by column, so rows come out gene by gene
  # with branches in firmspecs order inside each gene (same as the old loops).
  absent <- !(firmpres[firmspecs, predictable, drop = FALSE] > 0.4)
  hits <- which(absent, arr.ind = TRUE)
  branch_idx <- hits[, 1]
  gene_idx <- hits[, 2]

  # Address cells by (branch name, gene name), as the old loop did.
  cell <- cbind(firmspecs[branch_idx], predictable[gene_idx])
  score <- firmpredict[cell] / col_total[gene_idx]
  truth <- firmtrue[cell]

  out <- cbind(
    as.character(score),
    as.character(truth),
    cell[, 1],
    cell[, 2],
    deparse.level = 0
  )
  if (nrow(out) > 0) {
    rownames(out) <- rep("comparison", nrow(out))
  }
  out
})

# ROC analysis; roc.R presumably defines the ROC() helper called below.
source("code/figure_scripts/roc.R")
print("ROC + precision/recall for firmicutes partition")
ROC(firm_for_roc)

# Summary counts for this partition.
cat("number of edges:", edgenum, "\n")
cat("number of predictable genes:", length(predictable), "\n")
cat("number of gained genes:", sum(firmgains >= 1, na.rm = TRUE), "\n")

# Drop the large intermediate tables from the workspace.
rm(list = c("firmanc", "firmadj", "firmpres", "firmtrue", "realgains"))

