# Compute the distance from the root of gene gains, grouped by topological-sort (toposort) rank.
# Must be run within/after 'plot_alltopos_motree_072314.R'.
require(ape)
# Toposort rank table (row names are used as gene ids further down).
# Kept as a plain data frame: the ranking loop uses rownames(ordering) and
# ordering[,], and wrapping the table in as.vector() is not guaranteed to
# preserve data-frame structure (recent R returns a bare list instead).
ordering = read.table('processed_data/reproduced_analysis/hgt_toposort_attr_090915.txt',header=TRUE)
tree = read.tree('gainLoss_results/MOtree_GLrun/TheTree.INodes.ph')

# Get rid of archaea: 'motree_noarch_specs' lists the KEGG ids of the
# bacteria on the tree, so every tip that is not in that list is dropped.
bact = read.table('processed_data/motree_noarch_specs')
archs = tree$tip.label[ !(tree$tip.label %in% as.vector(bact[,]))]

bactree = drop.tip(tree,archs)

# Rescale branch lengths with chronopl (lambda = 0, age.min = 0.5).
tree = chronopl(bactree, 0, age.min=.5, tol=1e-8)
#error()

# Pairwise distances between all nodes (tips first, then internal nodes).
alldists = dist.nodes(tree)
# The root of an ape 'phylo' tree is the first internal node, i.e. node number
# (number of tips) + 1. That was 635 in the full tree ([N1]) and 562 in the
# bact-only tree; deriving it keeps the index right if the species list changes.
root_node = length(bactree$tip.label) + 1
dist_from_root = alldists[,root_node]
# Label each distance with its tip / internal-node name, in dist.nodes order.
names(dist_from_root) = append(bactree$tip.label,bactree$node.label)

# Binarise `gains`: entries above `thresh` become 1 (gain), all others 0.
# NOTE(review): `gains` is not defined in this file; presumably it comes from
# the script this one is run after -- confirm.
thresh = .6

gain_mat = as.matrix(gains)
gain_mat[gains>thresh] = 1
gain_mat[gains<=thresh] = 0

# Keep only the tips/nodes present in dist_from_root, in the same order.
# drop=FALSE keeps gain_mat a matrix even when `gains` has a single column,
# so the later gain_mat[,gene] lookups keep working.
gain_mat = gain_mat[names(dist_from_root), , drop=FALSE]

# Accumulators, one slot per toposort rank (1-5). rank1..rank5 are empty
# placeholders; the values actually collected end up in `dists`.
rank1 = rank2 = rank3 = rank4 = rank5 = NULL
#rank6 = NULL

dists = vector('list', 5)

# For every rank, visit each gene of that rank and record the mean root
# distance of the branches on which the gene is flagged as gained.
rank_levels = sort(unique(as.vector(ordering[,])))
for (rank in rank_levels) {
	print(rank)
	genes_in_rank = rownames(ordering)[ordering[,]==rank]
	for (gene in genes_in_rank) {
		gained_on = rownames(gain_mat)[gain_mat[,gene]==1]
		# a gene with no flagged branch contributes nothing to its rank
		if (length(gained_on) < 1) next
		# one value per gene: the average over its gain branches, NAs ignored
		# (to pool every branch instead, append dist_from_root[gained_on] itself)
		gene_depth = mean(dist_from_root[gained_on], na.rm=TRUE)
		dists[[rank]] = c(dists[[rank]], gene_depth)
	}
}
	
	
# Summarise a numeric sample: mean, standard error of the mean, sample size
# and a normal-approximation confidence interval around the mean.
#
# x: numeric vector. NAs are not removed, so any NA propagates to the result.
# z: half-width of the interval in standard errors; the default 1.96 gives a
#    95% interval, matching the 1.96 * sd / sqrt(n) error bars drawn below.
#
# Returns a named numeric vector with elements
# 'mean', 'SE', 'N', 'upper CI', 'lower CI'.
calcCI = function(x, z = 1.96) {
	mu = mean(x)
	n = length(x)
	# standard error of the mean (sd / sqrt(n)), not the raw standard deviation
	se = sd(x)/sqrt(n)
	upper = mu + z*se
	lower = mu - z*se
	stats = c(mu,se,n,upper,lower)
	names(stats) = c('mean','SE','N','upper CI','lower CI')
	return(stats)
}

# Long-format table of the collected values: column 1 is the rank, column 2
# the per-gene mean root distance. It is built from however many ranks `dists`
# holds, and ranks that collected nothing simply contribute no rows (a chain
# of rbind(cbind(...)) calls fails on an empty rank).
n_per_rank = vapply(dists, length, integer(1))
depth = cbind(rep(seq_along(dists), n_per_rank), unlist(dists), deparse.level=0)
depth = na.omit(depth)

# Association between rank and depth: Pearson, then Spearman.
print(cor.test(depth[,1],(1-depth[,2])))
print(cor.test(depth[,1],(1-depth[,2]),method='spearman'))

# Per-rank mean, standard deviation and count of the depths.
means = by((1-depth[,2]),depth[,1],mean)
sds = by((1-depth[,2]),depth[,1],sd)
lengths = by((1-depth[,2]),depth[,1],length)	

# Tried as a barplot/plotting means, but the distribution is so skewed that it is not a very meaningful representation...
#pdf('boxplot_rankvs_rootdist_bygain.pdf')
# 1-depth because depth[,2] is actually distance from the root, which is the inverse of depth.
# Recall that all tips are equidistant (1.0) from the root - chronopl did that.
boxed = boxplot((1-depth[,2])~depth[,1],ylab='Phylogenetic depth of gains',
# box widths scaled to the number of gains in each rank
boxwex = lengths/1500,cex=.5,notch=TRUE)
# Overlay the per-rank means; boxes sit at x = 1..k for the k ranks that have
# data, so the positions follow `means` instead of assuming five ranks.
points(seq_along(means),means,col='red',pch=19,cex=.75)
#arrows(boxed, means - 1.96*(sds / sqrt(lengths)), boxed, means + 1.96*(sds / sqrt(lengths)), length=0,angle=90,col='red')

print('now plot barplot of same with 95% CIs- probably less informative.')
barred = barplot(means, ylab='Phylogenetic depth of gains', xlab='Topological rank',  ylim = c(0.0,.1))#, col='white')
# 95% CI for depths (mean +/- 1.96 * sd / sqrt(n))... maybe not super well-considered, totally non-normal
arrows(barred, means - 1.96*(sds / sqrt(lengths)), barred, means + 1.96*(sds / sqrt(lengths)), length=0,angle=90)
#dev.off()	


