## calculates distance (Euclidean, non-parametric correlation, and Hamming distance)
## uses hopach for unsupervised machine learning

# Working directory: the input table below and every relative output path
# used later in this script resolve against it.
setwd("~/Documents/...")

# Tab-delimited k-mer table with a header row. as.is = TRUE keeps character
# columns as plain strings.
# Expected column layout: sequence name, sequence length, then one column
# per k-mer.
dat <- read.delim(
  "olfrnameclean2-noempty.fa.8-mer.minLen500.RCcombo.txt",
  header = TRUE,
  as.is = TRUE
)

# K-mer thresholding method:
#   "presenceOfKmer" - based on the number of sequences containing the k-mer
#   "sumOfKmer"      - based on the total count of each k-mer over all sequences
meth <- "presenceOfKmer"
# meth <- "sumOfKmer"

# Threshold level, multiplied by the number of sequences further down:
#   meth == "presenceOfKmer": fraction of sequences in which the k-mer is present
#   meth == "sumOfKmer":      average number of occurrences of the k-mer per sequence
percent <- 0.2

############################################

# Build the k-mer count table (one row per sequence, one column per k-mer),
# keep only the k-mers that pass the abundance threshold, log2-transform the
# counts and drop sequences that contain none of the retained k-mers.
#
# Reads:  dat, meth, percent
# Writes: kmers, sumMers, minCount, which.minCount, kmers.min, sumGenes,
#         which.notZero

# Everything after the first two columns is a k-mer count column.
kmers <- dat[, -c(1:2), drop = FALSE]
# Take the sequence names from the first column by position. `dat$seq` relied
# on `$` partial matching, which returns NULL when more than one column name
# starts with "seq" (e.g. seqName and seqLength) and would silently discard
# the names.
rownames(kmers) <- dat[[1]]

# Work on a plain matrix so column/row reductions are vectorised.
kmerCounts <- as.matrix(kmers)

if (meth == "presenceOfKmer") {
  # Number of sequences in which each k-mer occurs at least once.
  sumMers <- colSums(kmerCounts > 0, na.rm = TRUE)
} else {
  # Total number of occurrences of each k-mer over all sequences.
  sumMers <- colSums(kmerCounts)
}

summary(sumMers)

# A k-mer is kept when its statistic exceeds percent * (number of sequences).
minCount <- nrow(kmers) * percent
which.minCount <- which(sumMers > minCount)
length(which.minCount)

# log2(count + 1) of the retained k-mers. drop = FALSE keeps a matrix even
# when only a single k-mer passes the threshold.
kmers.min <- log2(kmerCounts[, which.minCount, drop = FALSE] + 1)
rownames(kmers.min) <- rownames(kmers)

# Remove sequences whose retained k-mer values are all zero.
sumGenes <- rowSums(kmers.min)
which.notZero <- which(sumGenes > 0)
# hist(sumMers[which.minCount],nclass=100,ylim=c(0,3000)) # most between 20-30
kmers.min <- kmers.min[which.notZero, , drop = FALSE]


library("affy")

# Mean of the log2 values for each k-mer (column), ignoring NAs.
means <- apply(kmers.min, MARGIN = 2, FUN = mean, na.rm = TRUE)
summary(means)

# Center every k-mer column on its median and divide it by its mean.
# rowMedians() works on rows, hence the transpose to get per-column medians.
# NOTE(review): the divisor is the column mean, not a standard deviation;
# confirm this is the intended scaling.
kmerCenters <- rowMedians(t(kmers.min))
kmerScales <- means[colnames(kmers.min)]
gvals <- scale(kmers.min, center = kmerCenters, scale = kmerScales)

library("hopach")

# Euclidean distance
# Pairwise distance matrix computed from gvals, then HOPACH clustering run on
# that precomputed matrix (mss = "med").
seq.euc.dist <- distancematrix(gvals, d = "euclid")
seq.euc.hobj <- hopach(gvals, dmat = seq.euc.dist, d = "euclid", mss = "med")

# Distinct cluster labels in ascending order, and the sizes reported by hopach.
sort(unique(seq.euc.hobj$clust$labels))
seq.euc.hobj$clust$sizes

# Persist both objects; the file names embed the threshold in `percent`.
save(seq.euc.dist, file = paste0("seq.min", percent, ".euc.dist.RData"))
save(seq.euc.hobj, file = paste0("seq.min", percent, ".euc.hobj.RData"))

### writing out cluster info
# For every cluster with at least `minInCluster` members, write the members'
# pairwise values from seq.euc.dist (rounded to 2 decimals) to a tab-delimited
# file named "<label>method-euc_numGenes<size>.txt".
#
# Reads:  seq.euc.hobj, seq.euc.dist, gvals

minInCluster <- 5
eucLabels <- seq.euc.hobj$clust$labels
clusters <- sort(unique(eucLabels))
# Count members directly from the labels so that numInCluster[k] is guaranteed
# to belong to clusters[k], independent of the ordering of `$clust$sizes`.
numInCluster <- tabulate(match(eucLabels, clusters), nbins = length(clusters))

goodClust <- which(numInCluster >= minInCluster)
newCluster <- cbind(clusters[goodClust], numInCluster[goodClust])

# as.matrix() is a no-op for a plain matrix and expands an hdist object, so
# the two-index subsetting below works for either return type.
eucMat <- as.matrix(seq.euc.dist)

# seq_along() makes the loop a no-op when no cluster is large enough
# (1:length(x) would iterate over c(1, 0)).
for (i in seq_along(goodClust)) {
  whichClust <- which(eucLabels == newCluster[i, 1])
  # drop = FALSE keeps a matrix even for a single-member cluster.
  genes <- round(eucMat[whichClust, whichClust, drop = FALSE], 2)
  memberNames <- rownames(gvals)[whichClust]
  dimnames(genes) <- list(memberNames, memberNames)
  write.table(
    genes,
    file = paste0(newCluster[i, 1], "method-euc_numGenes", newCluster[i, 2], ".txt"),
    sep = "\t", col.names = TRUE, row.names = TRUE, quote = FALSE
  )
}



# correlation - using Kendall's Tau
# cor() returns a similarity (1 = identical ranking), whereas hopach's `dmat`
# argument is a matrix of pairwise distances. Convert to a dissimilarity with
# 1 - tau so that strongly correlated sequences are close (0) and
# anti-correlated ones are far apart (2).
seq.cor.dist <- 1 - cor(t(gvals), method = "kendall")
# seq.cor.dist = distancematrix(gvals,d="cor")	# what is cor type?
seq.cor.hobj <- hopach(gvals, dmat = seq.cor.dist, d = "cor", mss = "med")

# Distinct cluster labels in ascending order, and the sizes reported by hopach.
unique(sort(seq.cor.hobj$clust$labels))
seq.cor.hobj$clust$sizes

# Persist both objects; the file names embed the threshold in `percent`.
save(seq.cor.dist, file = paste0("seq.min", percent, ".cor.dist.RData"))
save(seq.cor.hobj, file = paste0("seq.min", percent, ".cor.hobj.RData"))

### writing out cluster info
# For every cluster with at least `minInCluster` members, write the members'
# pairwise values from seq.cor.dist (rounded to 2 decimals) to a tab-delimited
# file named "<label>method-cor_numGenes<size>.txt".
#
# Reads:  seq.cor.hobj, seq.cor.dist, gvals

minInCluster <- 5
corLabels <- seq.cor.hobj$clust$labels
clusters <- sort(unique(corLabels))
# Count members directly from the labels so that numInCluster[k] is guaranteed
# to belong to clusters[k], independent of the ordering of `$clust$sizes`.
numInCluster <- tabulate(match(corLabels, clusters), nbins = length(clusters))

goodClust <- which(numInCluster >= minInCluster)
newCluster <- cbind(clusters[goodClust], numInCluster[goodClust])

# seq_along() makes the loop a no-op when no cluster is large enough
# (1:length(x) would iterate over c(1, 0)).
for (i in seq_along(goodClust)) {
  whichClust <- which(corLabels == newCluster[i, 1])
  # drop = FALSE keeps a matrix even for a single-member cluster.
  genes <- round(seq.cor.dist[whichClust, whichClust, drop = FALSE], 2)
  memberNames <- rownames(gvals)[whichClust]
  dimnames(genes) <- list(memberNames, memberNames)
  write.table(
    genes,
    file = paste0(newCluster[i, 1], "method-cor_numGenes", newCluster[i, 2], ".txt"),
    sep = "\t", col.names = TRUE, row.names = TRUE, quote = FALSE
  )
}


# try Hamming distance
# Pairwise Hamming distance between the rows of gvals (via e1071), then HOPACH
# clustering on that precomputed matrix.
# NOTE(review): gvals holds centered/scaled values rather than binary
# presence/absence flags; confirm that is the intended input here.
# NOTE(review): no `d` is passed to hopach, so its default metric applies
# wherever hopach computes distances itself; confirm this is intended.

library("e1071")
seq.hamm.dist <- hamming.distance(gvals)
seq.hamm.hobj <- hopach(gvals, dmat = seq.hamm.dist, mss = "med")

# Distinct cluster labels in ascending order, and the sizes reported by hopach.
sort(unique(seq.hamm.hobj$clust$labels))
seq.hamm.hobj$clust$sizes

# Persist both objects; the file names embed the threshold in `percent`.
save(seq.hamm.dist, file = paste0("seq.min", percent, ".hamm.dist.RData"))
save(seq.hamm.hobj, file = paste0("seq.min", percent, ".hamm.hobj.RData"))

### writing out cluster info
# For every cluster with at least `minInCluster` members, write the members'
# pairwise values from seq.hamm.dist (rounded to 2 decimals) to a
# tab-delimited file named "<label>method-hamm_numGenes<size>.txt".
#
# Reads:  seq.hamm.hobj, seq.hamm.dist, gvals

minInCluster <- 5
hammLabels <- seq.hamm.hobj$clust$labels
clusters <- sort(unique(hammLabels))
# Count members directly from the labels so that numInCluster[k] is guaranteed
# to belong to clusters[k], independent of the ordering of `$clust$sizes`.
numInCluster <- tabulate(match(hammLabels, clusters), nbins = length(clusters))

goodClust <- which(numInCluster >= minInCluster)
newCluster <- cbind(clusters[goodClust], numInCluster[goodClust])

# seq_along() makes the loop a no-op when no cluster is large enough
# (1:length(x) would iterate over c(1, 0)).
for (i in seq_along(goodClust)) {
  whichClust <- which(hammLabels == newCluster[i, 1])
  # drop = FALSE keeps a matrix even for a single-member cluster.
  genes <- round(seq.hamm.dist[whichClust, whichClust, drop = FALSE], 2)
  memberNames <- rownames(gvals)[whichClust]
  dimnames(genes) <- list(memberNames, memberNames)
  write.table(
    genes,
    file = paste0(newCluster[i, 1], "method-hamm_numGenes", newCluster[i, 2], ".txt"),
    sep = "\t", col.names = TRUE, row.names = TRUE, quote = FALSE
  )
}





# heatmaps

#distMat = as.matrix(seq.hamm.dist)
#rownames(distMat) = colnames(distMat) = rownames(gvals)
#heatmap(distMat,Colv=NULL,symm=T)	# symm=T is very important!