# ClustalW alignments and protein distances for Toxo - Neo sequence pairs

# Run this script from the directory that holds the input files (brh_toxo,
# brh_neo, the BLAST databases and options_protdist).
# `setwd("")` always fails with "cannot change working directory", so the call
# is left disabled; fill in a real path if one is needed.
# setwd("path/to/working/directory")


library(ape)
library(seqinr)

# Read the two tab-separated identifier tables (no header line).
# NOTE(review): presumably one sequence ID per line, paired by position
# between the two files -- confirm against the files themselves.
li1 <- read.table("brh_toxo", header = FALSE, sep = "\t")
li2 <- read.table("brh_neo", header = FALSE, sep = "\t")

# Flatten each table to a plain vector of its non-missing cells.
li1 <- li1[!is.na(li1)]
li2 <- li2[!is.na(li2)]
len <- length(li1)

# Row i of `mat` is labelled with li2[i], so both lists must have the same
# length; fail early with an explicit message instead of a dimnames error.
if (length(li2) != len) {
  stop(
    "brh_toxo and brh_neo must list the same number of IDs (got ",
    len, " and ", length(li2), ").",
    call. = FALSE
  )
}

# One distance per pair: a len x 1 matrix whose rows are named after li2.
# (A commented-out len x len variant used li1 as column names.)
mat <- matrix(0, nrow = len, ncol = 1)
rownames(mat) <- li2

# The protein BLAST databases queried below must exist beforehand; they can be
# built with a command such as:
# makeblastdb -parse_seqids -hash_index -dbtype prot -in gp63_tcc.pep

# For every pair (li1[x], li2[x]): fetch both sequences, align them, compute
# their distance with external tools and store it in mat[x, 1].
# seq_len() makes the loop a no-op when there are no pairs (1:len would
# iterate over c(1, 0)).
for (x in seq_len(len)) {
  # Write both sequences into tmp1 (second one appended); sed strips the
  # "lcl|" prefix from the output.
  xp <- paste('blastdbcmd -entry ', li1[x], ' -db brh_toxo.pep | sed \'s/lcl|//\'> tmp1')
  yp <- paste('blastdbcmd -entry ', li2[x], ' -db brh_neo.pep | sed \'s/lcl|//\'>> tmp1')
  system(xp)
  system(yp)

  # Align the pair with clustalw2 (PHYLIP output), then run protdist with its
  # interactive answers taken from the file options_protdist.
  system(paste('clustalw2 -INFILE=tmp1 -OUTPUT=phy >a 2>error_clustalw'))
  system('protdist < options_protdist > b 2>error_protdist')

  # protdist writes its result to `outfile` (it could be parsed differently).
  # Example of the expected layout:
  #    2
  # lcl|masp_2 0.000000 0.810140
  # lcl|masp_6 0.810140 0.00000
  # Keep the second field of the first line containing "Neo".
  # NOTE(review): this assumes the li2 identifiers contain "Neo" -- confirm.
  gr <- paste('grep Neo outfile | awk \'{print $2}\' |head -1 >dis')
  system(gr)

  # An empty `dis` means no distance was found; report which pair failed
  # rather than letting read.table() stop with "no lines available in input".
  if (!file.exists("dis") || file.size("dis") == 0) {
    stop(
      "No distance could be extracted for pair ", x, " (", li1[x], " / ",
      li2[x], "); check error_clustalw and error_protdist.",
      call. = FALSE
    )
  }
  dis <- read.table("dis")
  a <- dis$V1
  mat[x, 1] <- a
}
# write.table(mat, file = "distmat_k2p", append = FALSE, sep = "\t")

##########################################
# Distance histogram (base graphics)
##########################################
# hist() draws a first histogram with 100 requested breaks and returns the
# histogram object (plot = FALSE would skip that first drawing).
h <- hist(mat[, 1], breaks = 100)
# Re-express the bar heights as the percentage of observations per bin.
h$density <- h$counts / sum(h$counts) * 100
plot(
  h,
  freq = FALSE,
  xlim = c(0, 3),
  ylim = c(0, 8),
  xlab = "JTT aa distance",
  ylab = "Density %",
  main = "N. caninum - T. gondii distance",
  font.main = 3
)
########################################
# Same histogram with ggplot2
##########################################
library(ggplot2)

# dataset: the distance column of `mat` as a one-column data frame.
# (A throw-away rnorm() data frame that was overwritten immediately, and a
# bare `library` statement that only printed the function, were dropped.)
data <- as.data.frame(mat[, 1])
colnames(data) <- c("JTT_distance")

# Custom binning: only the bin width needs to be given.
# Option 1: plain histogram with narrow bins. The aesthetic maps the data
# frame column instead of reaching back to the global `mat`.
ggplot(data, aes(x = JTT_distance)) + geom_histogram(binwidth = 0.05)
# Option 2: wider bins, translucent green bars and a centred italic title.
ggplot(data, aes(x = JTT_distance)) + ggtitle("N.caninum - T. gondii") +
  geom_histogram(binwidth = 0.2, color = "white", fill = rgb(0.2, 0.7, 0.1, 0.4)) +
  theme(plot.title = element_text(face = "italic", size = 14, hjust = 0.5))

##########################################
### with percentage labels
##########################################
# Plot a histogram of `x` whose bar heights and bar labels are percentages.
#
# Arguments:
#   x    values handed to hist() (default breaks).
#   ...  further arguments forwarded to plot(), e.g. col.
#
# The density is rescaled using the width of the first bin, so the
# percentages are exact when all bins share that width.
histPercent <- function(x, ...) {
  H <- hist(x, plot = FALSE)
  first_bin_width <- diff(H$breaks)[1]
  H$density <- 100 * H$density * first_bin_width
  bar_labels <- paste0(round(H$density), "%")
  # Leave 8% head-room above the tallest bar so its label fits.
  upper_limit <- 1.08 * max(H$density)
  plot(H, freq = FALSE, labels = bar_labels, ylim = c(0, upper_limit), ...)
}
# Draw the percentage histogram of the distance column with gray bars.
histPercent(mat[, 1], col = "gray")
##########################################


# Split the plotting device into a 2 x 2 grid numbered row by row,
# then outline the four panels.
m <- matrix(seq_len(4), nrow = 2, ncol = 2, byrow = TRUE)
layout(m)
layout.show(4)


# Determine number of clusters: plot the within-group sum of squares against
# the number of k-means clusters.
# kmeans() stops when asked for more centers than there are distinct rows, and
# its default algorithm needs fewer centers than rows, so the largest k tried
# is capped accordingly (15 when the data allow it).
max_k <- min(15, nrow(unique(mat)), nrow(mat) - 1)
# wss[1] is the total sum of squares, i.e. the single-cluster case.
wss <- (nrow(mat) - 1) * sum(apply(mat, 2, var))
# seq_len(max_k)[-1] is 2..max_k, and empty when max_k < 2.
for (i in seq_len(max_k)[-1]) {
  wss[i] <- sum(kmeans(mat, centers = i)$withinss)
}
plot(seq_along(wss), wss, type = "b", xlab = "Number of Clusters",
     ylab = "Within groups sum of squares")
#dev.copy2pdf(file="dm28c_maspcortas_n_cluster.pdf")

# Plot MDS
# cmdscale() accepts a square distance matrix and needs more rows than the
# requested number of dimensions. A single-column `mat` (one distance per
# pair) does not qualify and made cmdscale() stop with an error, so the
# section is skipped in that case.
mds_dims <- 4
if (isTRUE(nrow(mat) == ncol(mat)) && nrow(mat) > mds_dims) {
  fit <- cmdscale(mat, eig = FALSE, k = mds_dims)
  fit1 <- kmeans(fit, 1) # one-cluster solution: every point gets the same colour
  plot(fit[, 1], fit[, 2], pch = 20, col = fit1$cluster, main = "MDS")
} else {
  message(
    "Skipping MDS plot: 'mat' is not a square distance matrix with more than ",
    mds_dims, " rows."
  )
}
#dev.copy2pdf(file="dm28c_maspcortas_mds.pdf")

# Plot cluster
library(cluster)
# Partition the rows of `mat` into 10 k-means clusters and draw them with
# clusplot(), without colouring, shading or connecting lines.
fit <- kmeans(mat, 10)
# TRUE/FALSE are spelled out because T and F are ordinary, reassignable
# variables.
# NOTE(review): labels = 6 looks to be outside the documented 0-5 codes of
# clusplot() -- confirm the intended labelling.
clusplot(mat, fit$cluster, color = FALSE, shade = FALSE, lines = 0, labels = 6)
#dev.copy2pdf(file="dm28c_maspcortas_clusplot.pdf")

# Heatmap
# heatmap() stops unless its input has at least 2 rows and 2 columns, which a
# single-column `mat` does not; skip the plot in that case instead of failing.
if (nrow(mat) >= 2 && ncol(mat) >= 2) {
  heatmap(mat)
} else {
  message("Skipping heatmap: 'mat' needs at least 2 rows and 2 columns.")
}
#dev.copy2pdf(file="dm28c_maspcortas_heatmap.pdf")

# Dendrogram
d <- dist(mat, method = "euclidean") # Euclidean distances between rows of mat
# "ward" is the old name of "ward.D" (renamed in R 3.1.0). Using the current
# name yields the same clustering without the rename message.
fit <- hclust(d, method = "ward.D")
plot(fit) # display dendrogram
#dev.copy2pdf(file="dm28c_maspcortas_dendogram.pdf")
