# Read data ----
# Tab-separated allele-frequency table: the first column supplies the row
# (mutation) names, every other column is one sample.
freq <- as.matrix(
  read.csv(paste0(data_dir, "/data/data_VAF.txt"), row.names = 1, sep = "\t")
)
# Rescale to fractions (presumably the file stores percentages -- TODO confirm).
freq <- freq / 100

# Remove the noisy mutation.
# A logical mask is used instead of `-which(...)`: when nothing matches,
# `-which(...)` is `integer(0)` and would silently drop EVERY row.
# `drop = FALSE` keeps `freq` a matrix even with one row or one column left.
freq <- freq[rownames(freq) != "MosJ 42", , drop = FALSE]

# Add root data (freq = 0.5 in every sample) as the last row, named "root".
freq <- rbind(freq, root = 0.5)

# Remove low allele frequency data: keep rows whose maximum over all samples
# is at least 0.05.
row_max <- apply(freq, 1, max)
if (anyNA(row_max)) {
  # The per-row `if (max(...) < 0.05)` this replaces also failed on NA input,
  # just with a less helpful message.
  stop("allele-frequency table contains missing values", call. = FALSE)
}
freq <- freq[row_max >= 0.05, , drop = FALSE]

# Bookkeeping used by the following steps.
mnames <- rownames(freq)
mutation_num <- nrow(freq)
sample_num <- ncol(freq)
# Distance between mutations ----
# d(i, j) = sum_s (f[i, s] - f[j, s])^2 /
#           (sample_num * 0.5 * (mean(f[i, ]) + mean(f[j, ])))
# The expression is symmetric in i and j, so each pair is evaluated once and
# mirrored; the diagonal is 0 by definition.
row_mean <- apply(freq, 1, mean)
Distance_matrix <- matrix(0, nrow = mutation_num, ncol = mutation_num)
for (i in seq_len(mutation_num)) {
  for (j in seq_len(i - 1)) {
    pair_distance <- sum((freq[i, ] - freq[j, ])**2) /
      (sample_num * 0.5 * (row_mean[[i]] + row_mean[[j]]))
    Distance_matrix[i, j] <- pair_distance
    Distance_matrix[j, i] <- pair_distance
  }
}

# Edge weights are -log(d(i, j)); identical mutations (d == 0) get Inf because
# -log(0) is Inf, so no special case is needed.
stopifnot(is.numeric(theta), length(theta) == 1L, !is.na(theta))
minus_log_Distance_matrix <- -log(Distance_matrix)
if (anyNA(minus_log_Distance_matrix)) {
  # NaN arises from NA input or a non-positive denominator; fail loudly rather
  # than hand undefined weights to the graph construction.
  stop("distance between mutations is undefined (NA/NaN) for some pairs",
       call. = FALSE)
}
# Remove edges that satisfy -log(d(i, j)) < theta (weight 0 means "no edge").
minus_log_Distance_matrix[minus_log_Distance_matrix < theta] <- 0
# Cluster mutations into mutation nodes ----
# Each connected component of the thresholded -log(distance) graph becomes one
# mutation node.
# NOTE(review): graph.adjacency() and clusters() appear to be deprecated
# aliases of graph_from_adjacency_matrix() and components() in recent igraph
# releases -- confirm against the installed version before switching.
g <- graph.adjacency(minus_log_Distance_matrix, weighted = TRUE, mode = "undirected")
cls <- clusters(g, "weak")
mutation_node_num <- cls$no

# Row indices (into `freq`) of the mutations belonging to each node.
mutation_node_list <- lapply(
  seq_len(mutation_node_num),
  function(node) which(cls$membership == node)
)

# Label of each node: the names of its member mutations, one per line.
mutation_node_mnames <- vapply(
  mutation_node_list,
  function(members) paste(rownames(freq)[members], collapse = "\n"),
  character(1)
)
# Mean allele frequency of each mutation node ----
# Row i of `mean_freq` is the per-sample average over the mutations in node i.
mean_freq <- matrix(0, nrow = mutation_node_num, ncol = ncol(freq))
for (i in seq_len(mutation_node_num)) {
  members <- mutation_node_list[[i]]
  # `drop = FALSE` keeps the selection a matrix for single-member nodes and for
  # single-sample data, so colSums() is valid in every case (a one-row sum
  # divided by 1 is just that row).
  mean_freq[i, ] <- colSums(freq[members, , drop = FALSE]) / length(members)
}
# Inference of "one mother, two daughters" relationships ----
# Node i is the mother of the pair (j, k) when its mean frequency profile is
# close to the sum of theirs:
#   sum((Y - X)^2 / sample_num) / mean(X) < epsilon1,
#   Y = mean_freq[i, ], X = mean_freq[j, ] + mean_freq[k, ].
# Issum_matrix[d, m] == 1 records "d is a daughter of m".
# epsilon1 starts at 0.005 and is tightened in steps of 0.00025 until no mother
# has more than two daughters and no daughter has more than one mother.
# NOTE(review): epsilon1 is decremented by repeated floating-point subtraction,
# so the final step may not land exactly on 0; the schedule is left unchanged.
epsilon1 <- 0.005
# matrix(0, n, n) rather than diag(rep(0, n)): diag() treats a length-one
# argument as a size, so a single node would yield a 0 x 0 matrix.
Issum_matrix <- matrix(0, nrow = mutation_node_num, ncol = mutation_node_num)
while (epsilon1 > 0.0) {
  print(paste("current epsilon1 =",epsilon1,sep=" "))
  Issum_matrix <- matrix(0, nrow = mutation_node_num, ncol = mutation_node_num)
  # Every unordered daughter pair j < k; empty when there are fewer than 2 nodes.
  for (j in seq_len(max(mutation_node_num - 1, 0))) {
    for (k in (j + 1):mutation_node_num) {
      # The pair's summed profile does not depend on the candidate mother.
      X <- mean_freq[j, ] + mean_freq[k, ]
      mean_X <- mean(X)
      for (i in seq_len(mutation_node_num)) {
        if (i == j || i == k) {
          next
        }
        Y <- mean_freq[i, ]
        tmp <- sum((Y - X)**2 / sample_num)
        tmp <- tmp / mean_X
        if (tmp < epsilon1) {
          Issum_matrix[j, i] <- 1
          Issum_matrix[k, i] <- 1
        }
      }
    }
  }
  # Column sums count daughters per mother; row sums count mothers per daughter.
  too_many_daughters <- any(colSums(Issum_matrix != 0) > 2)
  too_many_mothers <- any(rowSums(Issum_matrix != 0) > 1)
  if (too_many_daughters || too_many_mothers) {
    epsilon1 <- epsilon1 - 0.00025
  } else {
    break
  }
}
# Draw the inferred lineage graph ----
# Write the plot to <data_dir>/out/lineage_step1.jpeg, then show it inline.
jpeg(paste0(data_dir, "/out/lineage_step1.jpeg"), height = 960, width = 960, res = 144)
# Issum_matrix is filled as [daughter, mother]; it is transposed before being
# passed to qgraph() so that rows index the mother node.
# `finally` closes the graphics device even if qgraph() fails, so a failed run
# does not leave the jpeg device open and capture later plots.
q1 <- tryCatch(
  qgraph(
    t(Issum_matrix != 0),
    layout = "spring",
    labels = mutation_node_mnames,
    edge.color = "red"
  ),
  finally = dev.off()
)
# Plot size options read by the notebook front end (repr.* options).
options(repr.plot.width = 15, repr.plot.height = 15)
plot(q1)