library(cluster)

# Input
countFile <- "RNA_counts_rep1.txt_wENSid"
rpkmFile <- "RNA_RPKM_rep1.txt_wENSid"
loopGeneFile <- "fibro_combined_30_merged_loops_sort_chronly_filtered_bed_lpsize_sort_genesENS_GeneID"

# Output
clusterFile <- "geneRcount_pam_zyggenes_k7"
barPlot <- "geneRcount_pam_zyggenes_k7_bar.pdf"


# Both tables are tab-separated with a header line. Column 1 is later used
# as the gene ID and columns 2-11 as the ten stage columns (column 2 = st9).
counts <- read.table(countFile, header = TRUE, sep = "\t")
rpkm <- read.table(rpkmFile, header = TRUE, sep = "\t")

# The gene filter further down combines per-row masks computed from both
# tables and uses them to index `counts`, so the tables must line up row by
# row. Fail loudly here instead of silently recycling a shorter mask.
# NOTE(review): this only checks the row count; it is assumed (not verified)
# that both files list the genes in the same order.
stopifnot(nrow(counts) == nrow(rpkm))


# `y` is not referenced in the visible part of this script; kept in case
# code outside this view relies on it.
y <- counts[, 2:11]
# Expression at st9 (first stage column).
st9Counts <- counts[, 2]
st9RPKM <- rpkm[, 2]
# Per-gene maximum over all ten stage columns (st9 included).
maxCountsInStages <- apply(counts[, 2:11], 1, max)
maxRpkmInStages <- apply(rpkm[, 2:11], 1, max)



# We define "zygotically expressed genes" (zeg) as genes for which
#   - the read count at st9 is < 30,
#   - the RPKM at st9 is < 2,
#   - the read count is > 30 in at least one later stage, and
#   - the RPKM is > 4 in at least one later stage.
# The maxima are taken over all stage columns (st9 included). Because the
# st9 value must lie below the corresponding "later stage" threshold, a
# maximum above that threshold can only come from a later stage.
#
# Fix: the count and RPKM maxima were previously compared against each
# other's threshold (RPKM > 30, count > 4), contradicting the definition.
# NOTE(review): this changes the selected gene set and therefore the PAM
# clustering; the hard-coded cluster re-numbering further down should be
# re-checked against the new clustering result.
zeg <- counts[st9Counts < 30 & st9RPKM < 2 &
              maxCountsInStages > 30 & maxRpkmInStages > 4, ]


# Cluster the time courses of the transcription level of the zygotically
# expressed genes.
zegCounts <- zeg[, 2:11]
# log2(count + 1), then standardize each gene's time course. scale() works
# column-wise, hence the transpose before and after.
scaledZegCounts <- t(scale(t(log2(zegCounts + 1))))
# k-medoids (PAM) with seven clusters and Euclidean distance.
pamZegCounts <- pam(scaledZegCounts, 7)
# Prepend the cluster label as column 1; columns 2-11 hold the scaled values.
# The pam object stores the labels in `clustering`; the former `$cluster`
# only worked through partial matching of `$`.
scaledZegCountsWithCluster <- cbind(pamZegCounts$clustering, scaledZegCounts)
# The cluster-ordered table is written to `clusterFile` after the clusters
# have been re-numbered below.

# Re-number the clusters to sort them by transcription start timing.
# clusterRelabel[i] is the new label given to PAM cluster i.
# NOTE(review): this mapping is hard-coded, presumably chosen by inspecting
# one particular clustering result; re-check it whenever the input data or
# the gene filter changes.
clusterRelabel <- c(1, 5, 7, 6, 4, 2, 3)
# Look up the new label from the original PAM assignment in one vectorized
# step (`clustering` is the full component name; `$cluster` relied on
# partial matching).
scaledZegCountsWithCluster[, 1] <- clusterRelabel[pamZegCounts$clustering]

# Sort genes by their new cluster label and write the scaled time courses.
# Only columns 2-11 are written: the cluster label column and the row names
# are not part of the output file.
orderedScaledZegCountsWithCluster <-
  scaledZegCountsWithCluster[order(scaledZegCountsWithCluster[, 1]), ]
write.table(orderedScaledZegCountsWithCluster[, 2:11], clusterFile,
            sep = "\t", append = FALSE, quote = FALSE,
            row.names = FALSE, col.names = TRUE)


# Loop genes
# Columns used from the loop/gene table (no header line):
#   column 4  - compared against 100000 to select long-range loops
#               (presumably the loop size in bp -- confirm against the file)
#   column 8  - gene ID, matched against column 1 of `zeg`
#   column 10 - distance between TSS and loop anchor; 0 means the TSS and
#               the loop anchor are in the same bin
loopGenes <- read.table(loopGeneFile, header = FALSE)
# Genes whose TSS sits on the anchor of a long-range loop (>= 100 kb).
longRangeLoopGeneIDs <- loopGenes[loopGenes[, 4] >= 100000 & loopGenes[, 10] == 0, 8]
# Genes whose TSS sits on the anchor of any loop.
loopGeneIDs <- loopGenes[loopGenes[, 10] == 0, 8]

# `drop = FALSE` keeps the result a matrix even when a single gene matches,
# so the column indexing and nrow() calls that follow keep working.
# Long-range loop genes among the zygotically expressed genes
z <- scaledZegCountsWithCluster[zeg[, 1] %in% longRangeLoopGeneIDs, , drop = FALSE]
# Non-loop genes among the zygotically expressed genes
w <- scaledZegCountsWithCluster[!(zeg[, 1] %in% loopGeneIDs), , drop = FALSE]

# Number of genes per cluster label (column 1) and the group totals.
# z: long-range loop genes
z1 <- sum(z[, 1] == 1)
z2 <- sum(z[, 1] == 2)
z3 <- sum(z[, 1] == 3)
z4 <- sum(z[, 1] == 4)
z5 <- sum(z[, 1] == 5)
z6 <- sum(z[, 1] == 6)
z7 <- sum(z[, 1] == 7)
zt <- nrow(z)
# w: non-loop genes
w1 <- sum(w[, 1] == 1)
w2 <- sum(w[, 1] == 2)
w3 <- sum(w[, 1] == 3)
w4 <- sum(w[, 1] == 4)
w5 <- sum(w[, 1] == 5)
w6 <- sum(w[, 1] == 6)
w7 <- sum(w[, 1] == 7)
wt <- nrow(w)


# Plot the result
# For every cluster, draw the fraction of long-range loop genes falling in
# that cluster next to the fraction of non-loop genes; a zero-height bar
# separates consecutive clusters. The three colours in `col` are recycled
# over the (loop, non-loop, spacer) triplets.
pdf(barPlot, height = 6, width = 6)
# Fix: the upper y limit was written as `0. 25`, which is a syntax error and
# prevented the whole script from being parsed.
barplot(c(z1/zt, w1/wt, 0,
          z2/zt, w2/wt, 0,
          z3/zt, w3/wt, 0,
          z4/zt, w4/wt, 0,
          z5/zt, w5/wt, 0,
          z6/zt, w6/wt, 0,
          z7/zt, w7/wt), col = c(2, 8, 1), ylim = c(0, 0.25), las = 1)
legend("topleft", c('loop genes','non loop genes'), col = c(2, 8), pch = 15)
dev.off()
