#R script used for analysis in "Growth disrupting mutations in epigenetic regulatory molecules are associated with abnormalities of epigenetic aging"
#https://github.com/arjeffries/TBRS2019

library(limma)
library(minfi)
library(IlluminaHumanMethylation450kanno.ilmn12.hg19)
library(IlluminaHumanMethylation450kmanifest)
library(RColorBrewer)
library(missMethyl)
library(matrixStats)
library(minfiData)
library(Gviz)
library(DMRcate)
library(stringr)
library(GenomicRanges)
library(reshape2)
library(wateRmelon)

setwd("/mnt/data1/DNMT3A")
cross <- read.csv('CrossHybridisingProbesPriceORWeksberg.csv')
annot <- read.csv("AdditionalAnnotation450K_Price_Great_SNPsinProbeSequence.csv", stringsAsFactors = F)
pheno<-read.csv("SampleSheet.csv", stringsAsFactors = FALSE)

#readin files
path <- "/mnt/data1/DNMT3A/idat"
m <- readEPIC(path, n=T, oob=T)

#normalize
meth.pf <- pfilter(m)
meth.pf.dasen <- dasen(meth.pf)
beta <- betas(meth.pf.dasen)
rawbeta <- betas(m)

#remove crosshyb probes
y <- rownames(beta)
x <- y %in% cross$x
beta <- beta[!x,]

#remove crosshyb probes
y <- rownames(rawbeta)
x <- y %in% cross$x
rawbeta <- rawbeta[!x,]

#remove common SNPs from Weksburg list
y <- rownames(beta)
x <- y %in% annot$TargetID[annot$Weksburg_CommonSNP_Af_within10bpSBE != ""]
beta <- beta[!x,]

#remove common SNPs from Illumina list
y <- rownames(beta)
x <- y %in% annot$TargetID[annot$Illumina_CommonSNP_Af_within10bpSBE != ""]
beta <- beta[!x,]

#remove common SNPs from Weksburg list
y <- rownames(rawbeta)
x <- y %in% annot$TargetID[annot$Weksburg_CommonSNP_Af_within10bpSBE != ""]
rawbeta <- rawbeta[!x,]

#remove common SNPs from Illumina list
y <- rownames(rawbeta)
x <- y %in% annot$TargetID[annot$Illumina_CommonSNP_Af_within10bpSBE != ""]
rawbeta <- rawbeta[!x,]

#remove rs SNPs
beta <- beta[!grepl("rs", rownames(beta)),]

#Lookup in annotation
cg <- rownames(beta)
y <- match(cg,annot$TargetID)
annot <- annot[y,]  #retrieve annot in correct order and trim out unused probes

#add more friendly names to the columns
x <- colnames(beta)
x <- gsub("outlier/","",x) #added as outlier string added by watermelon in new version ?
idatID <- paste(pheno$Sentrix_ID,"_",pheno$Sentrix_Position, sep="")
y <- match(x,idatID)
y <- pheno[y,3]  #substitue in real names
colnames(beta) <- y

#add more friendly names to the columns
x <- colnames(rawbeta)
idatID <- paste(pheno$Sentrix_ID,"_",pheno$Sentrix_Position, sep="")
y <- match(x,idatID)
y <- pheno[y,3]  #substitue in real names
colnames(rawbeta) <- y

#reshuffle the pheno file so that in the same order
x <- colnames(beta)
y <- match(x,pheno$Sample_ID)
pheno <- pheno[y,]  #retrieve annot in correct order
pheno$Genotype <- as.factor(pheno$Genotype)

#Make a beta matrix and annotation matrix with no X or Y Chromosome data
beta_noxy <- beta[annot$CHR_37 !="X" & annot$CHR_37 !="Y" & annot$CHR_37 != "XY",]
annot_noxy <- annot[annot$CHR_37 !="X" & annot$CHR_37 !="Y" & annot$CHR_37 != "XY",]


#Check bisulfite conversion rates
greenChannel<-intensitiesByChannel(QCdata(m))$Cy3 
redChannel<-intensitiesByChannel(QCdata(m))$Cy5 

### these are fully methylated controls so closer to 1/100% the better the BS conversion. 
### NOTE due to different chemistries type I conversion controls and type II conversion controls need to be handled differently. 
greenChannel<-greenChannel[grep("BS.Conversion", rownames(greenChannel)),] 
redChannel<-redChannel[grep("BS.Conversion", rownames(redChannel)),] 
BScon1<-rbind(greenChannel[c("BS.Conversion.I.C1", "BS.Conversion.I.C2", "BS.Conversion.I.C3"),], 
redChannel[c("BS.Conversion.I.C4", "BS.Conversion.I.C5", "BS.Conversion.I.C6"),]) ## this is method 1 
BScon2<-redChannel[c("BS.Conversion.II.1", "BS.Conversion.II.2", "BS.Conversion.II.3", "BS.Conversion.II.4"),] 
## this is method 1 
### for type II probes red channel is M and green channel is U so easy conversion into beta values 
BScon2.betas<-redChannel[c("BS.Conversion.II.1", "BS.Conversion.II.2", "BS.Conversion.II.3", 
"BS.Conversion.II.4"),]/(redChannel[c("BS.Conversion.II.1", "BS.Conversion.II.2", "BS.Conversion.II.3","BS.Conversion.II.4"),]+greenChannel[c("BS.Conversion.II.1","BS.Conversion.II.2","BS.Conversion.II.3","BS.Conversion.II.4"),]) 
### for type I both methylated and unmethylated in the same channel 
BScon1.betas<-rbind(greenChannel[c("BS.Conversion.I.C1", "BS.Conversion.I.C2","BS.Conversion.I.C3"),], redChannel[c("BS.Conversion.I.C4", "BS.Conversion.I.C5", "BS.Conversion.I.C6"),])/(rbind(greenChannel[c("BS.Conversion.I.C1","BS.Conversion.I.C2","BS.Conversion.I.C3"),], redChannel[c("BS.Conversion.I.C4", "BS.Conversion.I.C5","BS.Conversion.I.C6"),]) + rbind(greenChannel[c("BS.Conversion.I.U1", "BS.Conversion.I.U2","BS.Conversion.I.U3"),], redChannel[c("BS.Conversion.I.U4", "BS.Conversion.I.U5","BS.Conversion.I.U6"),])) 
BScon.betas<-rbind(BScon1.betas, BScon2.betas) 
BScon.beta.median<-apply(BScon.betas, 2, median)*100
boxplot(BScon.betas, las=2)


# visualise what the data looks like before and after normalisation
par(mfrow=c(1,2))
densityPlot(rawbeta, sampGroups=pheno$Genotype,main="Raw", legend=FALSE)
legend("top", legend = levels(factor(pheno$Genotype)), text.col=brewer.pal(8,"Dark2"))
densityPlot(beta, sampGroups=pheno$Genotype,main="Normalized", legend=FALSE)
legend("top", legend = levels(factor(pheno$Genotype)),text.col=brewer.pal(8,"Dark2"))

# Visualise with autosomes only
densityPlot(beta_noxy, sampGroups=pheno$Genotype,main="Normalized", legend=FALSE)
legend("top", legend = levels(factor(pheno$Genotype)),text.col=brewer.pal(8,"Dark2"))


# Visualise methylation density of autosome only data in specific samples for supplementary figure creation
par(mfrow=c(2,1))
samples <- c("4169","4170")  # 19 and 20y
densityPlot(beta_noxy[,(colnames(beta_noxy) %in% samples)], sampGroups=pheno$Genotype[pheno$Sample_ID %in% samples],main="Age/sex matched Amish DNMT3A variant carrier (CT) vs unaffected sib (CC) - pair 1", legend=TRUE)
samples <- c("4161","4167") # 24 and 25 year old
densityPlot(beta_noxy[,(colnames(beta_noxy) %in% samples)], sampGroups=pheno$Genotype[pheno$Sample_ID %in% samples],main="Age/sex matched Amish DNMT3A variant carrier (CT) vs unaffected sib (CC) - pair 2", legend=TRUE)

#Perform statistical comparison (Wilcoxon Rank Sum test)
samples <- c("4170","4169")  # 19 and 20y
x <- beta_noxy[,(colnames(beta_noxy) %in% samples[1])]
y <- beta_noxy[,(colnames(beta_noxy) %in% samples[2])]
wilcox.test(x,y)
summary(x) # CC
summary(y) # CT

samples <- c("4167","4161") # 24 and 25 year old
x <- beta_noxy[,(colnames(beta_noxy) %in% samples[1])]
y <- beta_noxy[,(colnames(beta_noxy) %in% samples[2])]
wilcox.test(x,y)
summary(x) # CC
summary(y) # CT


##Perform LIMMA on autosomes (no sex chromosomes) to determine DMPs
phenotempf <- pheno$Sex == "F" #make a 'F' score to match 'M' score
design <- model.matrix(~ 0+factor(pheno$Genotype)+factor(pheno$Sex)+factor(phenotempf))
colnames(design) <- c("CC", "CT", "M", "F")
contrast.matrix <- makeContrasts(CT-CC, levels=design) #make a contrast to limit comparison to genotype but correct for Sex
fit <- lmFit(beta_noxy, design)
fit2 <- contrasts.fit(fit, contrast.matrix)
fit2 <- eBayes(fit2)
DMPs <- toptable(fit2, number=nrow(fit2)) # full list
sig <- subset(DMPs, DMPs$adj.P.Val < 0.05) # significant DMPs



#Make a Volcano plot
pdf("volcano_fig1.pdf")
CTmean <- apply(beta_noxy[,pheno$Genotype=="CT"],1,mean)
CCmean <- apply(beta_noxy[,pheno$Genotype=="CC"],1,mean)
deltabeta <- CTmean - CCmean
plot(DMPs$logFC,-log10(DMPs$P.Value), pch=20, ylab=expression("-log"[10]*"(p-values)"), xlab=expression("log"[2]*"(fold change)"), cex.lab=1.5, cex.axis=1.3)
points(DMPs$logFC[DMPs$adj.P.Val < 0.05],-log10(DMPs$P.Value[DMPs$adj.P.Val < 0.05]), pch=20, col="red")
dev.off()



##Rearrange DMPs variable to order for flattening (below) which will allow probe to feature assessment
y <- match(row.names(beta_noxy),row.names(DMPs))
DMPs_noxy <- DMPs[y,]  #retrieve annot in correct order and remove any not there
CTmean <- apply(beta_noxy[,pheno$Genotype=="CT"],1,mean)
CCmean <- apply(beta_noxy[,pheno$Genotype=="CC"],1,mean)
deltabeta <- CTmean - CCmean

for(i in 1:nrow(annot_noxy)){
if (annot_noxy$UCSC_RefGene_Group[i] == "") { annot_noxy$UCSC_RefGene_Group[i] <- "intergenic" 
annot_noxy$UCSC_RefGene_Name[i] <- paste("intergenic",i,sep="") }}



#flatten out the descriptors for methylation	
	genestore <- ""
	refseqstore <- ""
	datastore <- NA
	cpgstore <- ""
	x <- data.frame()
	xindex <- 1
	for(i in 1:nrow(annot_noxy)){
		gene <- unlist(strsplit(as.character(annot_noxy$UCSC_RefGene_Name)[i], ";"))
		if(length(gene) > 0) {
		refseq <- unlist(strsplit(as.character(annot_noxy$UCSC_RefGene_Group)[i], ";"))
#		if (length(refseq)==0) { refseq <- "intergenic" }
		cpg <- unlist(strsplit(as.character(annot_noxy$Relation_to_UCSC_CpG_Island)[i], ";"))
		for(gi in 1:length(gene)) {
			genestore[xindex] <- gene[gi]
			refseqstore[xindex] <- refseq[gi]
			cpgstore[xindex] <- cpg[gi]
			datastore[xindex] <- i
			xindex <- xindex + 1
	}}}
flatgene <- data.frame(annot_noxy$TargetID[datastore], genestore,refseqstore,cpgstore,beta_noxy[datastore,], CCmean[datastore], CTmean[datastore], deltabeta[datastore], DMPs_noxy$adj.P.Val[datastore])	
x <- colnames(flatgene)
x[1] <- "TargetID"
colnames(flatgene) <- x   #rename columns

flatcpg <- flatgene[!duplicated(flatgene[c("TargetID","cpgstore")]),]	
flatgene <- flatgene[!duplicated(flatgene[c("TargetID", "refseqstore")]),]	

bkcountsgene <- summary(flatgene$refseqstore)
bkcountscpg <- summary(flatcpg$cpgstore)
countsgene <- summary(flatgene$refseqstore[flatgene$DMPs_noxy.adj.P.Val.datastore. < 0.05])
countscpg <- summary(flatcpg$cpgstore[flatcpg$DMPs_noxy.adj.P.Val.datastore. < 0.05])
bkcountsgene
countsgene
bkcountscpg
countscpg


## Perform DMR analysis using DMRcate
myAnnotation <- cpg.annotate(beta_noxy, datatype = "array", 
                             analysis.type="differential", design=design, 
                             what="Beta", arraytype = "450K", coef="CT - CC", contrasts=T, cont.matrix = contrast.matrix
							 )
DMRs <- dmrcate(myAnnotation, lambda=1000, C=2)
results.ranges <- extractRanges(DMRs, genome = "hg19")
DMRlist <- (DMRs$results) # rank ordered result list


## Dump file for web based epigenetic age prediction on 450k data
## For EPIC, use agep function of wateRmelon on dasen normalized beta
epi <- data.frame(row.names(rawbeta),rawbeta)
epicols <- colnames(epi)
epicols[1] <- "ProbeID"
colnames(epi) <- epicols
write.csv(epi, "foragecalc.csv", row.names=F) #use this for https://dnamage.genetics.ucla.edu
 


##Perform GO and KEGG analysis using missMethyl package
library(missMethyl)
library(annotate)
library(org.Hs.eg.db)
library(IlluminaHumanMethylation450kanno.ilmn12.hg19)
library(clusterProfiler)
library(KEGGREST)
	

#Perform gene ontology on 'sig' from limma with probesnames from DMPs used as a background
gont <- gometh(sig.cpg=rownames(sig), all.cpg=rownames(DMPs), collection="GO", prior.prob=T, array.type="450K")
gont <- (gont[order(gont$P.DE),])

#Perform KEGG analysis by pulling down KEGG data and assessing with gsameth function (again with sig as test, DMPs variable as background list)
hsa_kegg = download_KEGG('hsa')
kegg <- hsa_kegg$KEGGPATHID2EXTID
k <- dcast(kegg, kegg$to ~ kegg$from)
k <- data.frame(k[2:ncol(k)])
testpathway <- gsameth(sig.cpg=rownames(sig), all.cpg=rownames(DMPs), collection=k, prior.prob=T, plot.bias=T, array.type="450K")
testpathway <- data.frame(testpathway)
testpathway <- (testpathway[order(testpathway$P.DE),])

#Look to annotate the KEGG pathways into something readable rather than simply a number
desc <- ""
for (i in 1:nrow(testpathway)) {
rownames(testpathway)[i] -> ref
xxx <- keggGet(ref)
desc[i] <- xxx[[1]]$NAME
}
testpathway <- data.frame(desc, testpathway)


write.csv(testpathway, "KEGG.csv")
write.csv(gont, "GO.csv")


##read in EPIC data to compare other datasets
library(wateRmelon)
setwd("/mnt/data1/DNMT3Aextra")
path <- "/mnt/data1/DNMT3Aextra/idatsextend"
annotEPIC <- read.csv("MethylationEPIC_v-1-0_B4.csv", header=T, stringsAsFactors=F, skip=7)
m <- readEPIC(path, n=T, oob=T)

#normalize
meth.pf <- pfilter(m)
meth.pf.dasen <- dasen(meth.pf)
#meth.dasen <- dasen(m)
betaEPIC <- betas(meth.pf.dasen)
rawbetaEPIC <- betas(m)
x <- agep(betaEPIC)

phenoEPIC <- read.csv("SampleSheet2.csv")

#add more friendly names to the columns
x <- colnames(betaEPIC)
x <- gsub("outlier/","",x) #added as outlier string added by watermelon in new version ?
idatID <- paste(phenoEPIC$Sentrix_ID,"_",phenoEPIC$Sentrix_Position, sep="")
y <- match(x,idatID)
y <- phenoEPIC[y,1]  #substitue in real names
colnames(betaEPIC) <- y



#reshuffle the phenoEPIC file
x <- colnames(betaEPIC)
y <- match(x,phenoEPIC$Sample_ID)
phenoEPIC <- phenoEPIC[y,]  #retrieve in correct order
phenoEPIC$Group <- as.factor(phenoEPIC$Group)

#remove cross hyb probes - files from supplementary data at https://doi.org/10.1186/s13059-016-1066-1 and https://dx.doi.org/10.1016%2Fj.gdata.2016.05.012
crosshyb <- read.table("/mnt/data1/EPIC_reference/CrossHydridisingProbes_McCartney.txt", stringsAsFactors = F)
crosshybS1 <- read.csv("/mnt/data1/EPIC_reference/Pidsley_SM1.csv", stringsAsFactors = F) #Pidlsey crosshyb probes
SNPhybs <- read.table("/mnt/data1/EPIC_reference/SNPProbes_McCartney.txt", header=T, stringsAsFactors = F)
SNPhybsS4 <- read.csv("/mnt/data1/EPIC_reference/Pidsley_SM4.csv", stringsAsFactors = F) #Pidsley SNPs
SNPhybsS5 <- read.csv("/mnt/data1/EPIC_reference/Pidsley_SM5.csv", stringsAsFactors = F)
SNPhybsS6 <- read.csv("/mnt/data1/EPIC_reference/Pidsley_SM6.csv", stringsAsFactors = F)

#remove crosshyb probes from above datasets
y <- rownames(betaEPIC)
x <- y %in% crosshyb$V1
betaEPIC <- betaEPIC[!x,]
y <- rownames(betaEPIC)
x <- y %in% crosshybS1$X
betaEPIC <- betaEPIC[!x,]

snpProbes<-SNPhybs[which(SNPhybs$EUR_AF >= 0.05 & SNPhybs$EUR_AF <= 0.95),]

y <- rownames(betaEPIC)
x <- y %in% snpProbes$IlmnID
betaEPIC <- betaEPIC[!x,]

y <- rownames(betaEPIC)
x <- y %in% SNPhybsS4$PROBE
betaEPIC <- betaEPIC[!x,]

y <- rownames(betaEPIC)
x <- y %in% SNPhybsS5$PROBE
betaEPIC <- betaEPIC[!x,]

y <- rownames(betaEPIC)
x <- y %in% SNPhybsS6$PROBE
betaEPIC <- betaEPIC[!x,]

#remove rs SNPs
betaEPIC <- betaEPIC[!grepl("rs", rownames(betaEPIC)),]

#remove XY chromosomes
betaEPIC_allchr <- betaEPIC
xyprobes <- subset(annotEPIC, annotEPIC$CHR == "X" | annotEPIC$CHR == "Y" | annotEPIC$CHR =="XY")
y <- rownames(betaEPIC)
x <- y %in% xyprobes$IlmnID
betaEPIC <- betaEPIC[!x,] 
rm(xyprobes)


#Comparison of R771 DMPs to EPIC data
R771 <- sig
R771beta <- beta
R771betanoxy <- beta_noxy
R771annot <- annot
R771pheno <- pheno

meanEPICControl <- apply(betaEPIC[,phenoEPIC$Group=="C"],1,mean)
meanEPICMutant <- apply(betaEPIC[,phenoEPIC$Group=="D"],1,mean)
mean450kControl <- apply(R771betanoxy[,R771pheno$Genotype=="CC"],1,mean)
mean450kMutant <- apply(R771betanoxy[,R771pheno$Genotype=="CT"],1,mean)

delta450k <- mean450kMutant-mean450kControl
x <- match(rownames(R771), names(delta450k)) #subset to DMPs fround in R771 only
deltaR771 <- delta450k[x]
plot(hist(deltaR771[R771$adj.P.Val < 0.05],breaks=100)) #check all good

deltaEPIC <- meanEPICMutant-meanEPICControl
x <- match(rownames(R771), names(deltaEPIC))
deltaEPICfromR771 <- deltaEPIC[x]
plot(hist(deltaEPICfromR771,breaks=100)) #check all good

#filter to only overlapping probes
x <- match(names(deltaEPICfromR771), names(delta450k)) #subset to DMPs that are shared in EPIC
deltaR771 <- delta450k[x]
plot(hist(deltaR771[R771$adj.P.Val < 0.05],breaks=100)) #check all good

#Plot effect size
plot(deltaEPICfromR771, deltaR771, pch=16, cex=0.5, xlim=c(-0.6,+0.6), ylim=c(-0.6,0.6), ylab="DNMT3A p.(Arg771Gln)", xlab="Other TBRS DNMT3A mutations")
abline(v=0, h=0, col="red")
cor.test(deltaEPICfromR771, deltaR771)


#draw boxplot showing effect size of E771 identified DMPs in other samples
deltaEPIC <- betaEPIC-meanEPICControl
deltaEPIC <- deltaEPIC[,phenoEPIC$Group=="D"]
x <- match(names(deltaR771),rownames(deltaEPIC))
sigprobesinEPIC <- deltaEPIC[x,]
finaldeltavalues <- cbind(deltaR771, sigprobesinEPIC)
finaldeltavalues <- cbind(deltaR771[deltaR771<0], sigprobesinEPIC)
boxplot(sigprobesinEPIC[,4],sigprobesinEPIC[,13],sigprobesinEPIC[,9],sigprobesinEPIC[,3],sigprobesinEPIC[,6],sigprobesinEPIC[,5],sigprobesinEPIC[,11],sigprobesinEPIC[,12],sigprobesinEPIC[,1],sigprobesinEPIC[,8],sigprobesinEPIC[,7],finaldeltavalues[,1],sigprobesinEPIC[,14],sigprobesinEPIC[,15],sigprobesinEPIC[,2],sigprobesinEPIC[,10], outline=F, las=2, ylab=("change in methylation beta value"), col=c(6,7,5,7,7,7,7,7,7,6,5,3,7,7,7,7,7), cex.axis=1.5, cex.lab=1.5)