Enrichment of MHBs in open chromatin regions

In [ ]:
library("dplyr")
library("ggplot2")

An example using colon MHBs

In [ ]:
# Run computeCpgCov from mHapSuite.
# The arguments are passed as a vector through system2() so the call does not
# depend on backslash-newline continuations inside an R string literal.
cpgcov_args <- c(
  "-jar", "mHapSuite-2.0-jar-with-dependencies.jar", "computeCpgCov",
  "-bigwig", "colon_MM.bw",
  "-cpgPath", "hg19_CpG.gz",
  "-openChromatin", "colon_open.bed",
  "-tag", "colon_MHB",
  "-bedPath", "colon_MHB.bed",
  "-missingDataAsZero"
)
cpgcov_status <- system2("java", args = cpgcov_args)
# Fail loudly instead of carrying on after an unsuccessful run.
if (cpgcov_status != 0) {
  stop("computeCpgCov exited with status ", cpgcov_status, call. = FALSE)
}
In [11]:
# The output file lists, for each CpG site, its mean methylation and whether
# the site is covered by an open chromatin region.
result <- read.table("colon_MHB.txt")
colnames(result) <- c("chr", "beta", "Open")
head(result)
A data.frame: 6 × 3
chrbetaOpen
<chr><dbl><int>
1chr1:540838-5408390.61428570
2chr1:540841-5408420.56923080
3chr1:540850-5408510.58208950
4chr1:540854-5408550.59090910
5chr1:540866-5408670.54687500
6chr1:540868-5408690.57812500
In [21]:
# Assign CpGs to groups by mean methylation level.
# Sort ascending so that cluster_1 holds the least methylated CpGs.
result <- result[order(result$beta, decreasing = FALSE), ]
n_cpg <- nrow(result)
# Clusters are filled in order with floor(n / 10) + 1 CpGs each; the final
# cluster receives whatever is left.
cluster_size <- trunc(n_cpg / 10) + 1
cluster_ids <- paste0("cluster_", rep(1:10, each = cluster_size))
result$Cluster <- cluster_ids[seq_len(n_cpg)]

# Helper column of ones: its per-cluster sum is the number of CpGs.
result$total <- 1
head(result)
table(result$Cluster)

# One pass per cluster: CpG count, number of CpGs flagged as open, and median
# methylation. A single summarise() replaces the deprecated
# summarise_each(funs = ...) calls and the cbind() of two separately
# summarised tables.
res <- result %>%
  group_by(Cluster) %>%
  summarise(total = sum(total), Open = sum(Open), beta = median(beta)) %>%
  as.data.frame()
rownames(res) <- res$Cluster
res$Cluster <- NULL

# Percentage of CpGs in each cluster overlapped by open chromatin.
res$Freq <- 100 * round(res$Open / res$total, 4)
res$cluster <- rownames(res)
head(res)
dim(res)

# Select by name rather than by position so a column reorder cannot silently
# pick the wrong fields.
colon_MHB <- res[, c("beta", "Freq", "cluster")]
head(colon_MHB)
A data.frame: 6 × 5
chrbetaOpenClustertotal
<chr><dbl><int><chr><dbl>
30850chr16:103150-1031510.002824861cluster_11
30851chr16:103156-1031570.002849001cluster_11
30853chr16:103186-1031870.002857141cluster_11
30855chr16:103203-1032040.002890171cluster_11
30852chr16:103174-1031750.002923981cluster_11
30854chr16:103195-1031960.002923981cluster_11
 cluster_1 cluster_10  cluster_2  cluster_3  cluster_4  cluster_5  cluster_6 
     12646      12641      12646      12646      12646      12646      12646 
 cluster_7  cluster_8  cluster_9 
     12646      12646      12646 
A data.frame: 6 × 5
totalOpenbetaFreqcluster
<dbl><int><dbl><dbl><chr>
cluster_112646124280.033538398.28cluster_1
cluster_1012641 97520.729166777.15cluster_10
cluster_212646123510.158503397.67cluster_2
cluster_312646123730.216666797.84cluster_3
cluster_412646122430.276119496.81cluster_4
cluster_512646120820.356236695.54cluster_5
  1. 10
  2. 5
A data.frame: 6 × 3
betaFreqcluster
<dbl><dbl><chr>
cluster_10.033538398.28cluster_1
cluster_100.729166777.15cluster_10
cluster_20.158503397.67cluster_2
cluster_30.216666797.84cluster_3
cluster_40.276119496.81cluster_4
cluster_50.356236695.54cluster_5
In [26]:
# Perform a similar analysis for UMR, LMR, PMD and HMR, then load the merged
# per-cluster table that carries a `type` column.
input <- read.table("colon_merge.txt", header = TRUE)
head(input)
table(input$type)
A data.frame: 6 × 4
betaFreqclustertype
<dbl><dbl><chr><chr>
10.470588225.65cluster_1 HMR
20.9802632 9.99cluster_10HMR
30.736842124.35cluster_2 HMR
40.824817519.88cluster_3 HMR
50.871508417.82cluster_4 HMR
60.901098916.66cluster_5 HMR
HMR LMR MHB PMD UMR 
 10  10  10  10  10 
In [68]:
# Size of the rendered plot (repr options).
options(repr.plot.width = 8, repr.plot.height = 6)

# Smoothed open-chromatin overlap (Freq, %) against cluster methylation (beta),
# one curve per region type.
ggplot(input, aes(x = beta, y = Freq, color = type)) +
  # `linewidth` replaces `size` for line geoms (ggplot2 >= 3.4).
  geom_smooth(se = FALSE, linewidth = 1) +
  scale_y_continuous(expand = c(0, 0)) +
  # Scale limits drop points whose beta falls outside [0, 1] before smoothing.
  scale_x_continuous(limits = c(0, 1)) +
  ggtitle("Colon") +
  # Unnamed colours are matched to the types in sorted order.
  scale_color_manual(
    values = c('#56A902', '#2e409a', '#fcbe32', '#e97f02', '#f15c5c'),
    name = ""
  ) +
  labs(
    x = "Mean methylation",
    y = "The Percent of CpG sites \noverlapped by Open chromatin regions (%)"
  ) +
  theme_classic() +
  theme(
    plot.title = element_text(hjust = 0.5, vjust = 0.5),
    legend.text = element_text(size = 13),
    axis.title = element_text(size = 16),
    axis.text = element_text(size = 13)
  ) +
  coord_cartesian(ylim = c(0, 100)) +
  # Constant reference line: pass xintercept directly rather than through
  # aes(), which would draw one line per data row.
  geom_vline(xintercept = 0.25, colour = "black", linetype = "dashed")
`geom_smooth()` using method = 'loess' and formula = 'y ~ x'

Average profile across all tissue types

In [60]:
total <- read.table("merge.txt", header = TRUE)

# The `tissue` column is parsed as "<tissue>_<type>" followed by an optional
# ".suffix" (presumably a source file name -- confirm against merge.txt).
# vapply() keeps the results character vectors even when the table is empty.
label <- vapply(strsplit(total$tissue, "\\."), function(z) z[1], character(1))
label_parts <- strsplit(label, "\\_")
total$type <- vapply(label_parts, function(z) z[2], character(1))
total$tissue <- vapply(label_parts, function(z) z[1], character(1))

head(total)
table(total$tissue, total$type)
A data.frame: 6 × 5
betaFreqclustertissuetype
<dbl><dbl><chr><chr><chr>
10.33000009.25cluster_1 AdiposeHMR
21.00000002.18cluster_10AdiposeHMR
30.72727279.72cluster_2 AdiposeHMR
40.83333335.54cluster_3 AdiposeHMR
50.88571433.99cluster_4 AdiposeHMR
60.91803283.12cluster_5 AdiposeHMR
           
            HMR LMR MHB PMD UMR
  Adipose    10  10  10  10  10
  Adrenal    10  10  10  10  10
  B-cell     10  10  10  10  10
  Breast     10  10  10  10  10
  colon      10  10  10  10  10
  esophagus  10  10  10  10  10
  heart      10  10  10  10  10
  liver      10  10  10  10  10
  lung       10  10  10  10  10
  ovary      10  10  10  10  10
  pancreas   10  10  10  10  10
  placenta   10  10  10  10  10
  spleen     10  10  10  10  10
  stomach    10  10  10  10  10
  T-cell     10  10  10  10  10
  thymus     10  10  10  10  10
  thyroid    10  10  10  10  10
In [64]:
# Average the first two columns (beta and Freq) over all tissues for every
# cluster / region-type combination. The grouping columns come out as
# Group.1 (cluster) and Group.2 (type).
grouping <- list(total$cluster, total$type)
total_input <- aggregate(total[1:2], by = grouping, FUN = mean)
head(total_input)
dim(total_input)
table(total_input$Group.2)
A data.frame: 6 × 4
Group.1Group.2betaFreq
<chr><chr><dbl><dbl>
1cluster_1 HMR0.464688416.720588
2cluster_10HMR0.9917105 5.696471
3cluster_2 HMR0.755749213.751765
4cluster_3 HMR0.843069510.236471
5cluster_4 HMR0.8873174 8.662941
6cluster_5 HMR0.9144253 7.845294
  1. 50
  2. 4
HMR LMR MHB PMD UMR 
 10  10  10  10  10 
In [67]:
# Size of the rendered plot (repr options).
options(repr.plot.width = 8, repr.plot.height = 6)

# Smoothed open-chromatin overlap (Freq, %) against methylation (beta) averaged
# over tissues, one curve per region type (Group.2).
ggplot(total_input, aes(x = beta, y = Freq, color = Group.2)) +
  # `linewidth` replaces `size` for line geoms (ggplot2 >= 3.4).
  geom_smooth(se = FALSE, linewidth = 1) +
  scale_y_continuous(expand = c(0, 0)) +
  # Scale limits drop points whose beta falls outside [0, 1] before smoothing.
  scale_x_continuous(limits = c(0, 1)) +
  ggtitle("17 tissues") +
  # Unnamed colours are matched to the types in sorted order.
  scale_color_manual(
    values = c('#56A902', '#2e409a', '#fcbe32', '#e97f02', '#f15c5c'),
    name = ""
  ) +
  labs(
    x = "Mean methylation",
    y = "The Percent of CpG sites \noverlapped by Open chromatin regions (%)"
  ) +
  theme_classic() +
  theme(
    plot.title = element_text(hjust = 0.5, vjust = 0.5),
    legend.text = element_text(size = 13),
    axis.title = element_text(size = 16),
    axis.text = element_text(size = 13)
  ) +
  coord_cartesian(ylim = c(0, 100)) +
  # Constant reference line: pass xintercept directly rather than through
  # aes(), which would draw one line per data row.
  geom_vline(xintercept = 0.25, colour = "black", linetype = "dashed")
`geom_smooth()` using method = 'loess' and formula = 'y ~ x'
Warning message:
“Removed 1 rows containing non-finite values (`stat_smooth()`).”
In [ ]: