Genome segmentation by methylation patterns

MHB discovery

In [ ]:
# Discover methylation haplotype blocks (MHBs) with mHapSuite's MHBDiscovery.
# The arguments are passed as a vector through system2() rather than as one
# multi-line string: inside an R string literal a backslash followed by a
# newline becomes a plain newline, so the shell would see each line as a
# separate command instead of a continuation.
mhb_args <- c(
  "-jar", "mHapSuite-2.0-jar-with-dependencies.jar", "MHBDiscovery",
  "-mhapPath", "colon.mhap.gz",
  "-cpgPath", "hg19_CpG.gz",
  "-window", "5",
  "-r2", "0.5",
  "-pvalue", "0.05",
  "-outputDir", "outputDir",
  "-tag", "colon_MHB"
)
mhb_status <- system2("java", args = mhb_args)
if (mhb_status != 0) {
  warning("MHBDiscovery exited with status ", mhb_status, call. = FALSE)
}

Identification of UMR, LMR and PMD

In [ ]:
#!/bin/Rscript
# Find genomic features (PMD, UMR, LMR) in DNA methylation data with MethylSeekR.
library(MethylSeekR)
library("BSgenome.Hsapiens.UCSC.hg19")
# Sequence lengths of the loaded Hsapiens (hg19) genome object; passed as
# `seqLengths` to the segmentation calls further down.
sLengths <- seqlengths(Hsapiens)
library(regioneR)
library(parallel)
library(rtracklayer)
####
set.seed(123)

# Create each output directory on its own. Previously "res" was only created
# when "train" was missing, so an existing "train" without "res" made every
# later write into "res/" fail.
for (out_dir in c("train", "res")) {
  if (!dir.exists(out_dir)) {
    dir.create(out_dir)
  }
}

######
## Select the input files whose results are not in "res" yet
data_path <- "merged_CpG/"
count_suffix <- "_Merged_Count_CpG.bedGraph"

samples <- list.files(data_path, pattern = "Count_CpG.bedGraph")
# A sample counts as finished once its "<tag>_PMDs.bed" exists in "res".
done <- gsub("_PMDs.bed", "", list.files("res", pattern = "_PMDs.bed"))
undone <- setdiff(gsub(count_suffix, "", samples), done)
# paste0() recycles a zero-length argument to "", so without this guard an
# empty `undone` would yield the bogus file name "_Merged_Count_CpG.bedGraph".
samples <- if (length(undone) > 0) {
  paste0(undone, count_suffix)
} else {
  character(0)
}

	#### hg19 CpG islands, fetched from the UCSC table "cpgIslandExt"
	ucsc <- browserSession()
	genome(ucsc) <- "hg19"
	cgi_query <- ucscTableQuery(ucsc, table = "cpgIslandExt")
	CpGislands.gr <- track(cgi_query)
	genome(CpGislands.gr) <- NA

	### Resize every island to a fixed 5000 bp window around its centre
	CpGislands.gr <- suppressWarnings(
		resize(CpGislands.gr, width = 5000, fix = "center")
	)


	# For every pending sample: load its CpG counts, segment PMDs, derive the
	# nCpG cut-off from the FDR table, segment UMRs/LMRs, and write the results
	# to "res/" (diagnostic PDFs go to "train/").
	for (i in samples) {
		tag <- gsub("_Merged_Count_CpG.bedGraph", "", i)

		### Load GR
		# The two metadata columns are named M and Um (presumably methylated and
		# unmethylated read counts -- confirm against the bedGraph layout);
		# T is their sum.
		x <- toGRanges(paste0(data_path, i))
		names(mcols(x)) <- c("M", "Um")
		mcols(x)[, "T"] <- mcols(x)[, 1] + mcols(x)[, 2]
		# Reduce every interval to its end coordinate and keep only T and M,
		# in that order.
		ranges(x) <- end(x)
		mcols(x) <- mcols(x)[, c("T", "M")]

		### PMD (model trained on chr22)
		PMDsegments <- segmentPMDs(
			m = x, chr.sel = "chr22", seqLengths = sLengths,
			pdfFilename = paste0("train/", tag, "_PMD.pdf"), num.cores = 10
		)

		### FDR cut-off
		fdr_stats <- suppressWarnings(calculateFDRs(
			m = x, CGIs = CpGislands.gr, PMDs = PMDsegments, num.cores = 10,
			pdfFilename = paste0("train/", tag, "_FDR.pdf")
		))

		FDR.cutoff <- 5
		m.sel <- 0.5
		# First nCpG value (column name) whose FDR at methylation cut-off m.sel
		# is below FDR.cutoff.
		fdr_row <- fdr_stats$FDRs[as.character(m.sel), ]
		n.sel <- as.integer(names(fdr_row[fdr_row < FDR.cutoff])[1])
		if (is.na(n.sel)) {
			# No column satisfied the threshold; passing NA on as nCpG.cutoff
			# would be meaningless. Nothing is written, so the sample stays
			# pending for a later run.
			warning(
				"No nCpG cut-off reaches FDR < ", FDR.cutoff, " at m = ", m.sel,
				" for sample ", tag, "; sample skipped.",
				call. = FALSE
			)
			next
		}

		### UMR LMR
		UMRLMRsegments <- segmentUMRsLMRs(
			m = x, meth.cutoff = m.sel, nCpG.cutoff = n.sel,
			PMDs = PMDsegments, num.cores = 10, myGenomeSeq = Hsapiens,
			minCover = 5, seqLengths = sLengths,
			pdfFilename = paste0("train/", tag, "_UMR_LMR.pdf")
		)

		# save PMD LMR UMR
		### PMD: first three data-frame columns of the segments typed "PMD"
		write.table(
			as.data.frame(PMDsegments[PMDsegments$type == "PMD"])[c(1:3)],
			file = paste0("res/", tag, "_PMDs.bed"),
			sep = "\t", quote = FALSE, col.names = FALSE, row.names = FALSE
		)
		save(PMDsegments, file = paste0("res/", tag, "_PMDs.RData"))

		### LMR & UMR
		write.table(
			granges(UMRLMRsegments[UMRLMRsegments$type == "LMR", ]),
			file = paste0("res/", tag, "_LMRs.bed"),
			sep = "\t", quote = FALSE, col.names = FALSE, row.names = FALSE
		)
		write.table(
			granges(UMRLMRsegments[UMRLMRsegments$type == "UMR", ]),
			file = paste0("res/", tag, "_UMRs.bed"),
			sep = "\t", quote = FALSE, col.names = FALSE, row.names = FALSE
		)
		save(UMRLMRsegments, file = paste0("res/", tag, "_UMRsLMRs.RData"))
	}

Identification of HMR

In [ ]:
### Find HMR
# Shell pipeline, run through system(), that
#   1. derives hyper-methylated regions (HMRs) per sample by subtracting the
#      PMD, LMR, UMR and N-gap intervals from the genome intervals,
#   2. tags the MHB intervals, and
#   3. concatenates the tagged MHB/PMD/LMR/UMR/HMR intervals per sample into
#      "<sample>_MHB_genomic_segments.bed".
# Double quotes and the "\t" escape inside the script are escaped for R;
# unescaped, the first inner double quote would terminate the R string.
hmr_script <- "
    # Find the hyper-methylated regions

    ### N_gaps
    N_gaps=\"hg19_N_gaps.bed\"

    ### genome bed
    hg19_chrom=\"hg19_chrom.bed\"

    ### MHB path
    mhb_path=\"MHB_RES\"

    ### Change to the result directory. All relative paths above are resolved
    ### from here. Abort if it is missing so that the loops (and the rm at the
    ### end) never run in another directory.
    cd res/ || exit 1

    ### HMR = genome minus PMD/LMR/UMR/N-gaps, intervals longer than 10 bp, no chrM
    for i in *LMRs.bed
    do
        [ -e \"${i}\" ] || continue
        bedtools subtract -a ${hg19_chrom} -b ${i%_*}_PMDs.bed ${i%_*}_LMRs.bed ${i%_*}_UMRs.bed ${N_gaps} |
        awk '{OFS=\"\\t\"} $3-$2>10 {print $0}' | grep -v chrM >${i%_*}_HMRs.bed
    done

    #### mhb: append an MHB tag column to every MHB interval
    for i in `ls ${mhb_path}`
    do
        awk '{OFS=\"\\t\"}{print $0,\"MHB\"}' ${mhb_path}/${i}/${i}_MHB.bed >${i}_T_tag_MHBs.bed
    done


    ####### Genomic Segments
    for i in *_T_LMRs.bed
    do
        [ -e \"${i}\" ] || continue
        awk '{OFS=\"\\t\"} {print $1,$2,$3,\"PMD\"}' ${i%_*}_PMDs.bed > ${i%_*}_tag_PMDs.bed
        awk '{OFS=\"\\t\"} {print $1,$2,$3,\"LMR\"}' ${i%_*}_LMRs.bed > ${i%_*}_tag_LMRs.bed
        awk '{OFS=\"\\t\"} {print $1,$2,$3,\"UMR\"}' ${i%_*}_UMRs.bed > ${i%_*}_tag_UMRs.bed
        awk '{OFS=\"\\t\"} {print $1,$2,$3,\"HMR\"}' ${i%_*}_HMRs.bed > ${i%_*}_tag_HMRs.bed
        ## MHB
        cat ${i%_*}_tag_MHBs.bed ${i%_*}_tag_PMDs.bed ${i%_*}_tag_LMRs.bed  ${i%_*}_tag_UMRs.bed ${i%_*}_tag_HMRs.bed >${i%_*}_MHB_genomic_segments.bed

        ## remove the intermediate tagged files
        /usr/bin/rm -rf ${i%_*}_tag_DRs.bed ${i%_*}_tag_MHBs.bed ${i%_*}_tag_PMDs.bed ${i%_*}_tag_LMRs.bed  ${i%_*}_tag_UMRs.bed ${i%_*}_tag_HMRs.bed

    done
"
hmr_status <- system(hmr_script)
if (hmr_status != 0) {
  warning("HMR shell pipeline exited with status ", hmr_status, call. = FALSE)
}
In [ ]: