The Bd21Control genome was divided into 100bp windows and the number of possible CG, CHG, and CHH sites was calculated. These describe the total number of tiles in the genome that could contain information:

# Load the per-window site tables for each sequence context. The files are
# read without a header row, so columns are auto-named V1, V2, ...
all.cg <- read.delim("CGsites_100bp.bed", header = FALSE)
all.chg <- read.delim("CHGsites_100bp.bed", header = FALSE)
all.chh <- read.delim("CHHsites_100bp.bed", header = FALSE)

We can then merge these sites together to create a final all.windows dataframe:

# Outer-join the three per-context site tables on the window key columns
# (V1, V2, V3), so a window that is absent from one context is kept and gets
# NA in that context's column.
t1 <- merge(all.cg, all.chg, by = c("V1", "V2", "V3"), all = TRUE)
all.windows <- merge(t1, all.chh, by = c("V1", "V2", "V3"), all = TRUE)
colnames(all.windows) <- c(
  "V1", "V2", "V3", "CGsites", "CHGsites", "CHHsites"
)

# Number of windows that have an entry (non-NA) in each context's site table.
# Counted directly with sum(): table(is.na(x))[1] only gives this number when
# at least one non-NA value exists, and otherwise silently returns the count
# of NA values (or NA for empty input).
cg.possible <- sum(!is.na(all.windows$CGsites))
chg.possible <- sum(!is.na(all.windows$CHGsites))
chh.possible <- sum(!is.na(all.windows$CHHsites))

For each sequence context, there is a number of possible windows with at least one site. We can now determine how many windows we actually have data for:

# Read the Bd21 per-tile methylation tracks; the first line of each file is
# skipped and the remaining columns are auto-named V1, V2, ...
cg <- read.delim("Bd21_CpG_100bp.wig", header = FALSE, skip = 1)
chg <- read.delim("Bd21_CHG_100bp.wig", header = FALSE, skip = 1)
chh <- read.delim("Bd21_CHH_100bp.wig", header = FALSE, skip = 1)

# Get rid of scaffolds: keep only rows whose V1 starts with "Bd".
cg <- cg[grepl("^Bd", cg$V1), ]
chg <- chg[grepl("^Bd", chg$V1), ]
chh <- chh[grepl("^Bd", chh$V1), ]

# Order each track by its first column, then its second.
cg <- cg[order(cg[, 1], cg[, 2]), ]
chg <- chg[order(chg[, 1], chg[, 2]), ]
chh <- chh[order(chh[, 1], chh[, 2]), ]

# Outer-join the three tracks onto the table of possible windows, keyed on
# V1/V2/V3. Windows without data in a context get NA in that context's columns.
all <- merge(all.windows, cg, by = c("V1", "V2", "V3"), all = TRUE)
all <- merge(all, chg, by = c("V1", "V2", "V3"), all = TRUE)
all <- merge(all, chh, by = c("V1", "V2", "V3"), all = TRUE)

# Name the five per-context columns uniformly as <context>_<field>.
# The CG "unmet" column was previously spelled "_cg_unmet"; it is now
# "cg_unmet", matching chg_unmet and chh_unmet.
colnames(all) <- c(
  "V1", "V2", "V3", "CGsites", "CHGsites", "CHHsites",
  paste0(
    rep(c("cg", "chg", "chh"), each = 5), "_",
    c("prop", "met", "unmet", "total", "site")
  )
)

# Number of windows with data (non-NA total) in each context. sum() is used
# rather than table(is.na(x))[1], which returns the NA count when no window
# has data.
cg.coverage <- sum(!is.na(all$cg_total))
chg.coverage <- sum(!is.na(all$chg_total))
chh.coverage <- sum(!is.na(all$chh_total))

Let's look at the overall distribution of DNA methylation for each class:

# Three side-by-side density curves of the per-window methylation value, one
# per sequence context. Missing values are dropped before estimating the
# density; the CHH panel is restricted to the 0-20 range on the x axis.
par(mfrow = c(1, 3))
plot(
  density(all[["cg_prop"]], na.rm = TRUE),
  main = "CG methylation", xlab = "Methylation %",
  col = "#990000", lwd = 3
)
plot(
  density(all[["chg_prop"]], na.rm = TRUE),
  main = "CHG methylation", xlab = "Methylation %",
  col = "#003399", lwd = 3
)
plot(
  density(all[["chh_prop"]], na.rm = TRUE),
  main = "CHH methylation", xlab = "Methylation %",
  col = "#336600", lwd = 3, xlim = c(0, 20)
)

From this we can see that methylation is largely bimodal with windows being highly methylated, or showing close to zero methylation.

Of the windows that have coverage, we can make a binary call of whether each window is methylated or not. For CG the threshold is 50%, and for CHG it is 30%. CHH is not bimodal, so we simply call windows above 10% methylated:

# Binary methylation call per window and context: 1 when the methylation
# value is above the context's threshold (CG 50, CHG 30, CHH 10), 0 when it
# is not, and NA when the window has no value for that context.
cg.class <- as.numeric(all$cg_prop > 50)
chg.class <- as.numeric(all$chg_prop > 30)
chh.class <- as.numeric(all$chh_prop > 10)

# Append the three calls as columns cg.class, chg.class and chh.class.
all <- cbind(all, cg.class, chg.class, chh.class)

From this binary classification, we can go through and place each window (with coverage across all contexts) within one of eight categories:

# Label every window by the combination of contexts called methylated.
# The three 0/1 calls are packed into an index 1..8
# (1 + 4*cg + 2*chg + chh) that selects the label below. A window with an NA
# call in any context gets an NA index and therefore a real NA label. This is
# the same result the previous eight-deep nested ifelse() produced, whose
# final 'NA' string fallback could never be reached.
window_class <- c(
  "no_met",   # cg 0, chg 0, chh 0
  "chh_only", # cg 0, chg 0, chh 1
  "chg_only", # cg 0, chg 1, chh 0
  "chg_chh",  # cg 0, chg 1, chh 1
  "cg_only",  # cg 1, chg 0, chh 0
  "cg_chh",   # cg 1, chg 0, chh 1
  "cg_chg",   # cg 1, chg 1, chh 0
  "all_met"   # cg 1, chg 1, chh 1
)[1 + 4 * all$cg.class + 2 * all$chg.class + all$chh.class]

all <- cbind(all, window_class)

# exclude = NULL keeps the NA group (windows without a call in every context)
# as its own bar.
barplot(table(all$window_class, exclude = NULL), las = 2)

From these classes, we can see that the majority of tiles display either CG and CHG methylation, CG only, or no methylation at all. Note that there are also many windows that do not have coverage information across all three sequence contexts, and therefore cannot be placed in these categories without ambiguity.

From here, we can bring in annotation information to see if there are specific transposable element classes, or genes, that are enriched for these classes. A bed file was created by taking all Bd21Control windows and identifying the closest gene or transposable element from the published annotations (v2.2):

# Window-to-annotation table, read without a header row (columns V1, V2, ...).
anno <- read.delim("all.windows.to.genes.to.tes.bed", header = FALSE)

From this file, we can classify each genomic tile as either intersecting a gene, a transposon, or falling into an intergenic region:

# Annotation label per window, decided from columns V11 and V19 of `anno`
# (presumably the distances to the closest gene and closest TE; confirm
# against the bed layout):
#   V11 == 0 and V19 == 0 -> "TE-gene boundary"
#   V11 == 0              -> "gene"
#   V19 == 0              -> the value of V16, as text
#   otherwise             -> "intergenic"
anno.class <- ifelse(
  anno$V11 == 0 & anno$V19 == 0,
  "TE-gene boundary",
  ifelse(
    anno$V11 == 0,
    "gene",
    ifelse(anno$V19 == 0, as.character(anno$V16), "intergenic")
  )
)

# Window key columns plus the label. Fix the NA name for transposable element
# fragments: every NA label is stored as "TE_frag".
anno2 <- anno[, 1:3]
anno2$anno.class <- replace(anno.class, is.na(anno.class), "TE_frag")

# Attach the labels to the tile table (only windows present in both tables
# are kept) and save the result as a tab-separated file.
all <- merge(all, anno2, by = c("V1", "V2", "V3"))
write.table(
  all, "Bd21_total_tile_annotation.txt",
  sep = "\t", row.names = FALSE, quote = FALSE
)

With all of this information gathered, we can begin to look at class enrichments across the genome:

# Tile counts per annotation class and per methylation class.
barplot(
  table(all$anno.class), las = 2,
  main = "Number of tiles across annotation classes"
)

barplot(
  table(all$window_class), las = 2,
  main = "Number of tiles across methylation classes"
)

# Share of each annotation class within every methylation class
# (margin = 2: each column sums to 1).
prop.met.class <- prop.table(
  table(all$anno.class, all$window_class), margin = 2
)

# Widen the right margin and allow drawing outside the plot region so the
# legend can sit beside the bars.
par(mar = c(5, 4, 4, 7), xpd = TRUE)
barplot(prop.met.class, col = rainbow(nrow(prop.met.class)), las = 2)
legend(
  "topright", inset = c(-.15, 0),
  fill = rainbow(nrow(prop.met.class)), legend = rownames(prop.met.class)
)

# Share of each methylation class within every annotation class.
prop.anno.class <- prop.table(
  table(all$window_class, all$anno.class), margin = 2
)

# Show the non-TE classes first, followed by the remaining classes in their
# existing order. Columns are selected by name rather than by position, so
# the order no longer depends on the locale's sort order or on exactly 14
# annotation classes being present.
lead.classes <- c("gene", "intergenic", "TE-gene boundary", "TE_frag")
prop.anno.class <- prop.anno.class[
  ,
  c(
    intersect(lead.classes, colnames(prop.anno.class)),
    setdiff(colnames(prop.anno.class), lead.classes)
  ),
  drop = FALSE
]

# One colour per methylation class (the rows of prop.anno.class).
met.class.cols <- c(
  "black", "#8B008B", "#CD6600", "#990000",
  "#008B8B", "#003399", "#336600", "grey"
)

pdf("fig1c.pdf", width = 10, height = 5)
par(mar = c(5, 4, 4, 7), xpd = TRUE)
barplot(prop.anno.class, col = met.class.cols, las = 2, horiz = TRUE)
legend(
  "topright", inset = c(-.16, 0),
  fill = met.class.cols, legend = rownames(prop.anno.class)
)
dev.off()
## quartz_off_screen 
##                 2
barplot(prop.anno.class, col = met.class.cols, las = 2)
legend(
  "topright", inset = c(-.16, 0),
  fill = met.class.cols, legend = rownames(prop.anno.class)
)

The plots shown above highlight that genes and intergenic regions show a very different proportion of each of the eight methylation classes compared to the transposable element classes. One TE class with a slightly different methylation profile is the DTT or Mariner elements that show a much higher proportion of classes containing CHH methylation compared to the other TE classes.

# Summary counts used in the text: CHH calls overall, methylation class by
# annotation class, and the share of tiles in each annotation class.
table(all$chh.class)
## 
##       0       1 
## 2700685  126255
# 126255 tiles in total are called CHH-methylated (chh.class == 1)
table(all$window_class,all$anno.class)
##           
##               DHH    DTA    DTC    DTH    DTM    DTT   gene intergenic
##   all_met      75    227   1814   2034   1118   5599   2769      24573
##   cg_chg     2500   2706  45931   6934  12461   4276  30873     228882
##   cg_chh       17     33     81    255    136   2837   2172       4866
##   cg_only     413    517   2866    988    942   3052 272218      89536
##   chg_chh       3      6     38    197     33    529    278       1788
##   chg_only     17     26    158     93     68    195   1534       5470
##   chh_only      7     10     36    235     38   1427   2196       5459
##   no_met      334    433    967   1612    261   1945 400555     477616
##           
##               RIX    RLC    RLG    RLX TE_frag TE-gene boundary
##   all_met     652   5641   8415   5529    2433             2863
##   cg_chg    30899  89329 313623  54856   64254            28624
##   cg_chh       83    748   1105    954     436             1630
##   cg_only    3321   9083  15026   4870    4587            15870
##   chg_chh      11    189    206    291      98              256
##   chg_only    127    600    724    437     238              221
##   chh_only     42    311    388    580     219              631
##   no_met     4385   5760   6045   3692    2705             6633
# DTT: 5599 + 2837 + 529 + 1427 = 10392 tiles fall in classes that include CHH
# (all_met, cg_chh, chg_chh, chh_only)
prop.table(table(all$anno.class))
## 
##              DHH              DTA              DTC              DTH 
##      0.001446396      0.001778783      0.020654549      0.005535056 
##              DTM              DTT             gene       intergenic 
##      0.005891869      0.010520204      0.317583614      0.355064812 
##              RIX              RLC              RLG              RLX 
##      0.015968120      0.049858744      0.130442016      0.029561008 
##          TE_frag TE-gene boundary 
##      0.029939606      0.025755224
# DTT makes up ~1% of all annotated tiles (0.0105)

Let's make a summary plot for a figure:

# Summary matrix: one column per sequence context, one row per tile count.
plot.table <- matrix(NA, nrow = 4, ncol = 3)
colnames(plot.table) <- c("CG", "CHG", "CHH")
rownames(plot.table) <- c(
  "Total tiles with sites", "Total tiles with coverage >0",
  "Tiles unmethylated", "Tiles methylated"
)
plot.table[1, ] <- c(cg.possible, chg.possible, chh.possible)
plot.table[2, ] <- c(cg.coverage, chg.coverage, chh.coverage)

# Counts of 0 (unmethylated) and 1 (methylated) calls, in that order. The
# levels are fixed so a context with no windows in one class contributes an
# explicit 0; a bare table() would return a single count there, which the
# assignment would silently recycle into both rows.
plot.table[3:4, 1] <- table(factor(cg.class, levels = c(0, 1)))
plot.table[3:4, 2] <- table(factor(chg.class, levels = c(0, 1)))
plot.table[3:4, 3] <- table(factor(chh.class, levels = c(0, 1)))

barplot(plot.table[1:2, ], beside = TRUE, horiz = TRUE, las = 1)

# NOTE(review): six colours are supplied, but the bars are stacked with only
# two segments each. Check the rendered figure to confirm that the intended
# per-context colours are actually applied.
barplot(
  plot.table[3:4, ], beside = FALSE, horiz = TRUE, las = 1,
  col = c("#336600", "grey", "#003399", "grey", "#990000", "grey")
)

# Same two plots written to a PDF, one per page.
# NOTE(review): the file name is spelled "sumary"; kept unchanged so anything
# that already reads this file keeps working.
pdf("figure_tile_sumary.pdf", width = 10, height = 6)
barplot(plot.table[1:2, ], beside = TRUE, horiz = TRUE, las = 1)
barplot(
  plot.table[3:4, ], beside = FALSE, horiz = TRUE, las = 1,
  col = c("#336600", "grey", "#003399", "grey", "#990000", "grey")
)
dev.off()
## quartz_off_screen 
##                 2

We can also look at the overall distribution of the methylation tile classes across chromosome one (unfinished)

# Rows of `all` whose methylation class is "all_met" and whose V1 is "Bd1".
# which() drops rows where the comparison is NA, as subset() does.
asub <- all[
  which(all$window_class == "all_met" & all$V1 == "Bd1"), ,
  drop = FALSE
]
library(fields)
## Warning: package 'fields' was built under R version 3.1.3
## Loading required package: spam
## Warning: package 'spam' was built under R version 3.1.3
## Loading required package: grid
## Spam version 1.3-0 (2015-10-24) is loaded.
## Type 'help( Spam)' or 'demo( spam)' for a short introduction 
## and overview of this package.
## Help for individual functions is also obtained by adding the
## suffix '.spam' to the function name, e.g. 'help( chol.spam)'.
## 
## Attaching package: 'spam'
## 
## The following objects are masked from 'package:base':
## 
##     backsolve, forwardsolve
## 
## Loading required package: maps
## Warning: package 'maps' was built under R version 3.1.3
## 
##  # ATTENTION: maps v3.0 has an updated 'world' map.        #
##  # Many country borders and names have changed since 1990. #
##  # Type '?world' or 'news(package="maps")'. See README_v3. #
# Bin the rows of `asub` along V2 (presumably the window start coordinate;
# confirm) with fields::stats.bin, requesting N = 1000 bins.
# NOTE(review): the y argument is window_class, a label rather than a number.
# Only the "N" row of the bin statistics is used below; confirm that
# stats.bin returns the per-bin count for non-numeric y.
abin=stats.bin(asub$V2,asub$window_class,N=1000)
# Two-column matrix: bin centers, and the "N" row of the per-bin statistics.
data=cbind(matrix(abin$centers,ncol=1),abin$stats["N",])
# Plot the second column against the first.
plot(data)