Codes in paper

Bar graph of the relationship between the amount of DNA methylation valleys and H3K4me3 and H3K27me3

Introduction

DNA methylation valleys (DNA methylation valleys, DMV) are a wide range of hypomethylated regions, which are related to the activation of histone marker H3K4me3 and the inhibition of histone marker H3K27me3. In metastatic castration-resistant prostate cancer (mCRPC, metastatic castration-resistant prostate cancer), as shown in the figure, the greater the number of DMVs in the sample, the higher the signal ratio of H3K4me3 to H3K27me3.

Sample-level log2(OR) calculated from the number of DMVs that overlap H3K4me3 vs. H3K27me3 sites. Lower values favor H3K27me3 and higher values favor H3K4me3. Lower: Sample-level count of DMVs, order matching upper panel.

Code explanation

Read data
#Loop through HMRs per sample and assign them to groups
output <- data.frame()
samples2use <- sample_ids_wgbs
valleydf <- read.delim(file=fn_valley,sep='\t',header=F,stringsAsFactors=F)
colnames(valleydf) <- c('chr','start','end','sample')
valley <- makeGRangesFromDataFrame(valleydf,keep.extra.columns=T)
Calculate the number of H3K4me3 and H3K27me3 peaks in the methylation valley, and do fisher test to calculate the OR of their relationship with the methylation valley
for(i in 1:length(samples2use)) {
    sample <- samples2use[i]
    valleyi <- valley[valley$sample==sample]

    valleywidth <- sum(width(valleyi))
    output[sample,'Sample'] <- sample  #Sample name 
    output[sample,'width'] <- valleywidth  # Total width of methylated valley
    output[sample,'count'] <- length(valleyi)  # Number of methylated valleys

    print(paste(i, output[sample,'count']))

    in_h3k4me3 <- countOverlaps(tracks[['pca100_H3K4me3']],valleyi,ignore.strand=T)>0
    in_h3k27me3 <- countOverlaps(tracks[['pca100_H3K27me3']],valleyi,ignore.strand=T)>0
    in_cpg_island <- countOverlaps(tracks[['cpg_island']],valleyi,ignore.strand=T)>0

    count_h3k4me3_valley <- sum(in_h3k4me3)
    count_h3k27me3_valley <- sum(in_h3k27me3)
    count_h3k4me3_novalley <- sum(!in_h3k4me3)
    count_h3k27me3_novalley <- sum(!in_h3k27me3)

    tab <- rbind(c(count_h3k4me3_valley,count_h3k4me3_novalley),
                 c(count_h3k27me3_valley,count_h3k27me3_novalley)) # Generate quadruple table
    ft <- fisher.test(tab)  # Fisher test, whether the signal values of h3k4me3 and h3k27me3 are related to the methylation valley

    output[sample,'OR'] <- log2(ft$estimate)
    output[sample,'p'] <- ft$p.value
}

output <- output[rownames(output) %in% sample_ids_wgbs,]
output$fdr <- p.adjust(output$p,method='fdr')
output$logfdr <- -log10(output$fdr)

lvl <- output[order(output$count),'Sample']
#lvl <- output[order(output$width),'Sample']
output$Sample <- factor(output$Sample,levels=lvl)
Drawing according to the output matrix, ggarrange layout drawing
#Set up data frames for plotting
df1 <- cbind.data.frame(sample=output$Sample,count=output$OR,SS=output$fdr<=0.05)
df3 <- cbind.data.frame(sample=output$Sample,count=output$width/HG38_SIZE*100)
df4 <- cbind.data.frame(sample=output$Sample,count=output$count)

cols <- brewer.pal(n=4, name='Dark2')
plist <- list()
plist[[1]] <- ggplot(df1,aes(x=sample,y=count))+theme_classic()+geom_bar(stat='identity')+
    theme(axis.text.x=element_text(angle=90,hjust=1,vjust=0.5))+ylab('Log2 Odds Ratio')+
    scale_fill_manual(values=c('darkgray',cols[1]))
plist[[3]] <- ggplot(df3,aes(x=sample,y=count))+theme_classic()+geom_bar(stat='identity',fill=cols[2])+
    theme(axis.title.x=element_blank(),axis.text.x=element_blank(),axis.ticks.x=element_blank())+
    ylab('DMV %Genome')  # This picture is not shown in the article in the end
plist[[2]] <- ggplot(df4,aes(x=sample,y=count))+theme_classic()+geom_bar(stat='identity',fill=cols[3])+
    theme(axis.title.x=element_blank(),axis.text.x=element_blank(),axis.ticks.x=element_blank())+
    ylab('DMV Count')

ggarrange(plotlist=plist,ncol=1,nrow=length(plist),heights=c(2,1,1))
ggsave(fn_figure2a,width=15,height=12)

Reference

Zhao S G , Chen W S , Li H , et al. The DNA methylation landscape of advanced prostate cancer[J]. Nature Genetics, 2020.

Original code

https://github.com/DavidQuigley/WCDT_WGBS/blob/master/scripts/2019_05_15_WGBS_figure_2A.R

Leave a Reply

Your email address will not be published. Required fields are marked *