Codes in paper

Volcano map of methylation affecting prostate cancer-specific gene expression

Introduction

The expression of some prostate cancer-specific genes has nothing to do with mutations, so the author wanted to see the effect of methylation on the expression of these genes.

Code explanation

Read in the correlation data of expression and mutation, methylation, prostate cancer-specific gene data, and set the fdr cutoff to 0.05

#source('/Volumes/datasets_1/human_sequence_prostate_WGBS/reproduce/scripts/2019_05_15_prepare_environment.r')

# Plots figure 3A

fdrcut = 0.05  

#This is the p-value which corresponds to an FDR of 0.05 looking only at
#Genebody/promoter eHMRs (expression-associated  hypomethylated  regions’)
pcutpromgene = 0.01238332

#Load in file with expr correlated to DNA alterations, methylation generated by summary_expr.txt

sumexpr = read.table(fn_hmr_summary_expr, sep='\t', stringsAsFactors=FALSE,
                      header=TRUE, row.names=1)
rownames(sumexpr) = str_split_fixed(rownames(sumexpr),'\\.',2)[,1]
totalgenes = dim(sumexpr)[1]

#Load in PCa vs other comparisons from TCGA pancan

pcaspec = read.table(fn_pca_specific,sep='\t',stringsAsFactors=F,header=T,row.names=1)
rownames(pcaspec) = str_split_fixed(rownames(pcaspec),'\\.',2)[,1]

Get prostate cancer-specific genes based on calculated fdr

#Figure out which genes are actually PCa specific
pcaspec$PRAD_p = NULL
pcaspec$FPPP_p = NULL
pcaspec$count = 0
realn = dim(pcaspec)[1]#*33

for(i in 11:42) { # The p-value here should be the p-value for the significant difference in prostate cancer, which is not mentioned in the article
    fdrcol = p.adjust(pcaspec[,i],method='fdr',n=realn)
    pcaspec$count = pcaspec$count+as.numeric(fdrcol<=fdrcut)
}
pvb_fdr_new = p.adjust(pcaspec$pvb,method='fdr',n=realn)
rowsspec = pcaspec$count>=32 & pvb_fdr_new<=fdrcut & pcaspec$fc_a>=2
print(sum(rowsspec))

siggenes = pcaspec[rowsspec,'gene']
print(length(siggenes))
commongenes = intersect(rownames(pcaspec)[rowsspec],rownames(sumexpr))
sumexpr$PCa_specific = FALSE
sumexpr[commongenes,'PCa_specific'] = TRUE

#Calculate FDRs for DNA alterations, mutations
sumexpr$svfdr = p.adjust(sumexpr$svp,method='fdr')
sumexpr$cnfdr = p.adjust(sumexpr$cnp,method='fdr')
sumexpr$mutfdr = p.adjust(sumexpr$mutp,method='fdr')
sumexpr$anovafdr = p.adjust(sumexpr$anovap,method='fdr')
sumexpr$sd = apply(log2(tpm[rownames(sumexpr),sample_ids_wgbs]+1),1,sd)
sumexpr$meanexpr = rowMeans(tpm[rownames(sumexpr),sample_ids_wgbs],na.rm=T)
sumexpr$genewidth = sumexpr$end-sumexpr$start

#write.table(sumexpr,'C:/Users/Admin/Desktop/volcano.tsv',col.names=NA,row.names=T,quote=F,sep='\t')

#Used to be use to determine what in the top-left corner of volcano to show
#I kept this just to print out those genes
corcut = quantile(abs(sumexpr$methylcor),0.995,na.rm=T)
sdcut = quantile(abs(sumexpr$sd),0.995,na.rm=T)
print(corcut)
print(sdcut)

rowsmethylsig = sumexpr$methylp<=pcutpromgene
rowsdnasig = sumexpr$svfdr<=fdrcut | sumexpr$cnfdr<=fdrcut | sumexpr$mutfdr<=fdrcut
rowsanovasig = sumexpr$anovafdr<=fdrcut
rowsmethylsig[is.na(rowsmethylsig)] = F
rowsdnasig[is.na(rowsdnasig)] = F
rowsanovasig[is.na(rowsanovasig)] = F
names(rowsanovasig) = rownames(sumexpr)

print(sum(sumexpr$PCa_specific & rowsdnasig))
print(sum(sumexpr$PCa_specific & rowsmethylsig))
print(sum(sumexpr$PCa_specific))
stopifnot(FALSE)

rowshighlight = rowsanovasig & rowsmethylsig &
    abs(sumexpr$methylcor)>=corcut & sumexpr$sd>=sdcut
rowshighlight[is.na(rowshighlight)] = F

Begin to plot figure

#Plot the volcano plot
sumexpr$group = ''
sumexpr[sumexpr$PCa_specific,'group'] = 'PCa specific'
genes2keep = c('SLC45A3','SCHLAP1','SPON2','PCAT14','TMEFF2','TDRD1')
sumexpr[sumexpr$name%in%genes2keep,'group'] = sumexpr[sumexpr$name%in%genes2keep,'name']
print(sumexpr[rowshighlight,c('name','methylcor','sd','meanexpr','PCa_specific')])

sumexpr = sumexpr[order(sumexpr$group),]

lmcor = lm(methylcor~meanexpr+sd+genewidth+PCa_specific,sumexpr)
print(summary(lmcor))

print('Plotting')
plist = list()
colors = c('darkgray','black','green','blue','purple','yellow','orange','red')
plist[[2]] = ggplot(sumexpr,aes(x=methylcor,y=sd,color=group,shape=PCa_specific,alpha=group)) + 
    geom_point(size=2)+theme_classic()+xlab('Spearman\'s rho')+ylab('Log2(TPM+1) SD')+
    scale_alpha_manual(guide='none',values=list(0.2,1,1,1,1,1,1,1))+
    scale_colour_manual(values=colors)+
    #geom_segment(x=(-corcut+0.01),y=sdcut,xend=(-corcut+0.01)-0.2,yend=sdcut,color='black')+
    #geom_segment(x=(-corcut+0.01),y=sdcut,xend=(-corcut+0.01),yend=sdcut+1.5,color='black')+
    geom_vline(xintercept=0,color='black')+
    scale_y_continuous(limits=c(0,5))
#geom_text(data=sumexpr[rowshighlight,],aes(methylcor,sd,label=name),nudge_y=0.1)

test = wilcox.test(sumexpr$sd~sumexpr$PCa_specific)
plist[[1]] = ggplot(sumexpr,aes(x=PCa_specific,y=sd))+
    xlab('')+ylab('')+ggtitle(paste('Wilcox p:',signif(test$p.value,4)))+
    geom_boxplot()+theme_classic()
plist[[3]] = ggplot() + theme_void()
test = wilcox.test(sumexpr$methylcor~sumexpr$PCa_specific)
plist[[4]] = ggplot(sumexpr,aes(x=PCa_specific,y=methylcor))+
    xlab('')+ylab('')+ggtitle(paste('Wilcox p:',signif(test$p.value,4)))+
    geom_boxplot()+theme_classic()+coord_flip()

ggarrange(plotlist=plist,widths=c(1,4),heights=c(4,1))
ggsave(fn_figure3a,width=13,height=10,useDingbats=F)

  Variability in gene expression levels versus the correlation between gene expression and methylation. Expression variability was calculated as the s.d. of log2(TPM + 1), and the correlation was calculated at the most significant promoter or gene-body eHMR for each gene. The y-axis box plot shows gene expression variability for prostate-cancer-specific genes vs. all other genes. The x-axis box plot shows the correlation of methylation with gene expression of prostate-cancer-specific genes vs. all other genes. Significance was assessed with a two-sided Wilcoxon signed-rank test; n= 169 vs. 51502 for prostate-cancer-specific genes vs. all other genes, respectively. Box plots show the median, first and third quartiles, and outliers are shown if outside the 1.5× interquartile range.

Interpretaion of results

火山图的横坐标是基因和甲基化的相关性,纵坐标是基因的表达变化(在样本间的标准差)。可以看到PCa(前列腺癌)特异性的基因许多都在相关性高并且sd高的范围,说明甲基化对这些前列腺癌特异性基因的调节。两边的boxplot也说明了这些前列腺癌特异性的基因跟其他基因甲基化调节情况的差异。

Reference

Zhao S G , Chen W S , Li H , et al. The DNA methylation landscape of advanced prostate cancer[J]. Nature Genetics, 2020.

Original code

https://github.com/DavidQuigley/WCDT_WGBS/blob/master/scripts/2019_05_15_WGBS_figure_3A.R

Leave a Reply

Your email address will not be published. Required fields are marked *