Codes in paper

Differentially expressed genes and GSVA pathway enrichment between two cluster of cells

Introduction

Whether it is single-cell RNA sequencing or bulk RNA sequencing, we need to perform differential gene expression analysis and GSVA pathway enrichment analysis. The following code is from a single-cell article

Code explanation

Load the required library and function. parse_h5ad is used to read anndata from the workflow of Scanpy.As we can see, expression data and meta data are returned.

## load data
.libPaths("/data2/csj/tools/Rlib")
library(Seurat)
library(dplyr)
library(monocle)
options(stringsAsFactors=FALSE)
library(reticulate)

parse_h5ad <- function(adata){
    require(reticulate)
    ad <- import("anndata", convert = FALSE)
    ada <- ad$read_h5ad(adata)
    meta <- py_to_r(ada$obs)
    if(class(ada$raw$X)[1] == "scipy.sparse.csr.csr_matrix"){
        exp <- t(py_to_r(ada$raw$X$toarray()))
    }
    else{
        exp <- t(py_to_r(ada$raw$X))
    }
    rownames(exp) <- rownames(py_to_r(ada$raw$var))
    colnames(exp) <- rownames(meta)
    return(
        list(
        metadata = meta,
        expression = exp
        )
    )
}

Authors calculated log fold change, p value and adjusted p value. "pseudocount.use" was used to avoid zero log. Wilcox.test was used to make test and then p value was adjusted by BH method.

############## different expression
DEGene <- function(h5ad,Cluster1,Cluster2){
    ## h5ad: the result of function parse_h5ad
    ## Cluster1 : a vector of Cluster
    ## Cluster2 : a vector of Cluster

    pseudocount.use =1

    cell_name_1 <- rownames(h5ad$metadata[h5ad$metadata$MajorCluster %in% Cluster1,])
    cell_name_2 <- rownames(h5ad$metadata[h5ad$metadata$MajorCluster %in% Cluster2,])

    Expression_1 <- h5ad$expression[,cell_name_1]
    Expression_2 <- h5ad$expression[,cell_name_2]

    ## log FC
    mean_c1 <- as.data.frame(rowMeans(as.matrix(Expression_1)))
    colnames(mean_c1) <- "mean_c1"
    mean_c2 <- as.data.frame(rowMeans(as.matrix(Expression_2)))
    colnames(mean_c2) <- "mean_c2"
    log2fc <- data.frame(log2fc = log2(mean_c1$mean_c1 + pseudocount.use) - log2(mean_c2$mean_c2 + pseudocount.use))
    rownames(log2fc) <- rownames(mean_c1)
    log2fc$gene <- rownames(log2fc)

    ## wilcox test
    group.info <- data.frame(row.names = c(cell_name_1, cell_name_2))
    group.info[cell_name_1, "group"] <- "Group1"
    group.info[cell_name_2, "group"] <- "Group2"
    group.info[, "group"] <- factor(x = group.info[, "group"])
    data.use <- h5ad$expression[, rownames(x = group.info), drop = FALSE]

    p_val <- sapply(
        X = 1:nrow(x = data.use),
        FUN = function(x) {
        return(wilcox.test(data.use[x, ] ~ group.info[, "group"])$p.value)
    })

    ## BH correction
    adj_p_val <- p.adjust(p_val, method="BH")

    ## DE table
    result <- data.frame(gene=log2fc$gene, log2FC=log2fc$log2fc, Pvalue=p_val, Adj_pval=adj_p_val)
    return(result)
}

The clusterProfiler package was used to run GSVA analysis.

##################################################### GSVA
runGSVA <- function(h5ad,Cluster1,Cluster2,kcdf="Gaussian",AdjPvalueCutoff=0.05){
    ## h5ad: the result of function parse_h5ad
    ## Cluster1 : Cluster1
    ## Cluster2 : Cluster2
    ## kcdf: kcdf="Gaussian" for continuous and 'Poisson for integer counts'

    require(GSVA)
    require(GSEABase)
    require(GSVAdata)
    require(clusterProfiler)
    data(c2BroadSets)
    library(limma)

    expression <- h5ad$expression[,rownames(h5ad$metadata[h5ad$metadata$MajorCluster %in% c(Cluster1,Cluster2),])]

    ## change gene symbol to geneid
    gene_entrezid <- bitr(rownames(expression), fromType = "SYMBOL", toType = "ENTREZID", OrgDb = "org.Hs.eg.db")
    expression_filt <- expression[gene_entrezid$SYMBOL,]
    rownames(expression_filt) <- gene_entrezid$ENTREZID
    expression_filt <- as.matrix(expression_filt)

    res_gsva <- gsva(expression_filt, c2BroadSets, parallel.sz=10,kcdf=kcdf) 

    annotation_col = data.frame(CellType = factor(h5ad$metadata[colnames(res_gsva),]$MajorCluster))

    rownames(annotation_col) = colnames(res_gsva)

    ## using limma to conduct DE analysis
    f <- factor(annotation_col$CellType)
    design <- model.matrix(~0+f)
    colnames(design) <- c("C1","C2")
    rownames(design) <- colnames(res_gsva)

    fit <- lmFit(res_gsva, design)
    cont.matrix=makeContrasts('C1-C2',levels = design)
    fit2=contrasts.fit(fit,cont.matrix)
    fit2=eBayes(fit2)
    gs <- topTable(fit2,adjust='BH', number=Inf, p.value=AdjPvalueCutoff)
    gs$cluster <- ifelse(gs$t > 0 , "C1", "C2")
    return(gs)
}


Bar plot showing different pathways enriched in C1QC+ macrophage and SPP1+ macrophage in lung cancer scored per cell by gene set variation analysis
(GSVA). t values are calculated with limma regression.

Interpretation of results

Although there is not plot function of codes, it is easy to run it in bar plot function.We can plot figure such as follows:

library(data.table)
library(ggplot2)

GSVA_result <- fread("result/GSVA_result.txt", sep = "\t", header = T)
p <- ggplot(GSVA_result)+
  geom_bar(aes(x=Pathway,y=t,
  fill=CellType),stat = 'identity',
  width = 0.8,position = position_dodge(width = 0.1))+
  coord_flip()+
  scale_fill_manual(values = c("#CC0000","#006633"))+
  theme_classic()

ggsave(p, file = "result/GSVA.pdf", width = 9, height = 4)

Figure shows that the SPP1+ TAMs showed preferential expression of genes involved in angiogenesis as a result of validation.

References

Cheng S , Li Z , Gao R , et al. A pan-cancer single-cell transcriptional atlas of tumor infiltrating myeloid cells[J]. Cell, 2021, 184(3):792-809.e23.

Original

https://github.com/Sijin-ZhangLab/PanMyeloid

Donation

[paypal-donation]

3 thoughts on “Differentially expressed genes and GSVA pathway enrichment between two cluster of cells

Leave a Reply

Your email address will not be published. Required fields are marked *