10X single-cell RNA sequencing QC (known sample SNP)
Introduction
In this paper, single-cell 10X RNA sequencing was performed on a mixture of a variety of known cell lines to study the interaction mode between polyclones. Here we introduce the operation of filtering doublets and low quality cells in single cell sequencing.
Code explanation
Load the required tool function
library(mclust)
library(ggplot2)
# load counts matrix, genes, and cell identities for a given experiment
load_sc_exp <- function(data_folder) {
rMat <- Matrix::readMM(file.path(data_folder, 'matrix.mtx')) %>% as.matrix()
genes <- readr::read_tsv(file.path(data_folder, 'genes.tsv'), col_names = F) %>%
magrittr::set_colnames(c('Ensembl_ID', 'Gene_Symbol'))
barcodes <- readr::read_tsv(file.path(data_folder, 'barcodes.tsv'), col_names = F)
colnames(rMat) <- barcodes$X1
rownames(rMat) <- genes$Ensembl_ID
num_CLs <- ncol(rMat)
num_genes <- nrow(rMat)
classification <- barcodes
if (file.exists(file.path(data_folder, 'classifications.csv'))) {
classification <- readr::read_csv(file.path(data_folder, 'classifications.csv')) %>% as.data.frame()
}
cat(sprintf('Loaded matrix with %d CLs and %d genes\n', num_CLs, num_genes))
return(list(counts = rMat, gene_info = genes, cell_info = classification))
}
# convert gene identifiers from ensembl IDs to hugo symbols
convert_to_hugo_symbols <- function (dat) {
ensemble_to_hugo <- dat$gene_info$Gene_Symbol %>% magrittr::set_names(dat$gene_info$Ensembl_ID)
rownames(dat$counts) %<>% ensemble_to_hugo[.] %>% magrittr::set_names(NULL)
dat$counts <- dat$counts[!is.na(rownames(dat$counts)), ]
dat$counts <- dat$counts[!duplicated(rownames(dat$counts)), ]
return(dat)
}
# input data object containing the counts, cell info (cell line classifications), and gene info
# for the experiment
create_seurat_obj <- function(dat, use_symbols = TRUE) {
if(use_symbols) {
dat <- dat %>%
convert_to_hugo_symbols() #convert genes to hugo symbols (and only keep unique hugo symbol genes)
}
rownames(dat$cell_info) <- dat$cell_info$barcode
#make into Seurat object
seuObj <- Seurat::CreateSeuratObject(dat$counts,
min.cells = 0,
min.features = 0,
meta.data = dat$cell_info)
#make singlet classifications the cell identifiers
seuObj <- Seurat::SetIdent(seuObj, value = seuObj@meta.data$singlet_ID)
#store gene info here
seuObj@misc <- dat$gene_info
#add mitochondrial gene fraction
seuObj<- Seurat::PercentageFeatureSet(seuObj, pattern = "^MT-", col.name = 'percent.mito')
#add cellular detection rate (fraction of genes detected in a given cell)
seuObj@meta.data$cell_det_rate <- seuObj$nFeature_RNA/nrow(seuObj@assays$RNA@counts)
return(seuObj)
}
# begin adding cell quality classifications
# classify cells with too few SNPs or abnormal mitochondrial read percentages as low quality
filter_low_quality_cells <- function(seuObj, min_SNPs = 50, max_mito_perc = 25, min_mito_perc=1) {
#too few SNPs
seuObj@meta.data <- seuObj@meta.data %>%
dplyr::mutate(cell_quality = ifelse(num_SNPs < min_SNPs, 'low_quality', 'normal'))
#abnormal mitochondrial read %
seuObj@meta.data <- seuObj@meta.data %>%
dplyr::mutate(cell_quality = ifelse(percent.mito > max_mito_perc | percent.mito < min_mito_perc, 'low_quality', cell_quality))
return(seuObj)
}
# run Seurat methods, excluding low quality cells
process_Seurat_obj <- function(seuObj, n_pcs = 50, cluster_k =10, clust_res = 1, plot = TRUE) {
all_meta <- seuObj@meta.data
rownames(all_meta) <- all_meta$barcode
rownames(seuObj@meta.data) <- seuObj@meta.data$barcode
seuObj <- Seurat::SubsetData(seuObj, cells = dplyr::filter(seuObj@meta.data, cell_quality == 'normal')$barcode)
# run Seurat methods to get gene expression clusters
seuObj <- Seurat::NormalizeData(object = seuObj,
normalization.method = "LogNormalize",
scale.factor = 1e5, verbose = FALSE)
seuObj <- Seurat::ScaleData(object = seuObj, vars.to.regress = c(), verbose = FALSE)
seuObj <- Seurat::FindVariableFeatures(object = seuObj,
nfeatures = 5000,
do.plot = FALSE,
selection.method = 'vst', verbose = FALSE)
seuObj <- Seurat::RunPCA(object = seuObj,
features = Seurat::VariableFeatures(seuObj),
seed.use = 1,
npcs = n_pcs,
verbose = FALSE)
seuObj <- Seurat::RunTSNE(object = seuObj,
dims = 1:n_pcs,
check_duplicates = FALSE,
seed.use = 1,
perplexity = 30,
verbose = FALSE)
seuObj <- Seurat::FindNeighbors(seuObj, reduction = 'pca',
dims = 1:n_pcs,
k.param = cluster_k,
force.recalc = TRUE,
verbose = FALSE)
seuObj <- Seurat::FindClusters(seuObj, resolution = clust_res, random.seed = 1, verbose = FALSE)
if(plot) {
gg <- Seurat::DimPlot(seuObj, group.by = 'seurat_clusters') + Seurat::NoLegend()
print(gg)
}
all_meta <- dplyr::left_join(all_meta, seuObj@meta.data[,c('barcode', 'RNA_snn_res.1','seurat_clusters')])
seuObj@meta.data <- all_meta
return(seuObj)
}
# identify low quality 'cells', which may be from empty droplets
empty_droplet_clusters <- function(seuObj, dev_thresh = 0.3, plot = TRUE) {
df <- Seurat::Embeddings(seuObj, reduction = 'tsne') %>% as.data.frame() %>%
tibble::rownames_to_column(var = 'barcode') %>%
dplyr::left_join(seuObj@meta.data %>%
dplyr::mutate(max_dev = singlet_dev + doublet_dev_imp), by='barcode')
#compute stats for each cluster
clust_stats <- plyr::ddply(df, .(seurat_clusters), function(ss) {
data_frame(n_cells = nrow(ss),
med_doub = median(ss$doublet_dev_imp, na.rm=T),
med_dev = median(ss$max_dev, na.rm=T))
}) %>% dplyr::mutate(
clust_type = ifelse(med_dev <= dev_thresh, 'empty_droplet', 'normal')
)
empty_droplets <- df$barcode[which((clust_stats$clust_type %>% rlang::set_names(clust_stats$seurat_clusters) %>%
.[df$seurat_clusters]) == 'empty_droplet')]
print(sprintf('Identified %d empty droplets', length(empty_droplets)))
rownames(seuObj@meta.data) <- seuObj@meta.data$barcode
seuObj@meta.data[empty_droplets,'cell_quality'] <- 'empty_droplet'
if(plot) {
gg <- Seurat::DimPlot(seuObj, group.by = 'cell_quality')
print(gg)
}
return(seuObj)
}
# Identify doublets
classify_doublets <- function(seuObj, doub_prob_thresh=0.5, doublet_dev_offset = 0.001, plot = TRUE) {
# use singlet deviance, doublet deviance improvement, and fraction of genes detected in a given cell
# to identify cells that are doublets
# doublet identification
doublet_df <- Seurat::FetchData(seuObj, c('singlet_dev', 'doublet_dev_imp', 'cell_quality')) %>%
tibble::rownames_to_column(var = 'barcode') %>%
dplyr::filter(cell_quality %in% c('normal', 'doublet')) %>%
dplyr::mutate(log_doublet_imp = log10(doublet_dev_imp + doublet_dev_offset)) %>%
dplyr::select(-doublet_dev_imp)
# handle cells where the doublet model failed by treating them as singlets.
fail_doublet_model <- doublet_df %>%
dplyr::filter(is.na(log_doublet_imp)) %>%
.[['barcode']]
print(sprintf('%d cells with failed doublet models treated as singlets', length(fail_doublet_model)))
doublet_df %<>% dplyr::filter(!(barcode %in% fail_doublet_model))
# sfit GMM
gmm_output <- mclust::Mclust(as.matrix(doublet_df[, c('log_doublet_imp', 'singlet_dev')]), G = 2, verbose = FALSE, prior = mclust::priorControl(shrinkage = 0))
doublet_cluster <- which.max(gmm_output$parameters$mean['log_doublet_imp', ])
singlet_cluster <- which.min(gmm_output$parameters$mean['log_doublet_imp', ])
doublet_df$doublet_prob <- gmm_output$z[,doublet_cluster]
doublet_cells <- doublet_df %>%
dplyr::filter(doublet_prob >= doub_prob_thresh) %>%
.[['barcode']]
seuObj@meta.data[doublet_cells, 'cell_quality'] <- 'doublet'
if(plot) {
g1 <- plot(gmm_output, what = 'classification')
g2 <- ggplot2::ggplot(doublet_df, ggplot2::aes(singlet_dev, log_doublet_imp, color=doublet_prob)) +
ggplot2::geom_point() +
ggplot2::theme(legend.position = "right", legend.direction = "vertical", legend.text = ggplot2::element_text(size=10)) +
ggplot2::xlab("singlet deviance") + ggplot2::ylab("log doublet deviance improvement")
g3 <- ggplot2::ggplot(doublet_df %>% dplyr::mutate(is_doublet = barcode %in% doublet_cells), ggplot2::aes(singlet_dev, log_doublet_imp, color= is_doublet)) +
ggplot2::geom_point() +
ggplot2::theme(legend.position = "right", legend.direction = "vertical", legend.text =ggplot2::element_text(size=10)) +
ggplot2::xlab("singlet deviance") + ggplot2::ylab("log doublet deviance improvement") +
ggplot2::guides(color = ggplot2::guide_legend(title = 'doublet'))
print(g1)
print(g2)
print(g3)
}
return(seuObj)
}
identify_low_confidence_cells <- function(seuObj, min_z_margin = 2, plot = TRUE) {
df <- seuObj@meta.data
seuObj@meta.data %<>%
dplyr::mutate(cell_quality = ifelse(seuObj@meta.data$singlet_z_margin < min_z_margin & seuObj@meta.data$cell_quality == 'normal', 'low_confidence', cell_quality))
if(plot) {
cell_class_confidence <- ggplot2::ggplot(df %>% filter(cell_quality == 'normal'),
ggplot2::aes(singlet_z_margin)) +
ggplot2::geom_density() +
ggplot2::geom_vline(xintercept = min_z_margin, linetype = 'dashed')
print(cell_class_confidence)
gg <- ggplot2::ggplot(seuObj@meta.data, ggplot2::aes(singlet_dev, doublet_dev_imp, fill=cell_quality)) +
ggplot2::geom_point(pch = 21, size = 1.5, color = 'white', stroke = 0.1) + ggplot2::ggtitle('cell quality classifications')
print(gg)
}
return(seuObj)
}
To read in the data, the data is placed under a folder named data, not a single_cell_classification folder, which includes the output matrix.mtx, genes.tsv, barcodes.tsv of Cellranger, and a cell classification file identified by SNP. classifications.csv
# run QC methods
dat <- load_sc_exp(here::here('data'))
seuObj <- create_seurat_obj(dat)
num_CLs <- length(unique(seuObj@meta.data$singlet_ID))
Set the parameters according to the number of sequenced cell lines, n_pcs is the number of principal components in principal component analysis, and clust_res is the resolution parameter of running FindCluster function
# set parameters based on the number of cell lines in the experiment
n_pcs <- num_CLs*2
param_range <- 10
if(num_CLs < 25+param_range) {
clust_res <- 1
} else if(num_CLs > 50-param_range & num_CLs < 50+param_range) {
clust_res <- 2
} else if(num_CLs > 100-param_range & num_CLs < 100+param_range) {
clust_res <- 4
} else {
stop('Not implemented')
}
Filter operation
seuObj <- filter_low_quality_cells(seuObj)
seuObj <- process_Seurat_obj(seuObj, n_pcs = n_pcs, clust_res = clust_res)
seuObj <- empty_droplet_clusters(seuObj)
seuObj <- classify_doublets(seuObj)
seuObj <- identify_low_confidence_cells(seuObj)
References
Kinker G S , Greenwald A C , Tal R , et al. Pan-cancer single-cell RNA-seq identifies recurring programs of cellular heterogeneity[J]. Nature Genetics, 2020, 52(11):1208-1218.
Original
https://github.com/broadinstitute/single_cell_classification