This quick start guide demonstrates the essential steps for evaluating cell type
annotations using scTypeEval. For a comprehensive tutorial, see the main
vignette.
library(scTypeEval)
library(Matrix)
# Simulate a small sparse count matrix: 500 genes x 100 cells, Poisson(5) counts
set.seed(123)
counts <- Matrix(
  rpois(n = 500 * 100, lambda = 5),
  nrow = 500,
  ncol = 100,
  sparse = TRUE
)
# Label rows as Gene1..Gene500 and columns as Cell1..Cell100
dimnames(counts) <- list(
  paste0("Gene", seq_len(nrow(counts))),
  paste0("Cell", seq_len(ncol(counts)))
)

# Per-cell metadata: four cell types in consecutive runs of 25 cells, with the
# five sample labels cycling so each cell type x sample pair has 5 cells
cell_types <- c("TypeA", "TypeB", "TypeC", "TypeD")
sample_ids <- paste0("S", seq_len(5))
metadata <- data.frame(
  celltype = rep(cell_types, each = 25),
  sample = rep(sample_ids, times = 20),
  row.names = colnames(counts)
)
# Build the scTypeEval object from the count matrix and the per-cell metadata
sceval <- create_scTypeEval(
  matrix = counts,
  metadata = metadata
)

# Process the data; `ident` and `sample` name the metadata columns holding the
# cell type labels and the sample IDs
sceval <- run_processing_data(
  sceval,
  ident = "celltype",
  sample = "sample",
  min_samples = 3,
  min_cells = 5
)
#> # Processing data for single-cell ...
#> Transforming and filtering count matrix...
#> Normalizing count matrix via Log1p...
#> # Processing data for pseudobulk ...
#> Transforming and filtering count matrix...
#> Normalizing count matrix via Log1p...
# Select highly variable genes (HVG) to use as features
# NOTE(review): ngenes = 1000 is larger than the 500 genes in the example
# matrix; presumably all available genes are kept -- confirm.
sceval <- run_hvg(
  sceval,
  var_method = "basic",
  ngenes = 1000
)
#> Not using black gene list
#> Computing HVG...
# Compute the PCA reduction, requesting 20 dimensions
sceval <- run_pca(
  sceval,
  ndim = 20
)
#>
#> Using HVG gene list.
#> Not using black gene list
#> # Computing PCA data for single-cell ...
#> Filtering gene list...
#> Filtering empty rows and cols...
#> Computing PCA space...
#> > Returning 20 dimensions for PCA
#> # Computing PCA data for pseudobulk ...
#> Filtering gene list...
#> Filtering empty rows and cols...
#> Computing PCA space...
#> > Returning 19 dimensions for PCA
# Compute the "Pseudobulk:Euclidean" dissimilarity with reduction = TRUE
sceval <- run_dissimilarity(
  sceval,
  method = "Pseudobulk:Euclidean",
  reduction = TRUE
)
#> Running distance for euclidean...
# Score each cell type with the silhouette metric on the stored dissimilarity
results <- get_consistency(
  sceval,
  dissimilarity_slot = "Pseudobulk:Euclidean",
  consistency_metric = "silhouette"
)
#> Computing internal validation metrics for Pseudobulk:Euclidean ...
# One row per cell type
print(results)
#> celltype measure consistency_metric dissimilarity_method ident
#> TypeA TypeA 0.009929989 silhouette Pseudobulk:Euclidean celltype
#> TypeB TypeB -0.011988536 silhouette Pseudobulk:Euclidean celltype
#> TypeC TypeC -0.005448364 silhouette Pseudobulk:Euclidean celltype
#> TypeD TypeD -0.020708784 silhouette Pseudobulk:Euclidean celltype
library(Seurat)
# Wrap the example counts and metadata generated earlier in a Seurat object
seurat_obj <- Seurat::CreateSeuratObject(
  counts = counts,
  meta.data = metadata
)

# create_scTypeEval() takes the Seurat object directly
sceval_seurat <- create_scTypeEval(seurat_obj)
# Continue with the standard workflow
library(SingleCellExperiment)
# Wrap the example counts and metadata generated earlier in a
# SingleCellExperiment object
sce <- SingleCellExperiment::SingleCellExperiment(
  assays = list(counts = counts),
  colData = metadata
)

# create_scTypeEval() takes the SingleCellExperiment object directly
sceval_sce <- create_scTypeEval(sce)
# Continue with the workflow as above
# Compute several dissimilarity methods, each stored on the same object

# 1. Euclidean distance
sceval <- run_dissimilarity(
  sceval,
  method = "Pseudobulk:Euclidean",
  reduction = TRUE
)
#> Running distance for euclidean...

# 2. Cosine distance
sceval <- run_dissimilarity(
  sceval,
  method = "Pseudobulk:Cosine",
  reduction = TRUE
)
#> Running distance for cosine...

# 3. Wasserstein distance
sceval <- run_dissimilarity(
  sceval,
  method = "WasserStein",
  reduction = TRUE
)
#> Splitting matrices...
#> Computing pairwise WasserStein distance...
# Compare silhouette consistency across the dissimilarity methods computed above
dissimilarity_methods <- c(
  "Pseudobulk:Euclidean",
  "Pseudobulk:Cosine",
  "WasserStein"
)

# Passing a vector of slot names evaluates every dissimilarity in one call
results_df <- get_consistency(
  sceval,
  dissimilarity_slot = dissimilarity_methods,
  consistency_metric = "silhouette"
)
#> Computing internal validation metrics for Pseudobulk:Euclidean ...
#> Computing internal validation metrics for Pseudobulk:Cosine ...
#> Computing internal validation metrics for WasserStein ...
results_df
#> celltype measure consistency_metric dissimilarity_method ident
#> TypeA TypeA 0.009929989 silhouette Pseudobulk:Euclidean celltype
#> TypeB TypeB -0.011988536 silhouette Pseudobulk:Euclidean celltype
#> TypeC TypeC -0.005448364 silhouette Pseudobulk:Euclidean celltype
#> TypeD TypeD -0.020708784 silhouette Pseudobulk:Euclidean celltype
#> TypeA1 TypeA -0.003214443 silhouette Pseudobulk:Cosine celltype
#> TypeB1 TypeB -0.002765134 silhouette Pseudobulk:Cosine celltype
#> TypeC1 TypeC -0.013348235 silhouette Pseudobulk:Cosine celltype
#> TypeD1 TypeD -0.022767009 silhouette Pseudobulk:Cosine celltype
#> TypeA2 TypeA 0.034467020 silhouette WasserStein celltype
#> TypeB2 TypeB -0.026486747 silhouette WasserStein celltype
#> TypeC2 TypeC -0.026581239 silhouette WasserStein celltype
#> TypeD2 TypeD -0.043528190 silhouette WasserStein celltype
# Evaluate several consistency metrics on a single dissimilarity
consistency_metrics <- c(
  "silhouette",
  "NeighborhoodPurity",
  "Average_similarity"
)

# Passing a vector of metric names computes all of them in one call
all_metrics <- get_consistency(
  sceval,
  dissimilarity_slot = "Pseudobulk:Euclidean",
  consistency_metric = consistency_metrics
)
#> Computing internal validation metrics for Pseudobulk:Euclidean ...
all_metrics
#> celltype measure consistency_metric dissimilarity_method ident
#> TypeA TypeA 0.009929989 silhouette Pseudobulk:Euclidean celltype
#> TypeB TypeB -0.011988536 silhouette Pseudobulk:Euclidean celltype
#> TypeC TypeC -0.005448364 silhouette Pseudobulk:Euclidean celltype
#> TypeD TypeD -0.020708784 silhouette Pseudobulk:Euclidean celltype
#> TypeA1 TypeA 0.320000000 NeighborhoodPurity Pseudobulk:Euclidean celltype
#> TypeB1 TypeB 0.240000000 NeighborhoodPurity Pseudobulk:Euclidean celltype
#> TypeC1 TypeC 0.240000000 NeighborhoodPurity Pseudobulk:Euclidean celltype
#> TypeD1 TypeD 0.200000000 NeighborhoodPurity Pseudobulk:Euclidean celltype
#> TypeA2 TypeA 0.504617271 Average_similarity Pseudobulk:Euclidean celltype
#> TypeB2 TypeB 0.500755570 Average_similarity Pseudobulk:Euclidean celltype
#> TypeC2 TypeC 0.501285885 Average_similarity Pseudobulk:Euclidean celltype
#> TypeD2 TypeD 0.498308342 Average_similarity Pseudobulk:Euclidean celltype
# Heatmap of the stored dissimilarity, with sort_consistency = "silhouette"
plot_heatmap(
  sceval,
  dissimilarity_slot = "Pseudobulk:Euclidean",
  sort_consistency = "silhouette"
)
#> Computing consistency metric for silhouette.
#> Consistency computed.

# PCA of the pseudobulk profiles (per sample and cell type)
plot_pca(
  sceval,
  reduction_slot = "pseudobulk"
)
# Identify cell type marker genes with the "scran.findMarkers" method
sceval <- run_gene_markers(
  sceval,
  method = "scran.findMarkers",
  ngenes_celltype = 50
)
#> Not using black gene list
#> Computing cell type markers for celltype...

# Use the marker gene list just added for the dissimilarity calculation,
# with reduction = FALSE
sceval <- run_dissimilarity(
  sceval,
  method = "Pseudobulk:Euclidean",
  gene_list = "scran.findMarkers",
  reduction = FALSE
)
#>
#> Using scran.findMarkers gene list.
#> Not using black gene list
#> Filtering gene list...
#> Filtering empty rows and cols...
#> Running distance for euclidean...
# Register a custom gene list; it must be supplied as a named list
# NOTE(review): these symbols are not row names of the example matrix
# (Gene1..Gene500); substitute genes present in your own data.
immune_genes <- c("CD3D", "CD4", "CD8A", "CD19", "CD14", "NCAM1")
sceval <- add_gene_list(
  sceval,
  gene_list = list("immune_markers" = immune_genes)
)

# Run the analysis on the custom list, referenced by its name
# NOTE(review): the rendered output of this call lacks the
# "Using ... gene list" message printed when reduction = FALSE is passed,
# so the list may be ignored unless reduction = FALSE is set -- confirm.
sceval <- run_dissimilarity(
  sceval,
  method = "Pseudobulk:Euclidean",
  gene_list = "immune_markers"
)
#> Running distance for euclidean...
Low consistency scores may indicate issues that need further investigation:
use plot_heatmap() or plot_pca() to identify problematic samples.
browseVignettes("scTypeEval")
sessionInfo()
#> R version 4.6.0 alpha (2026-04-05 r89794)
#> Platform: x86_64-pc-linux-gnu
#> Running under: Ubuntu 24.04.4 LTS
#>
#> Matrix products: default
#> BLAS: /home/biocbuild/bbs-3.23-bioc/R/lib/libRblas.so
#> LAPACK: /usr/lib/x86_64-linux-gnu/lapack/liblapack.so.3.12.0 LAPACK version 3.12.0
#>
#> locale:
#> [1] LC_CTYPE=en_US.UTF-8 LC_NUMERIC=C
#> [3] LC_TIME=en_GB LC_COLLATE=C
#> [5] LC_MONETARY=en_US.UTF-8 LC_MESSAGES=en_US.UTF-8
#> [7] LC_PAPER=en_US.UTF-8 LC_NAME=C
#> [9] LC_ADDRESS=C LC_TELEPHONE=C
#> [11] LC_MEASUREMENT=en_US.UTF-8 LC_IDENTIFICATION=C
#>
#> time zone: America/New_York
#> tzcode source: system (glibc)
#>
#> attached base packages:
#> [1] stats4 stats graphics grDevices utils datasets methods
#> [8] base
#>
#> other attached packages:
#> [1] SingleCellExperiment_1.33.2 SummarizedExperiment_1.41.1
#> [3] Biobase_2.71.0 GenomicRanges_1.63.2
#> [5] Seqinfo_1.1.0 IRanges_2.45.0
#> [7] S4Vectors_0.49.1-1 BiocGenerics_0.57.0
#> [9] generics_0.1.4 MatrixGenerics_1.23.0
#> [11] matrixStats_1.5.0 Seurat_5.4.0
#> [13] SeuratObject_5.4.0 sp_2.2-1
#> [15] Matrix_1.7-5 scTypeEval_0.99.31
#> [17] BiocStyle_2.39.0
#>
#> loaded via a namespace (and not attached):
#> [1] RColorBrewer_1.1-3 jsonlite_2.0.0 magrittr_2.0.5
#> [4] magick_2.9.1 spatstat.utils_3.2-2 farver_2.1.2
#> [7] rmarkdown_2.31 vctrs_0.7.3 ROCR_1.0-12
#> [10] spatstat.explore_3.8-0 tinytex_0.59 S4Arrays_1.11.1
#> [13] htmltools_0.5.9 BiocNeighbors_2.5.4 SparseArray_1.11.13
#> [16] sass_0.4.10 sctransform_0.4.3 parallelly_1.46.1
#> [19] KernSmooth_2.23-26 bslib_0.10.0 htmlwidgets_1.6.4
#> [22] ica_1.0-3 plyr_1.8.9 plotly_4.12.0
#> [25] zoo_1.8-15 cachem_1.1.0 igraph_2.2.3
#> [28] mime_0.13 lifecycle_1.0.5 pkgconfig_2.0.3
#> [31] rsvd_1.0.5 R6_2.6.1 fastmap_1.2.0
#> [34] fitdistrplus_1.2-6 future_1.70.0 shiny_1.13.0
#> [37] digest_0.6.39 patchwork_1.3.2 tensor_1.5.1
#> [40] dqrng_0.4.1 RSpectra_0.16-2 irlba_2.3.7
#> [43] beachmat_2.27.5 labeling_0.4.3 progressr_0.19.0
#> [46] spatstat.sparse_3.1-0 httr_1.4.8 polyclip_1.10-7
#> [49] abind_1.4-8 compiler_4.6.0 withr_3.0.2
#> [52] S7_0.2.1-1 BiocParallel_1.45.0 fastDummies_1.7.5
#> [55] MASS_7.3-65 DelayedArray_0.37.1 bluster_1.21.1
#> [58] tools_4.6.0 lmtest_0.9-40 otel_0.2.0
#> [61] httpuv_1.6.17 future.apply_1.20.2 goftest_1.2-3
#> [64] glue_1.8.0 nlme_3.1-169 promises_1.5.0
#> [67] grid_4.6.0 Rtsne_0.17 cluster_2.1.8.2
#> [70] reshape2_1.4.5 gtable_0.3.6 spatstat.data_3.1-9
#> [73] tidyr_1.3.2 data.table_1.18.2.1 metapod_1.19.2
#> [76] ScaledMatrix_1.19.0 BiocSingular_1.27.1 XVector_0.51.0
#> [79] spatstat.geom_3.7-3 RcppAnnoy_0.0.23 ggrepel_0.9.8
#> [82] RANN_2.6.2 pillar_1.11.1 stringr_1.6.0
#> [85] limma_3.67.1 spam_2.11-3 RcppHNSW_0.6.0
#> [88] later_1.4.8 splines_4.6.0 dplyr_1.2.1
#> [91] lattice_0.22-9 survival_3.8-6 deldir_2.0-4
#> [94] tidyselect_1.2.1 locfit_1.5-9.12 scuttle_1.21.6
#> [97] miniUI_0.1.2 pbapply_1.7-4 transport_0.15-4
#> [100] knitr_1.51 gridExtra_2.3 bookdown_0.46
#> [103] edgeR_4.9.7 scattermore_1.2 xfun_0.57
#> [106] statmod_1.5.1 stringi_1.8.7 lazyeval_0.2.3
#> [109] yaml_2.3.12 evaluate_1.0.5 codetools_0.2-20
#> [112] tibble_3.3.1 BiocManager_1.30.27 cli_3.6.6
#> [115] uwot_0.2.4 xtable_1.8-8 reticulate_1.46.0
#> [118] jquerylib_0.1.4 dichromat_2.0-0.1 Rcpp_1.1.1-1
#> [121] globals_0.19.1 spatstat.random_3.4-5 png_0.1-9
#> [124] spatstat.univar_3.1-7 parallel_4.6.0 ggplot2_4.0.2
#> [127] dotCall64_1.2 scran_1.39.2 listenv_0.10.1
#> [130] viridisLite_0.4.3 scales_1.4.0 ggridges_0.5.7
#> [133] purrr_1.2.2 rlang_1.2.0 cowplot_1.2.0