1 Overview

This quick start guide demonstrates the essential steps for evaluating cell type annotations using scTypeEval. For a comprehensive tutorial, see the main vignette.

library(scTypeEval)

2 Minimal Workflow

2.1 From a Count Matrix

library(Matrix)

# Simulate a reproducible toy dataset: a sparse 500 x 100 matrix of
# Poisson(5) counts, with genes in rows and cells in columns
set.seed(123)
counts <- Matrix(
  rpois(n = 50000, lambda = 5),
  nrow = 500,
  ncol = 100,
  sparse = TRUE
)
rownames(counts) <- paste0("Gene", seq_len(nrow(counts)))
colnames(counts) <- paste0("Cell", seq_len(ncol(counts)))

# Per-cell annotations keyed by cell name: four cell types of 25 consecutive
# cells each, with sample labels S1..S5 cycling across the cells
metadata <- data.frame(
  celltype = rep(c("TypeA", "TypeB", "TypeC", "TypeD"), each = 25),
  sample = paste0("S", rep(seq_len(5), times = 20)),
  row.names = colnames(counts)
)

# Wrap the count matrix and the per-cell metadata in an scTypeEval object
sceval <- create_scTypeEval(
  matrix = counts,
  metadata = metadata
)

# Process data: "celltype" and "sample" name the metadata columns that hold
# the annotation and the sample of origin
sceval <- run_processing_data(sceval,
                              ident = "celltype",
                              sample = "sample",
                              min_samples = 3,
                              min_cells = 5)
#> # Processing data for single-cell ...
#>    Transforming and filtering count matrix...
#>    Normalizing count matrix via Log1p...
#> # Processing data for pseudobulk ...
#>    Transforming and filtering count matrix...
#>    Normalizing count matrix via Log1p...

# Identify features
# NOTE(review): the example matrix has only 500 genes, fewer than the
# ngenes = 1000 requested here; presumably run_hvg() caps the request at the
# genes available - confirm.
sceval <- run_hvg(sceval,
                  var_method = "basic",
                  ngenes = 1000)
#> Not using black gene list
#> Computing HVG...

# Run PCA
# ndim = 20 requests 20 components; the recorded output reports 20 dimensions
# for the single-cell data but 19 for the pseudobulk data.
sceval <- run_pca(sceval, ndim = 20)
#> 
#> Using HVG gene list.
#> Not using black gene list
#> # Computing PCA data for single-cell ...
#>    Filtering gene list...
#>    Filtering empty rows and cols...
#>    Computing PCA space...
#>    > Returning 20 dimensions for PCA
#> # Computing PCA data for pseudobulk ...
#>    Filtering gene list...
#>    Filtering empty rows and cols...
#>    Computing PCA space...
#>    > Returning 19 dimensions for PCA

# Compute dissimilarity
# NOTE(review): reduction = TRUE presumably computes the distance on the PCA
# space stored by run_pca() rather than on the expression matrix - confirm
# in ?run_dissimilarity.
sceval <- run_dissimilarity(
  sceval,
  method = "Pseudobulk:Euclidean",
  reduction = TRUE
)
#>    Running distance for euclidean...

# Get consistency
# dissimilarity_slot refers to the result computed above by its method name;
# the returned table has one row per cell type (see the printed output).
results <- get_consistency(
  sceval,
  dissimilarity_slot = "Pseudobulk:Euclidean",
  consistency_metric = "silhouette"
)
#> Computing internal validation metrics for Pseudobulk:Euclidean ...
print(results)
#>       celltype      measure consistency_metric dissimilarity_method    ident
#> TypeA    TypeA  0.009929989         silhouette Pseudobulk:Euclidean celltype
#> TypeB    TypeB -0.011988536         silhouette Pseudobulk:Euclidean celltype
#> TypeC    TypeC -0.005448364         silhouette Pseudobulk:Euclidean celltype
#> TypeD    TypeD -0.020708784         silhouette Pseudobulk:Euclidean celltype

2.2 From a Seurat Object

library(Seurat)

# Build a Seurat object from the example counts and metadata generated earlier
seurat_obj <- Seurat::CreateSeuratObject(counts = counts, meta.data = metadata)

# The Seurat object is passed to create_scTypeEval() directly
sceval_seurat <- create_scTypeEval(seurat_obj)

# Continue with standard workflow

2.3 From a SingleCellExperiment Object

library(SingleCellExperiment)

# Build a SingleCellExperiment from the example data generated earlier: the
# count matrix goes in the "counts" assay, the metadata in colData
sce <- SingleCellExperiment::SingleCellExperiment(
  assays = list(counts = counts), colData = metadata
)

# The SingleCellExperiment is passed to create_scTypeEval() directly
sceval_sce <- create_scTypeEval(sce)

# Continue with workflow as above

3 Common Use Cases

3.1 Compare Multiple Dissimilarity Methods

# Compute different dissimilarity methods
# Each call reassigns sceval, and each method name is reused further down as a
# dissimilarity_slot. Pseudobulk:Euclidean was already computed in section 2.1
# and is recomputed here.
sceval <- run_dissimilarity(
  sceval,
  method = "Pseudobulk:Euclidean",
  reduction = TRUE
)
#>    Running distance for euclidean...
sceval <- run_dissimilarity(
  sceval,
  method = "Pseudobulk:Cosine",
  reduction = TRUE
)
#>    Running distance for cosine...
sceval <- run_dissimilarity(
  sceval,
  method = "WasserStein",
  reduction = TRUE
)
#> Splitting matrices...
#> Computing pairwise WasserStein distance...

# Compare consistency across methods
# Passing a character vector of slots returns a single stacked data frame; the
# dissimilarity_method column identifies which method each row belongs to.
dissimilarity_methods <- c("Pseudobulk:Euclidean",
                           "Pseudobulk:Cosine",
                           "WasserStein")
results_df <- 
  get_consistency(
    sceval,
    dissimilarity_slot = dissimilarity_methods, # compute for multiple dissimilarities
    consistency_metric = "silhouette"
  )
#> Computing internal validation metrics for Pseudobulk:Euclidean ...
#> Computing internal validation metrics for Pseudobulk:Cosine ...
#> Computing internal validation metrics for WasserStein ...

results_df
#>        celltype      measure consistency_metric dissimilarity_method    ident
#> TypeA     TypeA  0.009929989         silhouette Pseudobulk:Euclidean celltype
#> TypeB     TypeB -0.011988536         silhouette Pseudobulk:Euclidean celltype
#> TypeC     TypeC -0.005448364         silhouette Pseudobulk:Euclidean celltype
#> TypeD     TypeD -0.020708784         silhouette Pseudobulk:Euclidean celltype
#> TypeA1    TypeA -0.003214443         silhouette    Pseudobulk:Cosine celltype
#> TypeB1    TypeB -0.002765134         silhouette    Pseudobulk:Cosine celltype
#> TypeC1    TypeC -0.013348235         silhouette    Pseudobulk:Cosine celltype
#> TypeD1    TypeD -0.022767009         silhouette    Pseudobulk:Cosine celltype
#> TypeA2    TypeA  0.034467020         silhouette          WasserStein celltype
#> TypeB2    TypeB -0.026486747         silhouette          WasserStein celltype
#> TypeC2    TypeC -0.026581239         silhouette          WasserStein celltype
#> TypeD2    TypeD -0.043528190         silhouette          WasserStein celltype

3.2 Evaluate Multiple Consistency Metrics

# Metrics to evaluate on the same stored dissimilarity
consistency_metrics <- c(
  "silhouette",
  "NeighborhoodPurity",
  "Average_similarity"
)

# A vector of metric names returns all of them stacked in one data frame,
# labelled by the consistency_metric column
all_metrics <- get_consistency(
  sceval,
  dissimilarity_slot = "Pseudobulk:Euclidean",
  consistency_metric = consistency_metrics
)
#> Computing internal validation metrics for Pseudobulk:Euclidean ...

all_metrics
#>        celltype      measure consistency_metric dissimilarity_method    ident
#> TypeA     TypeA  0.009929989         silhouette Pseudobulk:Euclidean celltype
#> TypeB     TypeB -0.011988536         silhouette Pseudobulk:Euclidean celltype
#> TypeC     TypeC -0.005448364         silhouette Pseudobulk:Euclidean celltype
#> TypeD     TypeD -0.020708784         silhouette Pseudobulk:Euclidean celltype
#> TypeA1    TypeA  0.320000000 NeighborhoodPurity Pseudobulk:Euclidean celltype
#> TypeB1    TypeB  0.240000000 NeighborhoodPurity Pseudobulk:Euclidean celltype
#> TypeC1    TypeC  0.240000000 NeighborhoodPurity Pseudobulk:Euclidean celltype
#> TypeD1    TypeD  0.200000000 NeighborhoodPurity Pseudobulk:Euclidean celltype
#> TypeA2    TypeA  0.504617271 Average_similarity Pseudobulk:Euclidean celltype
#> TypeB2    TypeB  0.500755570 Average_similarity Pseudobulk:Euclidean celltype
#> TypeC2    TypeC  0.501285885 Average_similarity Pseudobulk:Euclidean celltype
#> TypeD2    TypeD  0.498308342 Average_similarity Pseudobulk:Euclidean celltype

3.3 Visualize Results

# Heatmap of the stored Pseudobulk:Euclidean dissimilarities; with
# sort_consistency set, plot_heatmap() computes the silhouette metric itself
# (see the messages it prints)
plot_heatmap(sceval,
             dissimilarity_slot = "Pseudobulk:Euclidean",
             sort_consistency = "silhouette")
#> Computing consistency metric for silhouette.
#> Consistency computed.


# PCA of the pseudobulk profiles, one point per sample & cell type
plot_pca(sceval, reduction_slot = "pseudobulk")

3.4 Using Marker Genes Instead of HVGs

# Derive marker genes per cell type; they become available as a gene list
# named "scran.findMarkers", which the next call refers to
sceval <- run_gene_markers(sceval,
                           method = "scran.findMarkers",
                           ngenes_celltype = 50)
#> Not using black gene list
#> Computing cell type markers for celltype...

# Compute the pseudobulk distance on the marker gene list. Unlike the earlier
# runs this uses reduction = FALSE, and the run reports that the gene list
# is applied.
sceval <- run_dissimilarity(sceval,
                            method = "Pseudobulk:Euclidean",
                            gene_list = "scran.findMarkers",
                            reduction = FALSE)
#> 
#> Using scran.findMarkers gene list.
#> Not using black gene list
#>    Filtering gene list...
#>    Filtering empty rows and cols...
#>    Running distance for euclidean...

3.5 Focus on Specific Gene Sets

# Add custom gene list
# A real analysis would list biologically meaningful symbols, e.g.
#   c("CD3D", "CD4", "CD8A", "CD19", "CD14", "NCAM1")
# A gene list can only select genes that are present in the count matrix, and
# the example data uses synthetic names (Gene1..Gene500), so stand-in genes
# from the example matrix are used here.
immune_genes <- paste0("Gene", seq_len(50))
sceval <- add_gene_list(
  sceval,
  gene_list = list("immune_markers" = immune_genes) # add a named list
)

# Run analysis on custom genes
# reduction = FALSE is set explicitly, as in the marker-gene example: without
# it the run printed no "Using ... gene list" message, i.e. the custom list
# was not applied.
sceval <- run_dissimilarity(
  sceval,
  method = "Pseudobulk:Euclidean",
  gene_list = "immune_markers", # name of the list to use
  reduction = FALSE
)
#>    Running distance for euclidean...

4 Interpreting Results

4.1 What Low Scores Mean

Low consistency scores may indicate:

  • Ambiguous cell type boundaries between related types
  • Heterogeneous populations needing refinement
  • Annotation inconsistencies across samples

4.2 Next Steps for Low-Scoring Cell Types

  1. Visualize using plot_heatmap() or plot_pca() to identify problematic samples
  2. Investigate biological differences (e.g., disease vs. healthy)
  3. Refine annotations by splitting or merging cell types

5 Available Methods and Metrics

5.1 Dissimilarity Methods

  • Pseudobulk:Euclidean - Euclidean distance on pseudobulk profiles
  • Pseudobulk:Cosine - Cosine distance on pseudobulk profiles
  • Pseudobulk:Pearson - Pearson correlation distance on pseudobulk profiles
  • WasserStein - Wasserstein distance between cell distributions
  • recip_classif:Match - Reciprocal classification matching
  • recip_classif:Score - Reciprocal classification scoring

5.2 Consistency Metrics

  • silhouette - Standard silhouette coefficient
  • 2label_silhouette - Two-label silhouette variant
  • NeighborhoodPurity - K-nearest neighbor purity
  • ward_PropMatch - Ward clustering proportion match
  • Orbital_medoid - Medoid-based orbital metric
  • Average_similarity - Average within-group similarity

6 Tips and Best Practices

  1. Always use multiple samples (minimum 3-5 per cell type)
  2. Compare different methods - no single method is perfect
  3. Use the PCA reduction for speed - it gives similar results and runs much faster
  4. Start with HVGs - then try marker genes if needed
  5. Check sample sizes - ensure adequate cells per type per sample
  6. Interpret in context - consider biological heterogeneity

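7 Getting Help

Function-level documentation is available from within R, e.g. help(package = "scTypeEval") or ?run_dissimilarity.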

8 Session Info

# Print the R version, platform and attached/loaded package versions
sessionInfo()
#> R version 4.6.0 alpha (2026-04-05 r89794)
#> Platform: x86_64-pc-linux-gnu
#> Running under: Ubuntu 24.04.4 LTS
#> 
#> Matrix products: default
#> BLAS:   /home/biocbuild/bbs-3.23-bioc/R/lib/libRblas.so 
#> LAPACK: /usr/lib/x86_64-linux-gnu/lapack/liblapack.so.3.12.0  LAPACK version 3.12.0
#> 
#> locale:
#>  [1] LC_CTYPE=en_US.UTF-8       LC_NUMERIC=C              
#>  [3] LC_TIME=en_GB              LC_COLLATE=C              
#>  [5] LC_MONETARY=en_US.UTF-8    LC_MESSAGES=en_US.UTF-8   
#>  [7] LC_PAPER=en_US.UTF-8       LC_NAME=C                 
#>  [9] LC_ADDRESS=C               LC_TELEPHONE=C            
#> [11] LC_MEASUREMENT=en_US.UTF-8 LC_IDENTIFICATION=C       
#> 
#> time zone: America/New_York
#> tzcode source: system (glibc)
#> 
#> attached base packages:
#> [1] stats4    stats     graphics  grDevices utils     datasets  methods  
#> [8] base     
#> 
#> other attached packages:
#>  [1] SingleCellExperiment_1.33.2 SummarizedExperiment_1.41.1
#>  [3] Biobase_2.71.0              GenomicRanges_1.63.2       
#>  [5] Seqinfo_1.1.0               IRanges_2.45.0             
#>  [7] S4Vectors_0.49.1-1          BiocGenerics_0.57.0        
#>  [9] generics_0.1.4              MatrixGenerics_1.23.0      
#> [11] matrixStats_1.5.0           Seurat_5.4.0               
#> [13] SeuratObject_5.4.0          sp_2.2-1                   
#> [15] Matrix_1.7-5                scTypeEval_0.99.31         
#> [17] BiocStyle_2.39.0           
#> 
#> loaded via a namespace (and not attached):
#>   [1] RColorBrewer_1.1-3     jsonlite_2.0.0         magrittr_2.0.5        
#>   [4] magick_2.9.1           spatstat.utils_3.2-2   farver_2.1.2          
#>   [7] rmarkdown_2.31         vctrs_0.7.3            ROCR_1.0-12           
#>  [10] spatstat.explore_3.8-0 tinytex_0.59           S4Arrays_1.11.1       
#>  [13] htmltools_0.5.9        BiocNeighbors_2.5.4    SparseArray_1.11.13   
#>  [16] sass_0.4.10            sctransform_0.4.3      parallelly_1.46.1     
#>  [19] KernSmooth_2.23-26     bslib_0.10.0           htmlwidgets_1.6.4     
#>  [22] ica_1.0-3              plyr_1.8.9             plotly_4.12.0         
#>  [25] zoo_1.8-15             cachem_1.1.0           igraph_2.2.3          
#>  [28] mime_0.13              lifecycle_1.0.5        pkgconfig_2.0.3       
#>  [31] rsvd_1.0.5             R6_2.6.1               fastmap_1.2.0         
#>  [34] fitdistrplus_1.2-6     future_1.70.0          shiny_1.13.0          
#>  [37] digest_0.6.39          patchwork_1.3.2        tensor_1.5.1          
#>  [40] dqrng_0.4.1            RSpectra_0.16-2        irlba_2.3.7           
#>  [43] beachmat_2.27.5        labeling_0.4.3         progressr_0.19.0      
#>  [46] spatstat.sparse_3.1-0  httr_1.4.8             polyclip_1.10-7       
#>  [49] abind_1.4-8            compiler_4.6.0         withr_3.0.2           
#>  [52] S7_0.2.1-1             BiocParallel_1.45.0    fastDummies_1.7.5     
#>  [55] MASS_7.3-65            DelayedArray_0.37.1    bluster_1.21.1        
#>  [58] tools_4.6.0            lmtest_0.9-40          otel_0.2.0            
#>  [61] httpuv_1.6.17          future.apply_1.20.2    goftest_1.2-3         
#>  [64] glue_1.8.0             nlme_3.1-169           promises_1.5.0        
#>  [67] grid_4.6.0             Rtsne_0.17             cluster_2.1.8.2       
#>  [70] reshape2_1.4.5         gtable_0.3.6           spatstat.data_3.1-9   
#>  [73] tidyr_1.3.2            data.table_1.18.2.1    metapod_1.19.2        
#>  [76] ScaledMatrix_1.19.0    BiocSingular_1.27.1    XVector_0.51.0        
#>  [79] spatstat.geom_3.7-3    RcppAnnoy_0.0.23       ggrepel_0.9.8         
#>  [82] RANN_2.6.2             pillar_1.11.1          stringr_1.6.0         
#>  [85] limma_3.67.1           spam_2.11-3            RcppHNSW_0.6.0        
#>  [88] later_1.4.8            splines_4.6.0          dplyr_1.2.1           
#>  [91] lattice_0.22-9         survival_3.8-6         deldir_2.0-4          
#>  [94] tidyselect_1.2.1       locfit_1.5-9.12        scuttle_1.21.6        
#>  [97] miniUI_0.1.2           pbapply_1.7-4          transport_0.15-4      
#> [100] knitr_1.51             gridExtra_2.3          bookdown_0.46         
#> [103] edgeR_4.9.7            scattermore_1.2        xfun_0.57             
#> [106] statmod_1.5.1          stringi_1.8.7          lazyeval_0.2.3        
#> [109] yaml_2.3.12            evaluate_1.0.5         codetools_0.2-20      
#> [112] tibble_3.3.1           BiocManager_1.30.27    cli_3.6.6             
#> [115] uwot_0.2.4             xtable_1.8-8           reticulate_1.46.0     
#> [118] jquerylib_0.1.4        dichromat_2.0-0.1      Rcpp_1.1.1-1          
#> [121] globals_0.19.1         spatstat.random_3.4-5  png_0.1-9             
#> [124] spatstat.univar_3.1-7  parallel_4.6.0         ggplot2_4.0.2         
#> [127] dotCall64_1.2          scran_1.39.2           listenv_0.10.1        
#> [130] viridisLite_0.4.3      scales_1.4.0           ggridges_0.5.7        
#> [133] purrr_1.2.2            rlang_1.2.0            cowplot_1.2.0