RFGeneRank is a Bioconductor package that provides a leakage-aware, machine-learning workflow for gene ranking and classification using bulk RNA-seq–derived expression matrices. The package emphasizes reproducible evaluation with cross-validation and out-of-fold (OOF) preprocessing, and provides interpretable gene importance outputs for downstream biological interpretation.
High-throughput transcriptomic studies are often collected across different cohorts, sequencing runs, or laboratories. When datasets are integrated, batch effects and confounding can inflate predictive performance if preprocessing is performed outside cross-validation (information leakage). RFGeneRank addresses this by providing a fold-safe workflow for gene ranking and classification that integrates clean inputs, batch-aware options, and convenient reporting.
Bioconductor provides well-established methodologies for differential expression analysis and batch effect correction. RFGeneRank is not intended to replace these methodologies. Instead, it is designed to complement existing analytical workflows by adding a machine-learning–based gene ranking layer, with leakage-aware evaluation and diagnostic capabilities, implemented in a Bioconductor-compatible framework using standard data containers.
if (!requireNamespace("BiocManager", quietly = TRUE))
install.packages("BiocManager")
BiocManager::install("RFGeneRank")
This vignette demonstrates the core protocol used in RFGeneRank:
prepare_data().rank_genes().suppressPackageStartupMessages({
library(RFGeneRank)
library(SummarizedExperiment)
library(S4Vectors)
})
set.seed(42)
n_genes <- 300
n_samples <- 60
genes <- paste0("Gene", seq_len(n_genes))
samples <- paste0("Sample", seq_len(n_samples))
# Sample metadata (rows must match sample names)
meta_df <- data.frame(
state = factor(rep(c("CTRL","CASE"), each = n_samples/2), levels = c("CTRL","CASE")),
batch = factor(rep(c("B1","B2"), length.out = n_samples)),
sex = factor(rep(c("M","F"), length.out = n_samples)),
age = round(stats::runif(n_samples, 25, 65)),
stringsAsFactors = FALSE,
check.names = TRUE
)
# Transcriptomics-like expression: strictly positive values (log-normal)
expr <- matrix(
exp(rnorm(n_genes * n_samples, mean = 2.5, sd = 0.6)),
nrow = n_genes, ncol = n_samples,
dimnames = list(genes, samples)
)
# Inject signal in CASE for a subset of genes
signal_genes <- genes[1:25]
case_cols <- meta_df$state == "CASE"
expr[signal_genes, case_cols] <- expr[signal_genes, case_cols] * 1.8
# Critical alignment: metadata rownames must match expression colnames
rownames(meta_df) <- colnames(expr)
stopifnot(identical(colnames(expr), rownames(meta_df)))
# Build SummarizedExperiment
se <- SummarizedExperiment(
assays = list(expr = expr),
colData = DataFrame(meta_df)
)
se
#> class: SummarizedExperiment
#> dim: 300 60
#> metadata(0):
#> assays(1): expr
#> rownames(300): Gene1 Gene2 ... Gene299 Gene300
#> rowData names(0):
#> colnames(60): Sample1 Sample2 ... Sample59 Sample60
#> colData names(4): state batch sex age
# Detect whether the matrix is count-like (integer); our simulated data are continuous.
is_integerish <- function(x) all(abs(x - round(x)) < 1e-8, na.rm = TRUE)
counts_flag <- is_integerish(expr)
se_prep <- prepare_data(
mats = list(SummarizedExperiment::assay(se, "expr")),
metas = list(meta_df), # use data.frame for robustness in vignettes
label_col = "state",
batch_col = "batch",
log1p = counts_flag,
batch_method = "combat",
batch_correction_scope = "global"
)
se_prep
#> class: SummarizedExperiment
#> dim: 300 60
#> metadata(3): ..batch_col ..batch_corrected ..batch_correction_scope
#> assays(1): expr
#> rownames(300): Gene1 Gene2 ... Gene299 Gene300
#> rowData names(0):
#> colnames(60): Sample1 Sample2 ... Sample59 Sample60
#> colData names(6): state batch ... ..batch_col ..batch_covariates
cw <- c(CTRL = 1, CASE = 2)
fit <- rank_genes(
se_prep,
label_col = "state",
cv = "kfold", k = 3,
n_top = 100,
trees = 300,
fold_batch_correction = FALSE,
batch_col = "batch",
class_weights = cw,
auto_confounds = FALSE,
seed = 42
)
fit
#> GeneRankFit
#> k=3, trees=300, importance=permutation
#> features ranked: 146
#> finalized model: <none>
#> calibration: <none>
top_genes(fit, n = 10)
#> $gene
#> [1] "Gene19" "Gene24" "Gene3" "Gene17" "Gene25" "Gene1" "Gene10" "Gene14"
#> [9] "Gene20" "Gene8"
#>
#> $table
#> gene importance SelectedInFolds
#> 1 Gene19 3.7461757 3
#> 4 Gene24 3.0019289 3
#> 30 Gene3 2.6706210 2
#> 18 Gene17 2.3047686 3
#> 73 Gene25 2.0296908 3
#> 26 Gene1 1.9635692 3
#> 85 Gene10 1.7124708 3
#> 31 Gene14 1.4511371 3
#> 29 Gene20 1.1966736 3
#> 13 Gene8 0.8929695 3
tab_signed <- sign_importance(
fit, se_prep,
y = SummarizedExperiment::colData(se_prep)[["state"]],
method = "mean"
)
head(tab_signed, 10)
#> gene importance direction signed_importance mean_case mean_ctrl mean_diff
#> 1 Gene19 3.7461757 1 3.7461757 31.20445 12.33282 18.87162
#> 2 Gene24 3.0019289 1 3.0019289 30.45064 15.37228 15.07835
#> 3 Gene3 2.6706210 1 2.6706210 24.30786 12.60644 11.70143
#> 4 Gene17 2.3047686 1 2.3047686 26.72206 14.64225 12.07981
#> 5 Gene25 2.0296908 1 2.0296908 23.90360 12.89791 11.00569
#> 6 Gene1 1.9635692 1 1.9635692 28.33095 12.32328 16.00767
#> 7 Gene10 1.7124708 1 1.7124708 23.66650 12.15267 11.51383
#> 8 Gene14 1.4511371 1 1.4511371 27.35223 14.35958 12.99264
#> 9 Gene20 1.1966736 1 1.1966736 26.16339 14.36079 11.80261
#> 10 Gene8 0.8929695 1 0.8929695 24.56881 13.15816 11.41066
#> log2FC shap_dir
#> 1 1.3392484 NA
#> 2 0.9861409 NA
#> 3 0.9472625 NA
#> 4 0.8678937 NA
#> 5 0.8900905 NA
#> 6 1.2009927 NA
#> 7 0.9615733 NA
#> 8 0.9296444 NA
#> 9 0.8654150 NA
#> 10 0.9008705 NA
plot_importance(
fit,
top = 20,
map_to_symbol = FALSE
)
plot_roc(fit)
plot_sign_importance(
fit,
tab = tab_signed,
top = 20
)
sessionInfo()
#> R version 4.6.0 alpha (2026-04-05 r89794)
#> Platform: x86_64-pc-linux-gnu
#> Running under: Ubuntu 24.04.4 LTS
#>
#> Matrix products: default
#> BLAS: /home/biocbuild/bbs-3.23-bioc/R/lib/libRblas.so
#> LAPACK: /usr/lib/x86_64-linux-gnu/lapack/liblapack.so.3.12.0 LAPACK version 3.12.0
#>
#> locale:
#> [1] LC_CTYPE=en_US.UTF-8 LC_NUMERIC=C
#> [3] LC_TIME=en_GB LC_COLLATE=C
#> [5] LC_MONETARY=en_US.UTF-8 LC_MESSAGES=en_US.UTF-8
#> [7] LC_PAPER=en_US.UTF-8 LC_NAME=C
#> [9] LC_ADDRESS=C LC_TELEPHONE=C
#> [11] LC_MEASUREMENT=en_US.UTF-8 LC_IDENTIFICATION=C
#>
#> time zone: America/New_York
#> tzcode source: system (glibc)
#>
#> attached base packages:
#> [1] stats4 stats graphics grDevices utils datasets methods
#> [8] base
#>
#> other attached packages:
#> [1] SummarizedExperiment_1.41.1 Biobase_2.71.0
#> [3] GenomicRanges_1.63.2 Seqinfo_1.1.0
#> [5] IRanges_2.45.0 S4Vectors_0.49.1
#> [7] BiocGenerics_0.57.0 generics_0.1.4
#> [9] MatrixGenerics_1.23.0 matrixStats_1.5.0
#> [11] RFGeneRank_0.99.4 BiocStyle_2.39.0
#>
#> loaded via a namespace (and not attached):
#> [1] DBI_1.3.0 pROC_1.19.0.1 rlang_1.2.0
#> [4] magrittr_2.0.5 otel_0.2.0 compiler_4.6.0
#> [7] RSQLite_2.4.6 mgcv_1.9-4 reshape2_1.4.5
#> [10] png_0.1-9 vctrs_0.7.2 stringr_1.6.0
#> [13] sva_3.59.0 pkgconfig_2.0.3 crayon_1.5.3
#> [16] fastmap_1.2.0 magick_2.9.1 XVector_0.51.0
#> [19] labeling_0.4.3 rmarkdown_2.31 prodlim_2026.03.11
#> [22] tinytex_0.59 purrr_1.2.1 bit_4.6.0
#> [25] xfun_0.57 cachem_1.1.0 jsonlite_2.0.0
#> [28] recipes_1.3.2 blob_1.3.0 DelayedArray_0.37.1
#> [31] BiocParallel_1.45.0 parallel_4.6.0 R6_2.6.1
#> [34] stringi_1.8.7 bslib_0.10.0 RColorBrewer_1.1-3
#> [37] ranger_0.18.0 limma_3.67.0 genefilter_1.93.0
#> [40] parallelly_1.46.1 rpart_4.1.27 lubridate_1.9.5
#> [43] jquerylib_0.1.4 Rcpp_1.1.1 bookdown_0.46
#> [46] iterators_1.0.14 knitr_1.51 future.apply_1.20.2
#> [49] timechange_0.4.0 Matrix_1.7-5 splines_4.6.0
#> [52] nnet_7.3-20 tidyselect_1.2.1 dichromat_2.0-0.1
#> [55] abind_1.4-8 yaml_2.3.12 timeDate_4052.112
#> [58] codetools_0.2-20 listenv_0.10.1 lattice_0.22-9
#> [61] tibble_3.3.1 plyr_1.8.9 withr_3.0.2
#> [64] KEGGREST_1.51.1 S7_0.2.1 evaluate_1.0.5
#> [67] future_1.70.0 survival_3.8-6 Biostrings_2.79.5
#> [70] pillar_1.11.1 BiocManager_1.30.27 foreach_1.5.2
#> [73] ggplot2_4.0.2 scales_1.4.0 globals_0.19.1
#> [76] xtable_1.8-8 class_7.3-23 glue_1.8.0
#> [79] tools_4.6.0 data.table_1.18.2.1 ModelMetrics_1.2.2.2
#> [82] annotate_1.89.0 gower_1.0.2 locfit_1.5-9.12
#> [85] XML_3.99-0.23 grid_4.6.0 ipred_0.9-15
#> [88] AnnotationDbi_1.73.1 edgeR_4.9.5 nlme_3.1-169
#> [91] cli_3.6.5 S4Arrays_1.11.1 lava_1.9.0
#> [94] dplyr_1.2.1 gtable_0.3.6 sass_0.4.10
#> [97] digest_0.6.39 caret_7.0-1 SparseArray_1.11.13
#> [100] farver_2.1.2 memoise_2.0.1 htmltools_0.5.9
#> [103] lifecycle_1.0.5 hardhat_1.4.3 httr_1.4.8
#> [106] statmod_1.5.1 bit64_4.6.0-1 MASS_7.3-65