Contents

1 Abstract

RFGeneRank is a Bioconductor package that provides a leakage-aware, machine-learning workflow for gene ranking and classification using bulk RNA-seq–derived expression matrices. The package emphasizes reproducible evaluation with cross-validation and out-of-fold (OOF) preprocessing, and provides interpretable gene importance outputs for downstream biological interpretation.

2 Introduction and Motivation

High-throughput transcriptomic studies are often collected across different cohorts, sequencing runs, or laboratories. When datasets are integrated, batch effects and confounding can inflate predictive performance if preprocessing is performed outside cross-validation (information leakage). RFGeneRank addresses this by providing a fold-safe workflow for gene ranking and classification that integrates clean inputs, batch-aware options, and convenient reporting.

3 Relation to existing Bioconductor packages

Bioconductor provides well-established methodologies for differential expression analysis and batch effect correction. RFGeneRank is not intended to replace these methodologies. Instead, it is designed to complement existing analytical workflows by adding a machine-learning–based gene ranking layer, with leakage-aware evaluation and diagnostic capabilities, implemented in a Bioconductor-compatible framework using standard data containers.

4 Installation

if (!requireNamespace("BiocManager", quietly = TRUE))
  install.packages("BiocManager")
BiocManager::install("RFGeneRank")

5 A small runnable example

This vignette demonstrates the core protocol used in RFGeneRank:

  1. Generate a small transcriptomics-like dataset (genes × samples).
  2. Run prepare_data().
  3. Run rank_genes().
  4. Run a small set of downstream functions and basic plots.
suppressPackageStartupMessages({
  library(RFGeneRank)
  library(SummarizedExperiment)
  library(S4Vectors)
})

set.seed(42)

n_genes <- 300
n_samples <- 60

genes   <- paste0("Gene", seq_len(n_genes))
samples <- paste0("Sample", seq_len(n_samples))

# Sample metadata (rows must match sample names)
meta_df <- data.frame(
  state = factor(rep(c("CTRL","CASE"), each = n_samples/2), levels = c("CTRL","CASE")),
  batch = factor(rep(c("B1","B2"), length.out = n_samples)),
  sex   = factor(rep(c("M","F"), length.out = n_samples)),
  age   = round(stats::runif(n_samples, 25, 65)),
  stringsAsFactors = FALSE,
  check.names = TRUE
)

# Transcriptomics-like expression: strictly positive values (log-normal)
expr <- matrix(
  exp(rnorm(n_genes * n_samples, mean = 2.5, sd = 0.6)),
  nrow = n_genes, ncol = n_samples,
  dimnames = list(genes, samples)
)

# Inject signal in CASE for a subset of genes
signal_genes <- genes[1:25]
case_cols <- meta_df$state == "CASE"
expr[signal_genes, case_cols] <- expr[signal_genes, case_cols] * 1.8

# Critical alignment: metadata rownames must match expression colnames
rownames(meta_df) <- colnames(expr)
stopifnot(identical(colnames(expr), rownames(meta_df)))

# Build SummarizedExperiment
se <- SummarizedExperiment(
  assays  = list(expr = expr),
  colData = DataFrame(meta_df)
)

se
#> class: SummarizedExperiment 
#> dim: 300 60 
#> metadata(0):
#> assays(1): expr
#> rownames(300): Gene1 Gene2 ... Gene299 Gene300
#> rowData names(0):
#> colnames(60): Sample1 Sample2 ... Sample59 Sample60
#> colData names(4): state batch sex age

6 Step 1: prepare_data()

# Detect whether the matrix is count-like (integer); our simulated data are continuous.
is_integerish <- function(x) all(abs(x - round(x)) < 1e-8, na.rm = TRUE)
counts_flag <- is_integerish(expr)

se_prep <- prepare_data(
  mats   = list(SummarizedExperiment::assay(se, "expr")),
  metas  = list(meta_df),   # use data.frame for robustness in vignettes
  label_col  = "state",
  batch_col  = "batch",
  log1p      = counts_flag,
  batch_method = "combat",
  batch_correction_scope = "global"
)

se_prep
#> class: SummarizedExperiment 
#> dim: 300 60 
#> metadata(3): ..batch_col ..batch_corrected ..batch_correction_scope
#> assays(1): expr
#> rownames(300): Gene1 Gene2 ... Gene299 Gene300
#> rowData names(0):
#> colnames(60): Sample1 Sample2 ... Sample59 Sample60
#> colData names(6): state batch ... ..batch_col ..batch_covariates

7 Step 2: rank_genes()

cw <- c(CTRL = 1, CASE = 2)

fit <- rank_genes(
  se_prep,
  label_col = "state",
  cv = "kfold", k = 3,
  n_top = 100,
  trees = 300,
  fold_batch_correction = FALSE,
  batch_col = "batch",
  class_weights = cw,
  auto_confounds = FALSE,
  seed = 42
)

fit
#> GeneRankFit
#>   k=3, trees=300, importance=permutation
#>   features ranked: 146
#>   finalized model: <none>
#>   calibration: <none>

8 Step 3: downstream utilities

8.1 Top genes

top_genes(fit, n = 10)
#> $gene
#>  [1] "Gene19" "Gene24" "Gene3"  "Gene17" "Gene25" "Gene1"  "Gene10" "Gene14"
#>  [9] "Gene20" "Gene8" 
#> 
#> $table
#>      gene importance SelectedInFolds
#> 1  Gene19  3.7461757               3
#> 4  Gene24  3.0019289               3
#> 30  Gene3  2.6706210               2
#> 18 Gene17  2.3047686               3
#> 73 Gene25  2.0296908               3
#> 26  Gene1  1.9635692               3
#> 85 Gene10  1.7124708               3
#> 31 Gene14  1.4511371               3
#> 29 Gene20  1.1966736               3
#> 13  Gene8  0.8929695               3

8.2 Signed importance (directionality)

tab_signed <- sign_importance(
  fit, se_prep,
  y = SummarizedExperiment::colData(se_prep)[["state"]],
  method = "mean"
)

head(tab_signed, 10)
#>      gene importance direction signed_importance mean_case mean_ctrl mean_diff
#> 1  Gene19  3.7461757         1         3.7461757  31.20445  12.33282  18.87162
#> 2  Gene24  3.0019289         1         3.0019289  30.45064  15.37228  15.07835
#> 3   Gene3  2.6706210         1         2.6706210  24.30786  12.60644  11.70143
#> 4  Gene17  2.3047686         1         2.3047686  26.72206  14.64225  12.07981
#> 5  Gene25  2.0296908         1         2.0296908  23.90360  12.89791  11.00569
#> 6   Gene1  1.9635692         1         1.9635692  28.33095  12.32328  16.00767
#> 7  Gene10  1.7124708         1         1.7124708  23.66650  12.15267  11.51383
#> 8  Gene14  1.4511371         1         1.4511371  27.35223  14.35958  12.99264
#> 9  Gene20  1.1966736         1         1.1966736  26.16339  14.36079  11.80261
#> 10  Gene8  0.8929695         1         0.8929695  24.56881  13.15816  11.41066
#>       log2FC shap_dir
#> 1  1.3392484       NA
#> 2  0.9861409       NA
#> 3  0.9472625       NA
#> 4  0.8678937       NA
#> 5  0.8900905       NA
#> 6  1.2009927       NA
#> 7  0.9615733       NA
#> 8  0.9296444       NA
#> 9  0.8654150       NA
#> 10 0.9008705       NA

8.3 Basic plots

plot_importance(
  fit,
  top = 20,
  map_to_symbol = FALSE
)


plot_roc(fit)


plot_sign_importance(
  fit,
  tab = tab_signed,
  top = 20
)

9 Session information

sessionInfo()
#> R version 4.6.0 alpha (2026-04-05 r89794)
#> Platform: x86_64-pc-linux-gnu
#> Running under: Ubuntu 24.04.4 LTS
#> 
#> Matrix products: default
#> BLAS:   /home/biocbuild/bbs-3.23-bioc/R/lib/libRblas.so 
#> LAPACK: /usr/lib/x86_64-linux-gnu/lapack/liblapack.so.3.12.0  LAPACK version 3.12.0
#> 
#> locale:
#>  [1] LC_CTYPE=en_US.UTF-8       LC_NUMERIC=C              
#>  [3] LC_TIME=en_GB              LC_COLLATE=C              
#>  [5] LC_MONETARY=en_US.UTF-8    LC_MESSAGES=en_US.UTF-8   
#>  [7] LC_PAPER=en_US.UTF-8       LC_NAME=C                 
#>  [9] LC_ADDRESS=C               LC_TELEPHONE=C            
#> [11] LC_MEASUREMENT=en_US.UTF-8 LC_IDENTIFICATION=C       
#> 
#> time zone: America/New_York
#> tzcode source: system (glibc)
#> 
#> attached base packages:
#> [1] stats4    stats     graphics  grDevices utils     datasets  methods  
#> [8] base     
#> 
#> other attached packages:
#>  [1] SummarizedExperiment_1.41.1 Biobase_2.71.0             
#>  [3] GenomicRanges_1.63.2        Seqinfo_1.1.0              
#>  [5] IRanges_2.45.0              S4Vectors_0.49.1           
#>  [7] BiocGenerics_0.57.0         generics_0.1.4             
#>  [9] MatrixGenerics_1.23.0       matrixStats_1.5.0          
#> [11] RFGeneRank_0.99.4           BiocStyle_2.39.0           
#> 
#> loaded via a namespace (and not attached):
#>   [1] DBI_1.3.0            pROC_1.19.0.1        rlang_1.2.0         
#>   [4] magrittr_2.0.5       otel_0.2.0           compiler_4.6.0      
#>   [7] RSQLite_2.4.6        mgcv_1.9-4           reshape2_1.4.5      
#>  [10] png_0.1-9            vctrs_0.7.2          stringr_1.6.0       
#>  [13] sva_3.59.0           pkgconfig_2.0.3      crayon_1.5.3        
#>  [16] fastmap_1.2.0        magick_2.9.1         XVector_0.51.0      
#>  [19] labeling_0.4.3       rmarkdown_2.31       prodlim_2026.03.11  
#>  [22] tinytex_0.59         purrr_1.2.1          bit_4.6.0           
#>  [25] xfun_0.57            cachem_1.1.0         jsonlite_2.0.0      
#>  [28] recipes_1.3.2        blob_1.3.0           DelayedArray_0.37.1 
#>  [31] BiocParallel_1.45.0  parallel_4.6.0       R6_2.6.1            
#>  [34] stringi_1.8.7        bslib_0.10.0         RColorBrewer_1.1-3  
#>  [37] ranger_0.18.0        limma_3.67.0         genefilter_1.93.0   
#>  [40] parallelly_1.46.1    rpart_4.1.27         lubridate_1.9.5     
#>  [43] jquerylib_0.1.4      Rcpp_1.1.1           bookdown_0.46       
#>  [46] iterators_1.0.14     knitr_1.51           future.apply_1.20.2 
#>  [49] timechange_0.4.0     Matrix_1.7-5         splines_4.6.0       
#>  [52] nnet_7.3-20          tidyselect_1.2.1     dichromat_2.0-0.1   
#>  [55] abind_1.4-8          yaml_2.3.12          timeDate_4052.112   
#>  [58] codetools_0.2-20     listenv_0.10.1       lattice_0.22-9      
#>  [61] tibble_3.3.1         plyr_1.8.9           withr_3.0.2         
#>  [64] KEGGREST_1.51.1      S7_0.2.1             evaluate_1.0.5      
#>  [67] future_1.70.0        survival_3.8-6       Biostrings_2.79.5   
#>  [70] pillar_1.11.1        BiocManager_1.30.27  foreach_1.5.2       
#>  [73] ggplot2_4.0.2        scales_1.4.0         globals_0.19.1      
#>  [76] xtable_1.8-8         class_7.3-23         glue_1.8.0          
#>  [79] tools_4.6.0          data.table_1.18.2.1  ModelMetrics_1.2.2.2
#>  [82] annotate_1.89.0      gower_1.0.2          locfit_1.5-9.12     
#>  [85] XML_3.99-0.23        grid_4.6.0           ipred_0.9-15        
#>  [88] AnnotationDbi_1.73.1 edgeR_4.9.5          nlme_3.1-169        
#>  [91] cli_3.6.5            S4Arrays_1.11.1      lava_1.9.0          
#>  [94] dplyr_1.2.1          gtable_0.3.6         sass_0.4.10         
#>  [97] digest_0.6.39        caret_7.0-1          SparseArray_1.11.13 
#> [100] farver_2.1.2         memoise_2.0.1        htmltools_0.5.9     
#> [103] lifecycle_1.0.5      hardhat_1.4.3        httr_1.4.8          
#> [106] statmod_1.5.1        bit64_4.6.0-1        MASS_7.3-65