iscream compatible data structures

The examples here show how iscream output can be converted into other data structures for further analysis.

library(iscream)

## iscream using 1 thread by default but parallelly::availableCores() detects 4 possibly available threads. See `?set_threads` for information on multithreading before trying to use more.
## 'tabix' executable not found in $PATH. tabix() will use htslib to make queries instead which can be slower. See ?tabix for details.

data_dir <- system.file("extdata", package = "iscream")
bedfiles <- list.files(data_dir, pattern = "[a|b|c|d].bed.gz$", full.names = TRUE)
regions <- c(A = "chr1:1-6", B = "chr1:7-10", C = "chr1:11-14")

0.1 GenomicRanges

if (!require("GenomicRanges", quietly = TRUE)) {
  stop("The 'GenomicRanges' package must be installed for this functionality")
}

GRanges objects can be used as the input regions to all of iscream’s functions and can be returned by tabix_gr() and make_mat_gr().

0.1.1 From `tabix` queries

tabix_gr() returns a GenomicRanges object. The regions parameter can be a string vector, a data frame or a GRanges object. If given a GRanges object with metadata, those columns will be preserved in the output.

gr <- GRanges(regions)
values(gr) <- DataFrame(
    gene = c("gene1", "gene2", "gene3"),
    some_metadata = c("s1", "s2", "s3")
)
gr

## GRanges object with 3 ranges and 2 metadata columns:
##     seqnames    ranges strand |        gene some_metadata
##        <Rle> <IRanges>  <Rle> | <character>   <character>
##   A     chr1       1-6      * |       gene1            s1
##   B     chr1      7-10      * |       gene2            s2
##   C     chr1     11-14      * |       gene3            s3
##   -------
##   seqinfo: 1 sequence from an unspecified genome; no seqlengths

tabix_gr(bedfiles[1], gr)

## GRanges object with 7 ranges and 4 metadata columns:
##       seqnames    ranges strand |        V1        V2        gene some_metadata
##          <Rle> <IRanges>  <Rle> | <numeric> <integer> <character>   <character>
##   [1]     chr1       1-2      * |       1.0         1       gene1            s1
##   [2]     chr1       3-4      * |       1.0         1       gene1            s1
##   [3]     chr1       5-6      * |       0.0         2       gene1            s1
##   [4]     chr1       7-8      * |       0.0         1       gene2            s2
##   [5]     chr1      9-10      * |       0.5         2       gene2            s2
##   [6]     chr1     11-12      * |       1.0         2       gene3            s3
##   [7]     chr1     13-14      * |       1.0         3       gene3            s3
##   -------
##   seqinfo: 1 sequence from an unspecified genome; no seqlengths

The data.table output of tabix() can also be piped into GRanges, but does not preserve input metadata.

tabix(bedfiles[1], gr) |>
  makeGRangesFromDataFrame(
    starts.in.df.are.0based = TRUE,
    keep.extra.columns = TRUE
  )

## GRanges object with 7 ranges and 2 metadata columns:
##       seqnames    ranges strand |        V1        V2
##          <Rle> <IRanges>  <Rle> | <numeric> <integer>
##   [1]     chr1       1-2      * |       1.0         1
##   [2]     chr1       3-4      * |       1.0         1
##   [3]     chr1       5-6      * |       0.0         2
##   [4]     chr1       7-8      * |       0.0         1
##   [5]     chr1      9-10      * |       0.5         2
##   [6]     chr1     11-12      * |       1.0         2
##   [7]     chr1     13-14      * |       1.0         3
##   -------
##   seqinfo: 1 sequence from an unspecified genome; no seqlengths

If the input BED file is not zero-based (e.g. Bismark coverage files), set zero_based = FALSE in the tabix() call to get the correct conversion from data frame to GenomicRanges.

0.1.2 From `summarize_regions()`

summarize_regions() returns a data frame with the input regions positions and summaries of the BED-file data columns.

(summary <- summarize_regions(
  bedfiles,
  regions,
  column = 4,
  col_names = ("data_col"),
  fun = c("sum", "mean"))
)

## [18:19:10.482745] [iscream::summarize_regions] [info] Summarizing 3 regions from 4 bedfiles
## [18:19:10.482790] [iscream::summarize_regions] [info] using sum, mean
## [18:19:10.482796] [iscream::summarize_regions] [info] with columns 4 as data_col

##        chr start   end   file feature data_col.sum data_col.mean
##     <char> <int> <int> <char>  <char>        <num>         <num>
##  1:   chr1     1     6      a       A          2.0     0.6666667
##  2:   chr1     7    10      a       B          0.5     0.2500000
##  3:   chr1    11    14      a       C          2.0     1.0000000
##  4:   chr1     1     6      b       A          1.0     0.5000000
##  5:   chr1     7    10      b       B          1.0     1.0000000
##  6:   chr1    11    14      b       C          1.0     0.5000000
##  7:   chr1     1     6      c       A          1.0     1.0000000
##  8:   chr1     7    10      c       B          1.0     0.5000000
##  9:   chr1    11    14      c       C           NA            NA
## 10:   chr1     1     6      d       A          2.0     1.0000000
## 11:   chr1     7    10      d       B          0.5     0.2500000
## 12:   chr1    11    14      d       C          1.0     1.0000000

GRanges(summary)

## GRanges object with 12 ranges and 4 metadata columns:
##        seqnames    ranges strand |        file     feature data_col.sum
##           <Rle> <IRanges>  <Rle> | <character> <character>    <numeric>
##    [1]     chr1       1-6      * |           a           A          2.0
##    [2]     chr1      7-10      * |           a           B          0.5
##    [3]     chr1     11-14      * |           a           C          2.0
##    [4]     chr1       1-6      * |           b           A          1.0
##    [5]     chr1      7-10      * |           b           B          1.0
##    ...      ...       ...    ... .         ...         ...          ...
##    [8]     chr1      7-10      * |           c           B          1.0
##    [9]     chr1     11-14      * |           c           C           NA
##   [10]     chr1       1-6      * |           d           A          2.0
##   [11]     chr1      7-10      * |           d           B          0.5
##   [12]     chr1     11-14      * |           d           C          1.0
##        data_col.mean
##            <numeric>
##    [1]      0.666667
##    [2]      0.250000
##    [3]      1.000000
##    [4]      0.500000
##    [5]      1.000000
##    ...           ...
##    [8]          0.50
##    [9]            NA
##   [10]          1.00
##   [11]          0.25
##   [12]          1.00
##   -------
##   seqinfo: 1 sequence from an unspecified genome; no seqlengths

0.1.3 From `make_mat`

make_mat_gr() returns a GRanges object for dense matrices.

make_mat_gr(bedfiles, regions, column = 4, mat_name = "beta")

## [18:19:10.539632] [iscream::query_all] [info] Querying 3 regions from 4 bedfiles
## 
## [18:19:10.539972] [iscream::query_all] [info] Creating metadata vectors
## [18:19:10.540026] [iscream::query_all] [info] 7 loci found - 9993 extra rows allocated with 0 resizes
## [18:19:10.540029] [iscream::query_all] [info] Creating dense matrix

## GRanges object with 7 ranges and 4 metadata columns:
##       seqnames    ranges strand |         a         b         c         d
##          <Rle> <IRanges>  <Rle> | <numeric> <numeric> <numeric> <numeric>
##   [1]     chr1         0      * |       1.0         0         0       1.0
##   [2]     chr1         2      * |       1.0         0         1       1.0
##   [3]     chr1         4      * |       0.0         1         0       0.0
##   [4]     chr1         6      * |       0.0         1         0       0.0
##   [5]     chr1         8      * |       0.5         0         1       0.5
##   [6]     chr1        10      * |       1.0         0         0       0.0
##   [7]     chr1        12      * |       1.0         1         0       1.0
##   -------
##   seqinfo: 1 sequence from an unspecified genome; no seqlengths

0.2 SummarizedExperiment

make_mat_se() returns a RangedSummarizedExperiment for both sparse and dense matrices.

if (!require("SummarizedExperiment", quietly = TRUE)) {
  stop("The 'SummarizedExperiment' package must be installed for this functionality")
}
make_mat_se(bedfiles, regions, column = 4, mat_name = "beta", sparse = TRUE)

## [18:19:13.730464] [iscream::query_all] [info] Querying 3 regions from 4 bedfiles
## 
## [18:19:13.730804] [iscream::query_all] [info] Creating metadata vectors
## [18:19:13.730832] [iscream::query_all] [info] 7 loci found - 9993 extra rows allocated with 0 resizes
## [18:19:13.730840] [iscream::query_all] [info] Creating sparse matrix

## class: RangedSummarizedExperiment 
## dim: 7 4 
## metadata(0):
## assays(1): beta
## rownames: NULL
## rowData names(0):
## colnames(4): a b c d
## colData names(0):

0.2.1 Making `BSseq` objects

A bsseq object is a type of SummarizedExperiment, but it cannot handle sparse matrices:

if (!require("bsseq", quietly = TRUE)) {
  stop("The 'bsseq' package must be installed for this functionality")
}
mats <- make_mat_bsseq(bedfiles, regions, sparse = FALSE)

## [18:19:19.065634] [iscream::query_all] [info] Querying 3 regions from 4 bedfiles
## 
## [18:19:19.065993] [iscream::query_all] [info] Creating metadata vectors
## [18:19:19.066020] [iscream::query_all] [info] 7 loci found - 9993 extra rows allocated with 0 resizes
## [18:19:19.066023] [iscream::query_all] [info] Creating dense matrix

do.call(BSseq, mats)

## An object of type 'BSseq' with
##   7 methylation loci
##   4 samples
## has not been smoothed
## All assays are in-memory

0.3 Session info

sessionInfo()

## R Under development (unstable) (2026-03-05 r89546)
## Platform: x86_64-pc-linux-gnu
## Running under: Ubuntu 24.04.4 LTS
## 
## Matrix products: default
## BLAS:   /home/biocbuild/bbs-3.23-bioc/R/lib/libRblas.so 
## LAPACK: /usr/lib/x86_64-linux-gnu/lapack/liblapack.so.3.12.0  LAPACK version 3.12.0
## 
## locale:
##  [1] LC_CTYPE=en_US.UTF-8       LC_NUMERIC=C              
##  [3] LC_TIME=en_GB              LC_COLLATE=C              
##  [5] LC_MONETARY=en_US.UTF-8    LC_MESSAGES=en_US.UTF-8   
##  [7] LC_PAPER=en_US.UTF-8       LC_NAME=C                 
##  [9] LC_ADDRESS=C               LC_TELEPHONE=C            
## [11] LC_MEASUREMENT=en_US.UTF-8 LC_IDENTIFICATION=C       
## 
## time zone: America/New_York
## tzcode source: system (glibc)
## 
## attached base packages:
## [1] stats4    stats     graphics  grDevices utils     datasets  methods  
## [8] base     
## 
## other attached packages:
##  [1] bsseq_1.47.1                SummarizedExperiment_1.41.1
##  [3] Biobase_2.71.0              MatrixGenerics_1.23.0      
##  [5] matrixStats_1.5.0           GenomicRanges_1.63.1       
##  [7] Seqinfo_1.1.0               IRanges_2.45.0             
##  [9] S4Vectors_0.49.0            BiocGenerics_0.57.0        
## [11] generics_0.1.4              iscream_1.1.6              
## [13] BiocStyle_2.39.0           
## 
## loaded via a namespace (and not attached):
##  [1] farver_2.1.2              R.utils_2.13.0           
##  [3] Biostrings_2.79.5         bitops_1.0-9             
##  [5] fastmap_1.2.0             RCurl_1.98-1.17          
##  [7] GenomicAlignments_1.47.0  stringfish_0.18.0        
##  [9] XML_3.99-0.22             digest_0.6.39            
## [11] lifecycle_1.0.5           statmod_1.5.1            
## [13] compiler_4.6.0            rlang_1.1.7              
## [15] sass_0.4.10               tools_4.6.0              
## [17] yaml_2.3.12               data.table_1.18.2.1      
## [19] rtracklayer_1.71.3        knitr_1.51               
## [21] S4Arrays_1.11.1           curl_7.0.0               
## [23] DelayedArray_0.37.0       RColorBrewer_1.1-3       
## [25] abind_1.4-8               BiocParallel_1.45.0      
## [27] HDF5Array_1.39.0          R.oo_1.27.1              
## [29] grid_4.6.0                beachmat_2.27.3          
## [31] Rhdf5lib_1.33.0           scales_1.4.0             
## [33] gtools_3.9.5              dichromat_2.0-0.1        
## [35] cli_3.6.5                 rmarkdown_2.30           
## [37] crayon_1.5.3              otel_0.2.0               
## [39] RcppParallel_5.1.11-2     httr_1.4.8               
## [41] rjson_0.2.23              DelayedMatrixStats_1.33.0
## [43] pbapply_1.7-4             cachem_1.1.0             
## [45] rhdf5_2.55.13             parallel_4.6.0           
## [47] BiocManager_1.30.27       XVector_0.51.0           
## [49] restfulr_0.0.16           Matrix_1.7-4             
## [51] jsonlite_2.0.0            bookdown_0.46            
## [53] h5mread_1.3.2             locfit_1.5-9.12          
## [55] limma_3.67.0              jquerylib_0.1.4          
## [57] glue_1.8.0                parallelly_1.46.1        
## [59] codetools_0.2-20          BiocIO_1.21.0            
## [61] htmltools_0.5.9           rhdf5filters_1.23.3      
## [63] BSgenome_1.79.1           R6_2.6.1                 
## [65] sparseMatrixStats_1.23.0  evaluate_1.0.5           
## [67] lattice_0.22-9            R.methodsS3_1.8.2        
## [69] Rsamtools_2.27.1          cigarillo_1.1.0          
## [71] bslib_0.10.0              Rcpp_1.1.1               
## [73] SparseArray_1.11.11       permute_0.9-10           
## [75] xfun_0.56

iscream compatible data structures

8 March 2026

0.1 GenomicRanges

0.1.1 From tabix queries

0.1.2 From summarize_regions()

0.1.3 From make_mat