## ----setup, include=FALSE-----------------------------------------------------
knitr::opts_chunk$set(collapse = TRUE, comment = "#>")

## ----install, eval=FALSE------------------------------------------------------
# if (!requireNamespace("BiocManager", quietly=TRUE))
#     install.packages("BiocManager")
# BiocManager::install("pgen2gds")

## ----library------------------------------------------------------------------
library(SeqArray)
library(pgen2gds)

## ----example-files------------------------------------------------------------
pgen_fn <- system.file("extdata", "plink2_gen.pgen", package = "pgen2gds")
pvar_fn <- system.file("extdata", "plink2_gen.pvar", package = "pgen2gds")
psam_fn <- system.file("extdata", "plink2_gen.psam", package = "pgen2gds")

pgen_fn

## ----read-pvar----------------------------------------------------------------
pvar <- seqReadPVAR(pvar_fn)
head(pvar)
dim(pvar)

## ----read-pvar-subset---------------------------------------------------------
# Select the first 5 variants
head(seqReadPVAR(pvar_fn, sel = 1:5))

## ----convert------------------------------------------------------------------
gds_fn <- tempfile(fileext = ".gds")

seqPGEN2GDS(pgen_fn, out.gdsfn=gds_fn)

## ----open-gds-----------------------------------------------------------------
gds <- seqOpen(gds_fn)
gds

## ----basic-info---------------------------------------------------------------
# Sample and variant counts
cat("Samples: ", length(seqGetData(gds, "sample.id")), "\n")
cat("Variants:", length(seqGetData(gds, "variant.id")), "\n")

## ----chrom-table--------------------------------------------------------------
# Chromosome distribution
table(seqGetData(gds, "chromosome"))

## ----genotype-----------------------------------------------------------------
# Read genotypes for the first 5 variants
seqSetFilter(gds, variant.sel = 1:5)
geno <- seqGetData(gds, "genotype")
dim(geno)  # ploidy x samples x variants
geno[, 1:6, ]

## ----close-gds----------------------------------------------------------------
# close the file 
seqClose(gds)

## ----variant-sel--------------------------------------------------------------
gds_sub <- tempfile(fileext = ".gds")

# Convert only variants 10 through 20
seqPGEN2GDS(pgen_fn, out.gdsfn=gds_sub, variant.sel=10:20, verbose=FALSE)

f <- seqOpen(gds_sub)
cat("Variants:", length(seqGetData(f, "variant.id")), "\n")
seqClose(f)

unlink(gds_sub, force = TRUE)

## ----start-count--------------------------------------------------------------
gds_range <- tempfile(fileext = ".gds")

seqPGEN2GDS(pgen_fn, out.gdsfn = gds_range, start=100, count=50, verbose=FALSE)

f <- seqOpen(gds_range)
vid <- seqGetData(f, "variant.id")
cat("Variant IDs:", head(vid), "...", tail(vid), "\n")
cat("Total:", length(vid), "variants\n")
seqClose(f)

unlink(gds_range, force = TRUE)

## ----bioc-integration---------------------------------------------------------
suppressPackageStartupMessages(library(GenomicRanges))

# Re-open the converted GDS file
gds <- seqOpen(gds_fn)

# Compute reference allele frequency per variant
af <- seqAlleleFreq(gds)
summary(af)

## ----bioc-granges-------------------------------------------------------------
# Build a GRanges object from variant positions
gr <- GRanges(
    seqnames = seqGetData(gds, "chromosome"),
    ranges = IRanges(start=seqGetData(gds, "position"), width=1L),
    ref.freq = af
)
gr

## ----bioc-missing-------------------------------------------------------------
# Per-variant missing rate
mr <- seqMissing(gds)
summary(mr)

## ----bioc-close---------------------------------------------------------------
seqClose(gds)

## ----cleanup------------------------------------------------------------------
# Remove the generated GDS file
unlink(gds_fn, force = TRUE)

## ----session-info-------------------------------------------------------------
sessionInfo()

