% Generated by roxygen2: do not edit by hand
% Please edit documentation in R/annotation.R
\name{getORFs}
\alias{getORFs}
\title{Extract Genomic ORFs from Transcript Sequences}
\usage{
getORFs(
  sequences,
  annotation,
  txdb_output_dir = NULL,
  organism = "Homo sapiens",
  require_ids = c("hgnc_id", "protein_id", "ccdsid"),
  source_filter = NULL,
  circ_seqs = NULL,
  start_codons = "ATG",
  stop_codons = "TAA|TAG|TGA",
  min_len = 0,
  longest_orf = TRUE,
  verbose = TRUE
)
}
\arguments{
\item{sequences}{Transcript sequences. Can be a character string (path
to a FASTA file),
a \code{DNAStringSet} object, or a \code{BSgenome} object.}

\item{annotation}{Transcript annotation. Can be a character string (path
to a GTF or GFF file) or a \code{TxDb} object.}

\item{txdb_output_dir}{Directory in which the TxDb file is saved. The
path to the TxDb file is stored in the metadata slot of the returned
\code{GRanges}. Default: \code{NULL}.}

\item{organism}{Character string specifying the organism name (used
only when building a TxDb from a GTF/GFF file). Default is
\code{"Homo sapiens"}.}

\item{require_ids}{Character vector. Metadata column names that must
be present and non-NA. Default:
\code{c("hgnc_id", "protein_id", "ccdsid")}.}

\item{source_filter}{Character or NULL. If provided, filters
transcripts by the 'source' column. Default: \code{NULL}.}

\item{circ_seqs}{Character vector of circular sequences to exclude
(e.g., \code{"chrM"}).
Default is \code{NULL}.}

\item{start_codons}{Character vector of start codons to search for
(e.g., \code{"ATG"}).
Default is \code{"ATG"}.}

\item{stop_codons}{Character string of stop codons separated by
\code{"|"} (e.g., \code{"TAA|TAG|TGA"}).
Default is \code{"TAA|TAG|TGA"}.}

\item{min_len}{Integer specifying the minimum ORF length in bases.
Default is \code{0}.}

\item{longest_orf}{Logical. If \code{TRUE}, only the longest ORF per
transcript is returned. Default is \code{TRUE}.}

\item{verbose}{Logical. If \code{TRUE}, prints progress messages and
timing information. Default is \code{TRUE}.}
}
\value{
A \code{GRanges} object containing genomic coordinates of
ORFs, with metadata columns \code{gene_id} and \code{orf_type}.
Main ORFs are labeled as \code{"mORF"}, and small ORFs are
classified as \code{"uORF"}, \code{"dORF"}, or \code{"oORF"}.
}
\description{
Identifies open reading frames (ORFs) from transcript sequences and maps
them to genomic coordinates using a GTF/GFF annotation file or a TxDb
object. Supports input sequences as a FASTA file, a \code{DNAStringSet},
or a \code{BSgenome} object. Classifies small ORFs (sORFs) as upstream
(uORF), downstream (dORF), or overlapping (oORF) relative to the main
ORFs (mORFs).
}
\details{
\itemize{
\item ORFs are identified in transcript space using \code{findORFsFasta()}.
\item Coordinates are mapped to the genome using \code{mapFromTranscripts()}
and exon annotations.
\item Main ORFs are defined by overlap with annotated CDS regions.
\item Small ORFs are classified relative to mORFs based on strand-aware
genomic position.
}
}
\examples{
\dontshow{if (requireNamespace("TxDb.Hsapiens.UCSC.hg38.knownGene", quietly = TRUE) && requireNamespace("BSgenome.Hsapiens.UCSC.hg38", quietly = TRUE) && requireNamespace("GenomicFeatures", quietly = TRUE)) withAutoprint(\{ # examplesIf}
library(BSgenome.Hsapiens.UCSC.hg38)
library(TxDb.Hsapiens.UCSC.hg38.knownGene)
library(GenomicFeatures)

# hg38 genome sequence and matching transcript annotation
hg38_genome <- BSgenome.Hsapiens.UCSC.hg38
hg38_txdb <- TxDb.Hsapiens.UCSC.hg38.knownGene

# Group exons by transcript and keep the first 100 transcripts
tx_exons <- head(exonsBy(hg38_txdb, by = "tx", use.names = TRUE), 100)

# Extract the sequences of the selected transcripts
tx_seqs <- extractTranscriptSeqs(hg38_genome, tx_exons)

# Run getORFs on the transcript sequences, using a temporary
# directory as the TxDb output directory
out_dir <- tempdir()
orfs <- getORFs(
    sequences = tx_seqs,
    annotation = hg38_txdb,
    txdb_output_dir = out_dir
)
print(orfs)

# Clean up: delete the SQLite files found in the temporary directory
unlink(list.files(out_dir, pattern = "\\\\.sqlite$", full.names = TRUE))
\dontshow{\}) # examplesIf}
}
\references{
Lawrence, M., Huber, W., Pagès, H., Aboyoun, P., Carlson, M.,
Gentleman, R., Morgan, M., Carey, V. (2013). Software for Computing
and Annotating Genomic Ranges. PLoS Computational Biology, 9(8), e1003118.
DOI: 10.1371/journal.pcbi.1003118

Tjeldnes, H., Labun, K., Torres Cleuren, Y. et al.
ORFik: a comprehensive R toolkit for the analysis of translation.
BMC Bioinformatics 22, 336 (2021). DOI: 10.1186/s12859-021-04254-w
}
