% Generated by roxygen2: do not edit by hand
% Please edit documentation in R/agglomerate.R, R/getPrevalence.R
\name{agglomerate-methods}
\alias{agglomerate-methods}
\alias{agglomerateByRank}
\alias{mergeFeaturesByRank}
\alias{agglomerateByRank,SummarizedExperiment-method}
\alias{mergeFeaturesByRank,SummarizedExperiment-method}
\alias{agglomerateByRank,SingleCellExperiment-method}
\alias{mergeFeaturesByRank,SingleCellExperiment-method}
\alias{agglomerateByRank,TreeSummarizedExperiment-method}
\alias{mergeFeaturesByRank,TreeSummarizedExperiment-method}
\alias{agglomerateByPrevalence}
\alias{agglomerateByPrevalence,SummarizedExperiment-method}
\title{Agglomerate data using taxonomic information}
\usage{
agglomerateByRank(x, ...)

mergeFeaturesByRank(x, ...)

\S4method{agglomerateByRank}{SummarizedExperiment}(
  x,
  rank = taxonomyRanks(x)[1],
  onRankOnly = FALSE,
  na.rm = FALSE,
  empty.fields = c(NA, "", " ", "\\t", "-", "_"),
  ...
)

\S4method{mergeFeaturesByRank}{SummarizedExperiment}(
  x,
  rank = taxonomyRanks(x)[1],
  onRankOnly = FALSE,
  na.rm = FALSE,
  empty.fields = c(NA, "", " ", "\\t", "-", "_"),
  ...
)

\S4method{agglomerateByRank}{SingleCellExperiment}(x, ..., altexp = NULL, strip_altexp = TRUE)

\S4method{mergeFeaturesByRank}{SingleCellExperiment}(x, ..., altexp = NULL, strip_altexp = TRUE)

\S4method{agglomerateByRank}{TreeSummarizedExperiment}(
  x,
  ...,
  agglomerate.tree = agglomerateTree,
  agglomerateTree = FALSE
)

\S4method{mergeFeaturesByRank}{TreeSummarizedExperiment}(x, ..., agglomerate.tree = FALSE)

agglomerateByPrevalence(x, ...)

\S4method{agglomerateByPrevalence}{SummarizedExperiment}(
  x,
  rank = taxonomyRanks(x)[1L],
  other_label = "Other",
  ...
)
}
\arguments{
\item{x}{a
\code{\link[SummarizedExperiment:SummarizedExperiment-class]{SummarizedExperiment}}
object}

\item{...}{arguments passed to \code{agglomerateByRank} function for
\code{SummarizedExperiment} objects,
to \code{getPrevalence} and \code{getPrevalentTaxa} and used in
\code{agglomerateByPrevalence},
to \code{\link[=merge-methods]{mergeRows}} and
\code{\link[scuttle:sumCountsAcrossFeatures]{sumCountsAcrossFeatures}}.
\itemize{
\item{\code{remove_empty_ranks}}{A single boolean value for selecting
whether to remove those columns of rowData that include only NAs after
agglomeration. (By default: \code{remove_empty_ranks = FALSE})}
\item{\code{make_unique}}{A single boolean value for selecting
whether to make rownames unique. (By default: \code{make_unique = TRUE})}
\item{\code{detection}}{Detection threshold for absence/presence.
Either an absolute value compared directly to the values of \code{x}
or, if \code{as_relative = TRUE}, a relative value between 0 and 1.}
\item{\code{prevalence}}{Prevalence threshold (in 0 to 1). The
required prevalence is strictly greater by default. To include the
limit, set \code{include_lowest} to \code{TRUE}.}
\item{\code{as_relative}}{Logical scalar: Should the detection
threshold be applied on compositional (relative) abundances?
(default: \code{FALSE})}
}}

\item{rank}{a single character defining a taxonomic rank. Must be a value of
\code{taxonomyRanks()} function.}

\item{onRankOnly}{\code{TRUE} or \code{FALSE}: Should information only from
the specified rank be used or from ranks equal and above? See details.
(default: \code{onRankOnly = FALSE})}

\item{na.rm}{\code{TRUE} or \code{FALSE}: Should taxa with an empty rank be
removed? Use it with caution, since empty entries on the selected rank
will be dropped. This setting can be tweaked by defining
\code{empty.fields} to your needs. (default: \code{na.rm = FALSE})}

\item{empty.fields}{a \code{character} vector defining which values should be
regarded as empty. (Default: \code{c(NA, "", " ", "\t", "-", "_")}). They
will be removed if \code{na.rm = TRUE} before agglomeration.}

\item{altexp}{String or integer scalar specifying an alternative experiment
containing the input data.}

\item{strip_altexp}{\code{TRUE} or \code{FALSE}: Should alternative
experiments be removed prior to agglomeration? By default, this prevents
too many nested alternative experiments. (default:
\code{strip_altexp = TRUE})}

\item{agglomerate.tree}{\code{TRUE} or \code{FALSE}: should
\code{rowTree()} also be agglomerated? (Default:
\code{agglomerate.tree = FALSE})}

\item{agglomerateTree}{alias for \code{agglomerate.tree}.}

\item{other_label}{A single \code{character} value used as the label for the
summary of non-prevalent taxa. (default: \code{other_label = "Other"})}
}
\value{
\code{agglomerateByRank} returns a taxonomically-agglomerated,
optionally-pruned object of the same class as \code{x}.

\code{agglomerateByPrevalence} returns a taxonomically-agglomerated object
of the same class as \code{x} and based on prevalent taxonomic results.
}
\description{
Agglomeration functions can be used to sum up data based on specific criteria
such as taxonomic ranks, variables or prevalence.
}
\details{
Depending on the available taxonomic data and its structure, setting
\code{onRankOnly = TRUE} has certain implications on the interpretability of
your results. If no loops exist (loops meaning two higher ranks containing
the same lower rank), the results should be comparable. You can check for
loops using \code{\link[TreeSummarizedExperiment:detectLoop]{detectLoop}}.

Agglomeration sums up the values of assays at the specified taxonomic level. With
certain assays, e.g. those that include binary or negative values, this summing
can produce meaningless values. In those cases, consider performing agglomeration
first, and then applying the transformation afterwards.

\code{agglomerateByPrevalence} sums up the values of assays at the taxonomic
level specified by \code{rank} (by default the highest taxonomic level
available) and selects the summed results that exceed the given population
prevalence at the given detection level. The other summed values (below the
threshold) are agglomerated in an additional row taking the name indicated by
\code{other_label} (by default "Other").
}
\examples{
data(GlobalPatterns)
# print the available taxonomic ranks
colnames(rowData(GlobalPatterns))
taxonomyRanks(GlobalPatterns)

# agglomerate at the Family taxonomic rank
x1 <- agglomerateByRank(GlobalPatterns, rank="Family")
## How many taxa before/after agglomeration?
nrow(GlobalPatterns)
nrow(x1)

# agglomerate the tree as well
x2 <- agglomerateByRank(GlobalPatterns, rank="Family",
                       agglomerate.tree = TRUE)
nrow(x2) # same number of rows, but
rowTree(x1) # ... different
rowTree(x2) # ... tree

 # If the assay contains binary or negative values, summing might lead to
 # meaningless values, and you will get a warning. In these cases, you might
 # want to agglomerate at the chosen taxonomic level and then apply the
 # transformation again.
 tse <- transformAssay(GlobalPatterns, method = "pa")
 tse <- agglomerateByRank(tse, rank = "Genus")
 tse <- transformAssay(tse, method = "pa")

# removing empty labels by setting na.rm = TRUE
sum(is.na(rowData(GlobalPatterns)$Family))
x3 <- agglomerateByRank(GlobalPatterns, rank="Family", na.rm = TRUE)
nrow(x3) # different from x2

# Because all the rownames are from the same rank, rownames do not include 
# prefixes, in this case "Family:". 
print(rownames(x3[1:3,]))

# To add them, use getTaxonomyLabels function.
rownames(x3) <- getTaxonomyLabels(x3, with_rank = TRUE)
print(rownames(x3[1:3,]))

# use 'remove_empty_ranks' to remove columns that include only NAs
x4 <- agglomerateByRank(GlobalPatterns, rank="Phylum", remove_empty_ranks = TRUE)
head(rowData(x4))

# If the assay contains NAs, you might want to consider replacing them,
# since summing up NAs leads to NA
x5 <- GlobalPatterns
# Replace first value with NA
assay(x5)[1,1] <- NA
x6 <- agglomerateByRank(x5, "Kingdom")
head( assay(x6) )
# Replace NAs with 0. This is justified when we are summing-up counts.
assay(x5)[ is.na(assay(x5)) ] <- 0
x6 <- agglomerateByRank(x5, "Kingdom")
head( assay(x6) )

## Look at enterotype dataset...
data(enterotype)
## Print the available taxonomic ranks. Shows only 1 available rank,
## not useful for agglomerateByRank
taxonomyRanks(enterotype)
## Data can be aggregated based on prevalent taxonomic results
tse <- GlobalPatterns
tse <- agglomerateByPrevalence(tse,
                              rank = "Phylum",
                              detection = 1/100,
                              prevalence = 50/100,
                              as_relative = TRUE)

tse

# Here data is aggregated at the taxonomic level "Phylum". The five phyla
# that exceed the population prevalence threshold of 50/100 represent the 
# five first rows of the assay in the aggregated data. The sixth and last row
# named by default "Other" takes the summed up values of all the other phyla 
# that are below the prevalence threshold.

assay(tse)[,1:5]

}
\seealso{
\code{\link[=merge-methods]{mergeRows}},
\code{\link[scuttle:sumCountsAcrossFeatures]{sumCountsAcrossFeatures}}
}
