% Generated by roxygen2: do not edit by hand
% Please edit documentation in R/COTAN-estimators.R, R/COTAN-getters.R,
%   R/COTAN-modifiers.R, R/DEAOnClusters.R, R/UMAP-plot.R,
%   R/clustersMarkersHeatmapPlot.R, R/clustersSummaryPlot.R,
%   R/findClustersMarkers.R, R/geneSetEnrichment.R, R/reorderClusterization.R
\name{HandlingClusterizations}
\alias{HandlingClusterizations}
\alias{estimateNuLinearByCluster,COTAN-method}
\alias{estimateNuLinearByCluster}
\alias{getClusterizations,COTAN-method}
\alias{getClusterizations}
\alias{getClusterizationName,COTAN-method}
\alias{getClusterizationName}
\alias{getClusterizationData,COTAN-method}
\alias{getClusterizationData}
\alias{getClusters}
\alias{getClustersCoex,COTAN-method}
\alias{getClustersCoex}
\alias{addClusterization,COTAN-method}
\alias{addClusterization}
\alias{addClusterizationCoex,COTAN-method}
\alias{addClusterizationCoex}
\alias{dropClusterization,COTAN-method}
\alias{dropClusterization}
\alias{DEAOnClusters}
\alias{clusterGeneContingencyTables}
\alias{pValueFromDEA}
\alias{logFoldChangeOnClusters}
\alias{distancesBetweenClusters}
\alias{UMAPPlot}
\alias{cellsUMAPPlot}
\alias{clustersMarkersHeatmapPlot}
\alias{clustersSummaryData}
\alias{clustersSummaryPlot}
\alias{clustersTreePlot}
\alias{findClustersMarkers}
\alias{geneSetEnrichment}
\alias{reorderClusterization}
\title{Handling cells' \emph{clusterization} and related functions}
\usage{
\S4method{estimateNuLinearByCluster}{COTAN}(objCOTAN, clName = "", clusters = NULL)

\S4method{getClusterizations}{COTAN}(objCOTAN, dropNoCoex = FALSE, keepPrefix = FALSE)

\S4method{getClusterizationName}{COTAN}(objCOTAN, clName = "", keepPrefix = FALSE)

\S4method{getClusterizationData}{COTAN}(objCOTAN, clName = "")

getClusters(objCOTAN, clName = "")

\S4method{getClustersCoex}{COTAN}(objCOTAN)

\S4method{addClusterization}{COTAN}(
  objCOTAN,
  clName,
  clusters,
  coexDF = data.frame(),
  override = FALSE
)

\S4method{addClusterizationCoex}{COTAN}(objCOTAN, clName, coexDF)

\S4method{dropClusterization}{COTAN}(objCOTAN, clName)

DEAOnClusters(objCOTAN, clName = "", clusters = NULL)

clusterGeneContingencyTables(objCOTAN, gene, cells)

pValueFromDEA(coexDF, numCells, adjustmentMethod)

logFoldChangeOnClusters(
  objCOTAN,
  clName = "",
  clusters = NULL,
  floorLambdaFraction = 0.05
)

distancesBetweenClusters(
  objCOTAN,
  clName = "",
  clusters = NULL,
  coexDF = NULL,
  useDEA = TRUE,
  distance = NULL
)

UMAPPlot(
  dataIn,
  clusters = NULL,
  elements = NULL,
  title = "",
  colors = NULL,
  numNeighbors = 0L,
  minPointsDist = NaN
)

cellsUMAPPlot(
  objCOTAN,
  clName = "",
  clusters = NULL,
  useCoexEigen = FALSE,
  dataMethod = "",
  numComp = 25L,
  genesSel = "",
  numGenes = 200L,
  colors = NULL,
  numNeighbors = 0L,
  minPointsDist = NA
)

clustersMarkersHeatmapPlot(
  objCOTAN,
  groupMarkers = list(),
  clName = "",
  clusters = NULL,
  coexDF = NULL,
  kCuts = 3L,
  adjustmentMethod = "bonferroni",
  condNameList = NULL,
  conditionsList = NULL
)

clustersSummaryData(
  objCOTAN,
  clName = "",
  clusters = NULL,
  condName = "",
  conditions = NULL
)

clustersSummaryPlot(
  objCOTAN,
  clName = "",
  clusters = NULL,
  condName = "",
  conditions = NULL,
  plotTitle = ""
)

clustersTreePlot(
  objCOTAN,
  kCuts,
  clName = "",
  clusters = NULL,
  useDEA = TRUE,
  distance = NULL,
  hclustMethod = "ward.D2"
)

findClustersMarkers(
  objCOTAN,
  n = 10L,
  markers = NULL,
  clName = "",
  clusters = NULL,
  coexDF = NULL,
  adjustmentMethod = "bonferroni"
)

geneSetEnrichment(clustersCoex, groupMarkers = list())

reorderClusterization(
  objCOTAN,
  clName = "",
  clusters = NULL,
  coexDF = NULL,
  reverse = FALSE,
  keepMinusOne = TRUE,
  useDEA = TRUE,
  distance = NULL,
  hclustMethod = "ward.D2"
)
}
\arguments{
\item{objCOTAN}{a \code{COTAN} object}

\item{clName}{The name of the \emph{clusterization}. If not given the last
available \emph{clusterization} will be used, as it is probably the most
significant!}

\item{clusters}{A \emph{clusterization} to use. If given it will take precedence
over the one indicated by \code{clName}}

\item{dropNoCoex}{When \code{TRUE} drops the names of the \emph{clusterizations}
with an empty associated \code{COEX} \code{data.frame}}

\item{keepPrefix}{When \code{TRUE} returns the internal name of the
\emph{clusterization}: the one with the \code{CL_} prefix.}

\item{coexDF}{a \code{data.frame} where each column indicates the \code{COEX} for each
of the \emph{clusters} of the \emph{clusterization}}

\item{override}{When \code{TRUE} silently allows overriding data for an existing
\emph{clusterization} name. Otherwise the default behavior will avoid potential
data losses}

\item{gene}{a gene}

\item{cells}{a sub-set of the cells}

\item{numCells}{the number of overall cells in all \emph{clusters}}

\item{adjustmentMethod}{\emph{p-value} multi-test adjustment method, see
\code{\link[stats:p.adjust]{stats::p.adjust.methods()}}. Defaults to \code{"bonferroni"}; use \code{"none"} for
no adjustment}

\item{floorLambdaFraction}{Indicates the lower bound to the average count
sums inside or outside the cluster for each gene as fraction of the
relevant \code{lambda} parameter. Default is \eqn{5\%}}

\item{useDEA}{Boolean indicating whether to use the \emph{DEA} to define the
distance; alternatively it will use the average \emph{Zero-One} counts, that is
faster but less precise.}

\item{distance}{type of distance to use. Default is \code{"cosine"} for \emph{DEA} and
\code{"euclidean"} for \emph{Zero-One}. Can be chosen among those supported by
\code{\link[parallelDist:parDist]{parallelDist::parDist()}}}

\item{dataIn}{The \code{matrix} to plot. It must have row names containing the
given elements (the columns are features)}

\item{elements}{a named \code{list} of elements to label. Each array in the list
will be shown with a different color}

\item{title}{a string giving the plot title. Will default to \code{UMAP} Plot if
not specified}

\item{colors}{an \code{array} of colors to use in the plot. If not sufficient
colors are given it will complete the list using colors from
\code{\link[=getColorsVector]{getColorsVector()}}}

\item{numNeighbors}{Overrides the default \code{n_neighbors} value}

\item{minPointsDist}{Overrides the default \code{min_dist} value}

\item{useCoexEigen}{Boolean to determine whether to project the data \code{matrix}
onto the first eigenvectors of the \strong{COEX} \code{matrix} or instead restrict
the data \code{matrix} to the selected genes before applying the \code{PCA} reduction}

\item{dataMethod}{selects the method to use to create the \code{data.frame} to
pass to the \code{\link[=UMAPPlot]{UMAPPlot()}}. See \code{\link[=getDataMatrix]{getDataMatrix()}} for more details.}

\item{numComp}{Number of components of the reduced \code{matrix}, it defaults to
25L.}

\item{genesSel}{Decides whether and how to perform gene-selection. See
\code{\link[=getSelectedGenes]{getSelectedGenes()}} for more details.}

\item{numGenes}{the number of genes to select using the above method. Will be
ignored when an explicit list of genes has been passed in}

\item{groupMarkers}{an optional named \code{list} with an element for each group
comprised of one or more marker genes}

\item{kCuts}{the number of estimated \emph{clusters} (this defines the height for
the tree cut)}

\item{condNameList}{a \code{list} of \emph{conditions}' names to be used for additional
columns in the final plot; these columns use data extracted via the function
\code{\link[=clustersSummaryData]{clustersSummaryData()}}. When none are given no new columns will be added}

\item{conditionsList}{a \code{list} of \emph{conditions} to use. If given they will
take precedence over the ones indicated by \code{condNameList}}

\item{condName}{The name of a condition in the \code{COTAN} object to further
separate the cells in more sub-groups. When no condition is given it is
assumed to be the same for all cells (no further sub-divisions)}

\item{conditions}{The \emph{conditions} to use. If given it will take precedence
over the one indicated by \code{condName} that will only indicate the relevant
column name in the returned \code{data.frame}}

\item{plotTitle}{The title to use for the returned plot}

\item{hclustMethod}{The default is \code{"ward.D2"}, but it can be any of the methods
defined by the \code{\link[stats:hclust]{stats::hclust()}} function.}

\item{n}{the number of extreme \code{COEX} values to return}

\item{markers}{a \code{list} of marker genes}

\item{clustersCoex}{the \code{COEX} \code{data.frame}}

\item{reverse}{a flag to reverse the output order}

\item{keepMinusOne}{a flag to decide whether to keep the cluster \code{"-1"}
(representing the non-clustered cells) untouched}
}
\value{
\code{estimateNuLinearByCluster()} returns the updated \code{COTAN} object

\code{getClusterizations()} returns a vector of \emph{clusterization} names,
usually without the \code{CL_} prefix

\code{getClusterizationName()} returns the normalized \emph{clusterization}
name or \code{NULL} if no \emph{clusterizations} are present

\code{getClusterizationData()} returns a \code{list} with 2 elements:
\itemize{
\item \code{"clusters"} the named cluster labels array
\item \code{"coex"} the associated \code{COEX} \code{data.frame}. This will be an \strong{empty}
\code{data.frame} when not specified for the relevant \emph{clusterization}
}

\code{getClusters()} returns the named cluster labels array

\code{getClustersCoex()} returns the list with a \code{COEX} \code{data.frame} for
each \emph{clusterization}. When not empty, each \code{data.frame} contains a \code{COEX}
column for each \emph{cluster}.

\code{addClusterization()} returns the updated \code{COTAN} object

\code{addClusterizationCoex()} returns the updated \code{COTAN} object

\code{dropClusterization()} returns the updated \code{COTAN} object

\code{DEAOnClusters()} returns the co-expression \code{data.frame} for the
genes in each \emph{cluster}

\code{clusterGeneContingencyTables()} returns a list containing the
observed and expected contingency tables

\code{pValueFromDEA()} returns a \code{data.frame} containing the \emph{p-values}
corresponding to the given \code{COEX} adjusted for \emph{multi-test}

\code{logFoldChangeOnClusters()} returns the log-expression-change
\code{data.frame} for the genes in each \emph{cluster}

\code{distancesBetweenClusters()} returns a \code{dist} object

\code{UMAPPlot()} returns a \code{ggplot2} object

\code{cellsUMAPPlot()} returns a list with 2 objects:
\itemize{
\item \code{"plot"} a \code{ggplot2} object representing the \code{umap} plot
\item \code{"cellsRDM"} the \emph{Reduced Data Matrix} used to create the plot
}

\code{clustersMarkersHeatmapPlot()} returns a list with:
\itemize{
\item \code{"heatmapPlot"} the complete heatmap plot
\item \code{"dataScore"} the \code{data.frame} with the score values
\item \code{"pValueDF"}  the \code{data.frame} with the corresponding adjusted
\eqn{p-}values
}

\code{clustersSummaryData()} returns a \code{data.frame} with the following
statistics:
\itemize{
\item \code{"clName"} the \emph{cluster} \strong{labels}
\item \code{"condName"} the relevant condition (that sub-divides the \emph{clusters})
\item \code{"CellNumber"} the number of cells in the group
\item \code{"MeanUDE"} the average \code{UDE} in the group of cells
\item \code{"MedianUDE"} the median \code{UDE} in the group of cells
\item \code{"ExpGenes25"} the number of genes expressed in at least 25\% of the
cells in the group
\item \code{"ExpGenes"} the number of genes expressed at least once in any of
the cells in the group
\item \code{"CellPercentage"} fraction of the cells with respect to the total cells
}

\code{clustersSummaryPlot()} returns a \code{list} with a \code{data.frame} and a
\code{ggplot} object
\itemize{
\item \code{"data"} contains the data,
\item \code{"plot"} is the returned plot
}

\code{clustersTreePlot()} returns a list with 2 objects:
\itemize{
\item \code{"dend"} a \code{ggplot2} object representing the \code{dendrogram} plot
\item \code{"objCOTAN"} the updated \code{COTAN} object
}

\code{findClustersMarkers()} returns a \code{data.frame} containing \code{n} genes
for each \emph{cluster} scoring top/bottom \code{COEX} scores. The \code{data.frame} also
contains:
\itemize{
\item \code{"CL"} the cluster
\item \code{"Gene"} the gene
\item \code{"Score"} the \code{COEX} score of the gene
\item \code{"adjPVal"} the \emph{p-values} associated to the \code{COEX}
adjusted for \emph{multi-testing}
\item \code{"DEA"} the differential expression of the gene
\item \code{"IsMarker"} whether the gene is among the given markers
\item \code{"logFoldCh"} the \emph{log-fold-change} of the gene expression inside versus
outside the cluster from \code{\link[=logFoldChangeOnClusters]{logFoldChangeOnClusters()}}
}

\code{geneSetEnrichment()} returns a \code{data.frame} with the cumulative
score

\code{reorderClusterization()} returns a \code{list} with 3 elements:
\itemize{
\item \code{"clusters"} the newly reordered cluster labels array
\item \code{"coex"} the associated \code{COEX} \code{data.frame}
\item \code{"permMap"} the reordering mapping
}
}
\description{
These functions manage the \emph{clusterizations} and their
associated \emph{cluster} \code{COEX} \code{data.frame}s.

A \emph{clusterization} is any partition of the cells where each cell is
assigned a \strong{label}; a group of cells with the same label is called a
\emph{cluster}.

For each \emph{cluster} it is also possible to define a \code{COEX} value for each gene,
indicating its increased or decreased expression in the \emph{cluster} compared
to the whole background. A \code{data.frame} with these values listed in a
column for each \emph{cluster} is stored separately for each \emph{clusterization} in
the \code{clustersCoex} member.

The formulae for this \emph{In/Out} \code{COEX} are similar to those used in the
\code{\link[=calculateCoex]{calculateCoex()}} method, with the \strong{role} of the second gene taken by the
\emph{In/Out} status of the cells with respect to each \emph{cluster}.
}
\details{
\code{estimateNuLinearByCluster()} does a linear estimation of \code{nu}:
cells' counts averages normalized \emph{cluster} by \emph{cluster}

\code{getClusterizations()} extracts the list of the \emph{clusterizations}
defined in the \code{COTAN} object.

\code{getClusterizationName()} normalizes the given \emph{clusterization} name
or, if none were given, returns the name of the last available \emph{clusterization}
in the \code{COTAN} object. It can return the \emph{clusterization} \strong{internal name}
if needed

\code{getClusterizationData()} extracts the asked \emph{clusterization} and
its associated \code{COEX} \code{data.frame} from the \code{COTAN} object

\code{getClusters()} extracts the asked \emph{clusterization} from the \code{COTAN}
object

\code{getClustersCoex()} extracts the full \code{clustersCoex} member \code{list}

\code{addClusterization()} adds a \emph{clusterization} to the current \code{COTAN}
object, by adding a new column in the \code{metaCells} \code{data.frame} and adding a
new element in the \code{clustersCoex} \code{list} using the passed in \code{COEX}
\code{data.frame} or an empty \code{data.frame} if none were passed in.

\code{addClusterizationCoex()} adds a \emph{clusterization} \code{COEX}
\code{data.frame} to the current \code{COTAN} object. It requires the named
\emph{clusterization} to be already present.

\code{dropClusterization()} drops a \emph{clusterization} from the current
\code{COTAN} object, by removing the corresponding column in the \code{metaCells}
\code{data.frame} and the corresponding \code{COEX} \code{data.frame} from the
\code{clustersCoex} \code{list}.

\code{DEAOnClusters()} is used to run the Differential Expression
analysis using the \code{COTAN} contingency tables on each \emph{cluster} in the
given \emph{clusterization}

\code{clusterGeneContingencyTables()} returns the observed and expected
contingency tables for a given gene and a given set of cells (a cluster).
The implementation runs the same algorithms used to calculate the full
observed/expected contingency tables used for DEA, but restricted to only
the relevant gene and cluster, thus much faster and less memory intensive

\code{pValueFromDEA()} is used to convert to \emph{p-value} the Differential
Expression analysis using the \code{COTAN} contingency tables on each \emph{cluster}
in the given \emph{clusterization}

\code{logFoldChangeOnClusters()} is used to get the log difference of the
expression levels for each \emph{cluster} in the given \emph{clusterization} against
the rest of the data-set

\code{distancesBetweenClusters()} is used to obtain a distance between
the clusters. Depending on the value of the \code{useDEA} flag, it bases the
distance on the \emph{DEA} columns or the averages of the \emph{Zero-One} matrix.

\code{UMAPPlot()} plots the given \code{data.frame} containing genes
information related to clusters after applying the \code{umap} transformation
via \code{\link[Seurat:RunUMAP]{Seurat::RunUMAP()}}

\code{cellsUMAPPlot()} returns a \code{ggplot2} plot where the given
\emph{clusters} are placed on the basis of their relative distance. Also, if
needed, calculates and stores the \code{DEA} of the relevant \emph{clusterization}.

\code{clustersMarkersHeatmapPlot()} returns the heatmap plot of a summary
score for each \emph{cluster} and each gene marker in the given
\emph{clusterization}. It also returns the size and percentage of each
\emph{cluster} on the right and a \emph{clusterization} \code{dendrogram} on the left, as
returned by the function \code{\link[=clustersTreePlot]{clustersTreePlot()}}. The heatmap cells' colors
express the \strong{DEA}, that is whether a gene is enriched or depleted in the
cluster, while the stars are aligned to the corresponding adjusted
\eqn{p-}value: \verb{***} for \eqn{p < 0.1\%}, \verb{**} for \eqn{p < 1\%}, \code{*} for
\eqn{p < 5\%}, \code{.} for \eqn{p < 10\%}

\code{clustersSummaryData()} calculates various statistics about each
cluster (with an optional further \code{condition} to separate the cells).

\code{clustersSummaryPlot()} calculates various statistics about each
cluster via \code{\link[=clustersSummaryData]{clustersSummaryData()}} and puts them together into a plot.

\code{clustersTreePlot()} returns the \code{dendrogram} plot where the given
\emph{clusters} are placed on the basis of their relative distance. Also, if
needed, calculates and stores the \code{DEA} of the relevant \emph{clusterization}.

\code{findClustersMarkers()} takes in a \code{COTAN} object and a
\emph{clusterization} and produces a \code{data.frame} with the \code{n} most positively
enriched and the \code{n} most negatively enriched genes for each \emph{cluster}. The
function also indicates whether the found genes are in the given
\code{markers} list or not. It also returns the \emph{adjusted p-value} for
multi-tests using the \code{\link[stats:p.adjust]{stats::p.adjust()}}

\code{geneSetEnrichment()} returns a cumulative score of enrichment in a
\emph{cluster} over a gene set. In formulae it calculates
\eqn{\frac{1}{n}\sum_i(1-e^{-\theta X_i})}, where the \eqn{X_i} are the
positive values from \code{\link[=DEAOnClusters]{DEAOnClusters()}} and \eqn{\theta = -\frac{1}{0.1}
  \ln(0.25)}

\code{reorderClusterization()} takes in a \emph{clusterization} and reorders
its labels so that in the new order near labels indicate near clusters
according to a \emph{DEA} (or \emph{Zero-One}) based distance
}
\examples{
# Create the COTAN object from the bundled test data-set and process it
# via proceedToCoex(), asking for the COEX to be calculated and without
# saving the object
data("test.dataset")
objCOTAN <- COTAN(raw = test.dataset)
objCOTAN <- proceedToCoex(objCOTAN, cores = 6L, calcCoex = TRUE,
                          optimizeForSpeed = TRUE, saveObj = FALSE)

# Load the clusterization that is used throughout this example
data("test.dataset.clusters1")
clusters <- test.dataset.clusters1

# Differential Expression Analysis: returns the co-expression data.frame
# for the genes in each cluster
coexDF <- DEAOnClusters(objCOTAN, clusters = clusters)

# Named list with one element per group of marker genes
groupMarkers <- list(G1 = c("g-000010", "g-000020", "g-000138",
                            "g-000150", "g-000160", "g-000170"),
                     G2 = c("g-000300", "g-000330", "g-000450",
                            "g-000460", "g-000470"),
                     G3 = c("g-000510", "g-000530", "g-000550",
                            "g-000570", "g-000590"))

# Label the genes 1, 2, 3 in consecutive runs of 240, keeping only the
# first 600 entries, and name the entries after the genes
geneClusters <- rep(1:3, each = 240)[1:600]
names(geneClusters) <- getGenes(objCOTAN)

# UMAP plot of the DEA data.frame: the genes of each marker group are
# labelled with a different color
umapPlot <- UMAPPlot(coexDF, clusters = NULL, elements = groupMarkers)
plot(umapPlot)

# Store the clusterization, together with its COEX data.frame, in the
# COTAN object
objCOTAN <- addClusterization(objCOTAN, clName = "first_clusterization",
                              clusters = clusters, coexDF = coexDF)

# Log-expression-change of the genes in each cluster, plotted via UMAP
# with geneClusters passed as the clusters argument
lfcDF <- logFoldChangeOnClusters(objCOTAN, clusters = clusters)
umapPlot2 <- UMAPPlot(lfcDF, clusters = geneClusters)
plot(umapPlot2)

# Not run: linear estimation of nu done cluster by cluster
if (FALSE) {
  objCOTAN <- estimateNuLinearByCluster(objCOTAN, clusters = clusters)
}

# Returns a list holding the summary statistics and the corresponding plot
clSummaryPlotAndData <-
  clustersSummaryPlot(objCOTAN, clName = "first_clusterization",
                      plotTitle = "first clusterization")
plot(clSummaryPlotAndData[["plot"]])

# Not run: it would remove the clusterization that is still needed below
if (FALSE) {
  objCOTAN <- dropClusterization(objCOTAN, "first_clusterization")
}

# With dropNoCoex = TRUE only the clusterizations with a non-empty COEX
# data.frame are listed: exactly one is expected at this point
clusterizations <- getClusterizations(objCOTAN, dropNoCoex = TRUE)
stopifnot(length(clusterizations) == 1)

# UMAP plot of the cells: with useCoexEigen = TRUE the data matrix is
# projected onto the first eigenvectors of the COEX matrix.
# The result is a list with the plot and the reduced data matrix
cellsUmapPlotAndDF <- cellsUMAPPlot(objCOTAN, dataMethod = "LogLikelihood",
                                    useCoexEigen = TRUE, numComp = 25L,
                                    clName = "first_clusterization")
plot(cellsUmapPlotAndDF[["plot"]])

# Cumulative enrichment score of the clusters over the marker gene sets
enrichment <- geneSetEnrichment(clustersCoex = coexDF,
                                groupMarkers = groupMarkers)

# Markers heatmap: no clName is given, so the last available
# clusterization is used
clHeatmapPlotAndData <- clustersMarkersHeatmapPlot(objCOTAN, groupMarkers)
clHeatmapPlotAndData[["heatmapPlot"]]

# Build a two-level condition from the integer encoded in the cell names
# (characters from the third onward): L up to 600, H above
conditions <- as.integer(substring(getCells(objCOTAN), 3L))
conditions <- factor(ifelse(conditions <= 600, "L", "H"))
names(conditions) <- getCells(objCOTAN)

objCOTAN <- addCondition(objCOTAN, condName = "High/Low",
                         conditions = conditions)

# Same heatmap, now with the condition named in condNameList and a tree
# cut based on 2 estimated clusters
clHeatmapPlotAndData2 <-
  clustersMarkersHeatmapPlot(objCOTAN, groupMarkers, kCuts = 2,
                             condNameList = list("High/Low"))
clHeatmapPlotAndData2[["heatmapPlot"]]

# With no name given this returns the name of the last available
# clusterization
clName <- getClusterizationName(objCOTAN)

# List with the cluster labels and the associated COEX data.frame
clusterDataList <- getClusterizationData(objCOTAN, clName = clName)

# Only the named cluster labels array
clusters <- getClusters(objCOTAN, clName = clName)

# List with a COEX data.frame for each clusterization
allClustersCoexDF <- getClustersCoex(objCOTAN)

summaryData <- clustersSummaryData(objCOTAN)

# Returns both the dendrogram plot and the updated COTAN object: keep the
# latter for the following calls
treePlotAndObj <- clustersTreePlot(objCOTAN, 2)
objCOTAN <- treePlotAndObj[["objCOTAN"]]
plot(treePlotAndObj[["dend"]])

# Top and bottom scoring genes for each cluster, with no known markers given
clMarkers <- findClustersMarkers(objCOTAN, markers = list(),
                                 clusters = clusters)

}
