% Generated by roxygen2: do not edit by hand
% Please edit documentation in R/remove_redundancy.R
\docType{methods}
\name{remove_redundancy}
\alias{remove_redundancy}
\alias{remove_redundancy,SummarizedExperiment-method}
\alias{remove_redundancy,RangedSummarizedExperiment-method}
\title{Drop redundant elements (e.g., samples) for which feature (e.g., transcript/gene) abundances are correlated}
\usage{
remove_redundancy(
  .data,
  .element = NULL,
  .feature = NULL,
  .abundance = NULL,
  method,
  of_samples = TRUE,
  correlation_threshold = 0.9,
  top = Inf,
  transform = identity,
  Dim_a_column,
  Dim_b_column,
  log_transform = NULL
)

\S4method{remove_redundancy}{SummarizedExperiment}(
  .data,
  .element = NULL,
  .feature = NULL,
  .abundance = NULL,
  method,
  of_samples = TRUE,
  correlation_threshold = 0.9,
  top = Inf,
  transform = identity,
  Dim_a_column = NULL,
  Dim_b_column = NULL,
  log_transform = NULL
)

\S4method{remove_redundancy}{RangedSummarizedExperiment}(
  .data,
  .element = NULL,
  .feature = NULL,
  .abundance = NULL,
  method,
  of_samples = TRUE,
  correlation_threshold = 0.9,
  top = Inf,
  transform = identity,
  Dim_a_column = NULL,
  Dim_b_column = NULL,
  log_transform = NULL
)
}
\arguments{
\item{.data}{A `tbl` (with at least three columns for sample, feature and transcript abundance) or `SummarizedExperiment` (more convenient if abstracted to tibble with library(tidySummarizedExperiment))}

\item{.element}{The name of the element column (normally samples).}

\item{.feature}{The name of the feature column (normally transcripts/genes)}

\item{.abundance}{The name of the column including the numerical value the clustering is based on (normally transcript abundance)}

\item{method}{A character string. The method to use; "correlation" and "reduced_dimensions" are available. The latter eliminates one of the most proximal pairs of samples in PCA reduced dimensions.}

\item{of_samples}{A boolean. In case the input is a tidybulk object, it indicates whether the element column will be the sample or the transcript column.}

\item{correlation_threshold}{A real number between 0 and 1. For correlation based calculation.}

\item{top}{An integer. How many top genes to select for correlation based method}

\item{transform}{A function that will transform the counts. The default is identity (no transformation); for RNA sequencing data log1p is a common choice.}

\item{Dim_a_column}{A character string. For the reduced_dimensions based calculation. The column of one principal component.}

\item{Dim_b_column}{A character string. For the reduced_dimensions based calculation. The column of another principal component.}

\item{log_transform}{DEPRECATED - A boolean, whether the value should be log-transformed (e.g., TRUE for RNA sequencing data)}
}
\value{
A tbl object with dropped redundant elements (e.g., samples).

A `SummarizedExperiment` object

A `SummarizedExperiment` object
}
\description{
remove_redundancy() takes as input a `tbl` (with at least three columns for sample, feature and transcript abundance) or `SummarizedExperiment` (more convenient if abstracted to tibble with library(tidySummarizedExperiment)) for the correlation method, or a table with columns | <DIMENSION 1> | <DIMENSION 2> | <...> | for the reduced_dimensions method, and returns an object consistent with the input, with redundant elements (e.g., samples) dropped.
}
\details{
\strong{[Maturing]}

This function removes redundant elements from the original data set (e.g., samples or transcripts).
For example, if we want to define cell-type specific signatures with low sample redundancy.
This function returns a tibble with dropped redundant elements (e.g., samples).
Two redundancy estimation approaches are supported:
(i) removal of highly correlated clusters of elements (keeping a representative) with method="correlation";
(ii) removal of most proximal element pairs in a reduced dimensional space with method="reduced_dimensions".

Underlying method for correlation:
widyr::pairwise_cor(sample, transcript,count, sort = TRUE, diag = FALSE, upper = FALSE)

Underlying custom method for reduced dimensions:
select_closest_pairs = function(df) {
  couples <- df |> head(n = 0)
  while (df |> nrow() > 0) {
    pair <- df |> arrange(dist) |> head(n = 1)
    couples <- couples |> bind_rows(pair)
    df <- df |> filter(!`sample 1` \%in\% (pair |> select(1:2) |> as.character()) & !`sample 2` \%in\% (pair |> select(1:2) |> as.character()))

  }
  couples
}
}
\examples{
## Load airway dataset for examples

  data('airway', package = 'airway')
  # Ensure a 'condition' column exists for examples expecting it

    SummarizedExperiment::colData(airway)$condition <- SummarizedExperiment::colData(airway)$dex




 airway |>
 identify_abundant() |>
   remove_redundancy(
	   .element = sample,
	   .feature = transcript,
	   	.abundance =  count,
	   	method = "correlation"
	   	)

}
\references{
Mangiola, S., Molania, R., Dong, R., Doyle, M. A., & Papenfuss, A. T. (2021). tidybulk: an R tidy framework for modular transcriptomic data analysis. Genome Biology, 22(1), 42. doi:10.1186/s13059-020-02233-7
}
