% Generated by roxygen2: do not edit by hand
% Please edit documentation in R/dgsa_seq.R
\name{dgsa_seq}
\alias{dgsa_seq}
\title{Time-course Gene Set Analysis}
\usage{
dgsa_seq(
  exprmat = NULL,
  object = NULL,
  covariates = NULL,
  variables2test,
  weights_var2test_condi = (which_test != "permutation"),
  genesets,
  sample_group = NULL,
  cov_variables2test_eff = NULL,
  which_test = c("permutation", "asymptotic"),
  which_weights = c("loclin", "voom", "none"),
  n_perm = 1000,
  progressbar = TRUE,
  parallel_comp = TRUE,
  nb_cores = parallel::detectCores(logical = FALSE) - 1,
  preprocessed = FALSE,
  gene_based_weights = TRUE,
  bw = "nrd",
  kernel = c("gaussian", "epanechnikov", "rectangular", "triangular", "biweight",
    "tricube", "cosine", "optcosine"),
  transform = TRUE,
  padjust_methods = c("BH", "BY", "holm", "hochberg", "hommel", "bonferroni"),
  lowess_span = 0.5,
  R = NULL,
  adaptive = TRUE,
  max_adaptive = 64000,
  homogen_traj = FALSE,
  na.rm_gsaseq = TRUE,
  verbose = TRUE
)
}
\arguments{
\item{exprmat}{a numeric matrix of size \code{G x n} containing the raw
RNA-seq counts or preprocessed expressions from \code{n} samples for \code{G}
genes. Default is \code{NULL}, in which case \code{object} must not be
\code{NULL}.}

\item{object}{an object that can be either an
\code{\link[SummarizedExperiment:RangedSummarizedExperiment-class]{SummarizedExperiment}},
an \code{\link[Biobase:class.ExpressionSet]{ExpressionSet}}, a
\code{\link[DESeq2:DESeqDataSet]{DESeqDataSet}}, or a
\code{\link[edgeR:DGEList]{DGEList}}.
Default is \code{NULL}, in which case \code{exprmat} must not be
\code{NULL}.}

\item{covariates}{\itemize{
\item If \code{exprmat} is specified as a matrix:
then \code{covariates} must be a numeric matrix of size \code{n x p}
containing the model covariates for \code{n} samples (design matrix).
Usually, its first column is the intercept (full of \code{1}s).
\item If \code{object} is specified: then \code{covariates} must be a
character vector of length \code{p} containing the colnames of the
design matrix given in \code{object}.
} If \code{covariates} is \code{NULL} (the default), then it is just the
intercept.}

\item{variables2test}{\itemize{
\item If \code{exprmat} is specified as a matrix:
a numeric design matrix of size \code{n x K} containing
the \code{K} variables to be tested.
\item If \code{object} is specified: then \code{variables2test} must be a
character vector of length \code{K} containing the colnames of the
design matrix given in \code{object}.
}}

\item{weights_var2test_condi}{a logical flag indicating whether
heteroscedasticity weights computation should be conditional on both the
variable(s) to be tested \code{phi} and on covariate(s) \code{x}, or on
\code{x} alone.  Default is \code{TRUE} for the asymptotic test
(in which case conditional means are estimated conditionally on both
\code{variables2test} and \code{covariates}), and \code{FALSE} for the
permutation test (in which case conditional means are estimated
conditionally on only the  \code{covariates}).}

\item{genesets}{Can be either:\itemize{
\item a \code{vector}
\item a \code{list}
\item a \code{BiocSet} object
}
Can be a vector of indices or subscripts that defines which
rows of the expression matrix constitute the investigated gene set (when only
1 gene set is being tested).

Can also be a \code{list} of indices (or \code{rownames} of the expression
matrix) when several gene sets are tested at once, such as the first element
of a \code{\link[GSA:GSA.read.gmt]{gmt}} object.

Finally, can also be a \code{\link[BiocSet:BiocSet-class]{BiocSet}} object.

If \code{NULL}, then gene-wise p-values are returned.}

\item{sample_group}{a vector of length \code{n} indicating whether the samples
should be grouped (e.g. paired samples or longitudinal data). Coerced
to be a \code{factor}. Default is \code{NULL} in which case no grouping is
performed.}

\item{cov_variables2test_eff}{a matrix of size \code{K x K} containing the
covariance matrix of the \code{K} random effects. Only used if
\code{homogen_traj} is \code{FALSE}. Default is \code{NULL}, in which case a
diagonal correlation matrix is assumed, i.e. independence of random effects.}

\item{which_test}{a character string indicating which method to use to
approximate the variance component score test, either \code{'permutation'} or
\code{'asymptotic'}. Default is \code{'permutation'}.}

\item{which_weights}{a character string indicating which method to use to
estimate the mean-variance relationship weights. Possibilities are
\code{'loclin'}, \code{'voom'} or \code{'none'} (in which case no weighting is
performed). Default is \code{'loclin'}.
See \code{\link{sp_weights}} and \code{\link{voom_weights}} for details.}

\item{n_perm}{the number of permutations. Default is \code{1000}.}

\item{progressbar}{logical indicating whether a progress bar should be
displayed when computing permutations (only in interactive mode). Default is
\code{TRUE}.}

\item{parallel_comp}{a logical flag indicating whether parallel computation
should be enabled. Only Linux and macOS are supported; this is ignored on
Windows. Default is \code{TRUE}.}

\item{nb_cores}{an integer indicating the number of cores to be used when
\code{parallel_comp} is \code{TRUE}.
Default is \code{parallel::detectCores(logical=FALSE) - 1}.}

\item{preprocessed}{a logical flag indicating whether the expression data have
already been preprocessed (e.g. log2 transformed). Default is \code{FALSE}, in
which case the expression data are assumed to contain raw counts and are
normalized into log-counts per million.}

\item{gene_based_weights}{a logical flag used for \code{'loclin'} weights,
indicating whether to estimate weights at the gene-level, or rather at the
observation-level. Default is \code{TRUE}, and weights are then estimated at
the gene-level.}

\item{bw}{a character string indicating the smoothing bandwidth selection
method to use. See \code{\link[stats]{bandwidth}} for details. Possible values
are \code{'ucv'}, \code{'SJ'}, \code{'bcv'}, \code{'nrd'} or \code{'nrd0'}.
Default is \code{'nrd'}.}

\item{kernel}{a character string indicating which kernel should be used.
Possibilities are \code{'gaussian'}, \code{'epanechnikov'},
\code{'rectangular'}, \code{'triangular'}, \code{'biweight'},
\code{'tricube'}, \code{'cosine'}, \code{'optcosine'}. Default is
\code{'gaussian'} (NB: \code{'tricube'} kernel
corresponds to the loess method).}

\item{transform}{a logical flag used for \code{'loclin'} weights, indicating
whether values should be transformed to uniform for the purpose of local
linear smoothing. This may be helpful if tail observations are sparse and the
specified bandwidth gives suboptimal performance there. Default is
\code{TRUE}.}

\item{padjust_methods}{multiple testing correction method used if
\code{genesets} is a list. Default is \code{'BH'}, i.e. Benjamini-Hochberg
procedure for controlling the FDR. Other possibilities are: \code{'holm'},
\code{'hochberg'}, \code{'hommel'}, \code{'bonferroni'} or \code{'BY'}
(for Benjamini-Yekutieli procedure).}

\item{lowess_span}{smoother span for the lowess function, between 0 and 1.
This gives the proportion of points in the plot which influence the smooth at
each value. Larger values give more smoothness. Only used if
\code{which_weights} is \code{'voom'}. Default is \code{0.5}.}

\item{R}{library size (optional, important to provide if
\code{preprocessed = TRUE}). Default is \code{NULL}.}

\item{adaptive}{a logical flag indicating whether adaptive permutation should
be performed. Default is \code{TRUE}.}

\item{max_adaptive}{the maximum number of permutations considered.
Default is \code{64000}.}

\item{homogen_traj}{a logical flag indicating whether trajectories should be
considered homogeneous. Default is \code{FALSE} in which case trajectories are
not only tested for trend, but also for heterogeneity.}

\item{na.rm_gsaseq}{logical: should missing values in the expression data
(including \code{NA} and \code{NaN}) be omitted from the calculations?
Default is \code{TRUE}.}

\item{verbose}{logical: should informative messages be printed during the
computation? Default is \code{TRUE}.}
}
\value{
A list with the following elements:\itemize{
  \item \code{which_test}: a character string carrying forward the value of
  the '\code{which_test}' argument indicating which test was performed (either
  'asymptotic' or 'permutation').
  \item \code{preprocessed}: a logical flag carrying forward the value of the
  '\code{preprocessed}' argument indicating whether the expression data were
  already preprocessed, or were provided as raw counts and transformed into
  log-counts per million.
  \item \code{n_perm}: an integer carrying forward the value of the
  '\code{n_perm}' argument indicating the number of permutations performed
  (\code{NA} if asymptotic test was performed).
  \item \code{genesets}: carrying forward the value of the '\code{genesets}'
  argument defining the gene sets of interest (\code{NULL} for gene-wise
  testing).
  \item \code{pvals}: computed p-values. A \code{data.frame} with one row for
  each gene set, or for each gene if \code{genesets} argument is
  \code{NULL}, and with 2 columns: the first one '\code{rawPval}' contains
  the raw p-values, the second one contains the FDR adjusted p-values
  (according to the '\code{padjust_methods}' argument) and is named
  '\code{adjPval}'.
}
}
\description{
Wrapper function for performing gene set analysis of (potentially
longitudinal) RNA-seq data
}
\examples{

# Simulate longitudinal expression data under the null hypothesis (no time
# effect) and analyse them with the asymptotic variance component score test.
nsims <- 2 # number of simulated data sets (use e.g. 100 for a real study)
res_quant <- vector("list", nsims)
for(i in seq_len(nsims)){
 n <- 2000 # number of genes (rows of the expression matrix)
 nr <- 3 # number of repeated measurements per sample group
 r <- nr*20 # total number of samples (columns of the expression matrix)
 # measurement times 1, ..., nr repeated for each of the r/nr sample groups
 time_pts <- matrix(rep(1:nr, times = r/nr), ncol=1, nrow=r)
 sigma <- 0.4
 b0 <- 1

 # under the null: no effect of time
 b1 <- 0

 y.tilde <- b0 + b1*time_pts + rnorm(r, sd = sigma)
 y <- t(matrix(rnorm(n*r, sd = sqrt(sigma*abs(y.tilde))), ncol=n, nrow=r) +
        matrix(rep(y.tilde, n), ncol=n, nrow=r))
 x <- matrix(1, ncol=1, nrow=r) # intercept-only design matrix
 sample_id <- rep(1:(r/nr), each=nr)

 # gene set analysis: 10 sets made of 10 consecutive genes each
 res <- dgsa_seq(exprmat = y, covariates = x, variables2test = time_pts,
                 genesets = lapply(0:9, function(k){k*10+(1:10)}),
                 cov_variables2test_eff = matrix(1),
                 sample_group = sample_id,
                 which_test = 'asymptotic',
                 which_weights = 'none', preprocessed = TRUE)

 # gene-wise analysis: genesets = NULL returns one p-value per gene
 res_genes <- dgsa_seq(exprmat = y, covariates = x,
                       variables2test = time_pts,
                       genesets = NULL,
                       cov_variables2test_eff = matrix(1),
                       sample_group = sample_id,
                       which_test = 'asymptotic',
                       which_weights = 'none', preprocessed = TRUE)
 res_quant[[i]] <- res_genes$pvals[, 'rawPval']
}

# distribution of the gene-wise raw p-values over all simulated data sets
quantile(unlist(res_quant))

#round(rowMeans(vapply(res_quant, FUN = quantile, FUN.VALUE = rep(1.1, 5))), 3)
#plot(density(unlist(res_quant)))
#mean(unlist(res_quant)<0.05)

if(interactive()){
# permutation test on the last simulated data set (slower)
res_genes <- dgsa_seq(exprmat = y, covariates = x, variables2test = time_pts,
                    genesets = NULL,
                    cov_variables2test_eff = matrix(1),
                    sample_group = sample_id,
                    which_test = 'permutation',
                    which_weights = 'none', preprocessed = TRUE,
                    n_perm = 1000, parallel_comp = FALSE)

mean(res_genes$pvals$rawPval < 0.05)
summary(res_genes$pvals$adjPval)
}
}
\references{
Agniel D & Hejblum BP (2017). Variance component score test for
time-course gene set analysis of longitudinal RNA-seq data,
\emph{Biostatistics}, 18(4):589-604.
\href{https://doi.org/10.1093/biostatistics/kxx005}{10.1093/biostatistics/kxx005}.
\href{https://arxiv.org/abs/1605.02351}{arXiv:1605.02351}.

Law, C. W., Chen, Y., Shi, W., & Smyth, G. K. (2014). voom:
Precision weights unlock linear model analysis tools for RNA-seq read counts.
\emph{Genome Biology}, 15(2), R29.
}
\seealso{
\code{\link{sp_weights}} \code{\link{vc_test_perm}}
\code{\link{vc_test_asym}} \code{\link{p.adjust}}
}
