% Generated by roxygen2: do not edit by hand
% Please edit documentation in R/dataProcess.R
\name{dataProcess}
\alias{dataProcess}
\title{Process MS data: clean, normalize and summarize before differential analysis}
\usage{
dataProcess(
  raw,
  logTrans = 2,
  normalization = "equalizeMedians",
  nameStandards = NULL,
  featureSubset = "topN",
  remove_uninformative_feature_outlier = FALSE,
  min_feature_count = 2,
  n_top_feature = 100,
  summaryMethod = "TMP",
  equalFeatureVar = TRUE,
  censoredInt = "NA",
  MBimpute = TRUE,
  remove50missing = FALSE,
  fix_missing = NULL,
  maxQuantileforCensored = 0.999,
  use_log_file = TRUE,
  append = FALSE,
  verbose = TRUE,
  log_file_path = NULL,
  numberOfCores = 1,
  aft_iterations = 90
)
}
\arguments{
\item{raw}{name of the raw (input) data set.}

\item{logTrans}{base of logarithm transformation: 2 (default) or 10.}

\item{normalization}{normalization to remove systematic bias between MS runs. 
Three normalization methods are supported: 
'equalizeMedians' (default) performs constant normalization (equalizing the medians) 
based on reference signals. 
'quantile' performs quantile normalization based on reference signals. 
'globalStandards' performs normalization with global standards. 
If FALSE, no normalization is performed.  See MSstats vignettes for 
recommendations on which normalization option to use.}

\item{nameStandards}{optional vector of global standard peptide names. 
Required only for normalization with global standard peptides.}

\item{featureSubset}{"topN" (default) uses the top N features which have the highest average of log-intensity across runs. 
It needs the input for the n_top_feature option. 
"top3" uses the top 3 features which have the highest average of log-intensity across runs. 
"all" uses all features that the data set has (not recommended in DIA experiments). 
"highQuality" flags uninformative features and outliers. See MSstats vignettes for 
recommendations on which feature selection option to use.}

\item{remove_uninformative_feature_outlier}{optional. Only required if 
featureSubset = "highQuality". TRUE removes 
1) noisy features (flagged in the column feature_quality with "Uninformative") and 
2) outliers (flagged in the column is_outlier with TRUE) 
before run-level summarization. FALSE (default) uses all features and intensities 
for run-level summarization.}

\item{min_feature_count}{optional. Only required if featureSubset = "highQuality".
Defines a minimum number of informative features a protein needs to be considered
in the feature selection algorithm.}

\item{n_top_feature}{Specifies the number of top features to use in summarization. 
Only required if featureSubset = 'topN'.  
Default is 100, which means to use the top 100 features. 
Smaller numbers can be set to improve processing times. The default is set to a 
high number (100) to improve processing times without affecting differential analysis.}

\item{summaryMethod}{"TMP" (default) means Tukey's median polish, 
which is a robust estimation method. "linear" uses a linear mixed model. If 
the anomaly detection algorithm is performed, "linear" must be used.}

\item{equalFeatureVar}{only for summaryMethod = "linear". 
Logical variable for whether the model should assume equal variance 
among intensities from different features. Default is TRUE, which assumes equal 
variance among intensities from features. FALSE means that we cannot assume equal 
variance among intensities from features, so the model will account for heterogeneous 
variation from different features.}

\item{censoredInt}{Specifies whether missing values are censored or missing at random. 
'NA' (default) assumes that all 'NA's in 'Intensity' column are censored. 
'0' uses zero intensities as censored intensity. 
In this case, NA intensities are missing at random. 
The output from Skyline should use '0'. 
NULL assumes that all NA intensities are randomly missing.}

\item{MBimpute}{only for summaryMethod = "TMP" and censoredInt = 'NA' or '0'. 
TRUE (default) imputes missing values, recorded as 'NA' or '0' (depending on censoredInt option), 
using an accelerated failure time (AFT) model. If set to FALSE, no missing values are imputed. 
FALSE is appropriate only when missingness is assumed to be at random.
See MSstats vignettes for recommendations on which imputation option to use.}

\item{remove50missing}{only for summaryMethod = "TMP". TRUE removes the proteins 
where every run has at least 50\% missing values for each peptide. FALSE is default.}

\item{fix_missing}{Optional, same as the `fix_missing` parameter in MSstatsConvert::MSstatsBalancedDesign function}

\item{maxQuantileforCensored}{Maximum quantile for deciding censored missing values, default is 0.999}

\item{use_log_file}{logical. If TRUE, information about data processing
will be saved to a file.}

\item{append}{logical. If TRUE, information about data processing will be added
to an existing log file.}

\item{verbose}{logical. If TRUE, information about data processing will be printed
to the console.}

\item{log_file_path}{character. Path to a file to which information about 
data processing will be saved. 
If not provided, such a file will be created automatically.
If `append = TRUE`, has to be a valid path to a file.}

\item{numberOfCores}{Number of cores for parallel processing. When > 1, 
a logfile named `MSstats_dataProcess_log_progress.log` is created to 
track progress. Only works for Linux & Mac OS. Default is 1.}

\item{aft_iterations}{Number of iterations for AFT model fitting. Default is 90.}
}
\value{
A list containing:
\describe{
  \item{FeatureLevelData}{A data frame with feature-level information after processing. Columns include:
    \describe{
      \item{PROTEIN}{Identifier for the protein associated with the feature.}
      \item{PEPTIDE}{Identifier for the peptide sequence.}
      \item{TRANSITION}{Identifier for the transition, typically representing a specific ion pair.}
      \item{FEATURE}{Unique identifier for the feature, which could be a combination of peptide and transition.}
      \item{LABEL}{Specifies the isotopic labeling of peptides, notably for SRM-based experiments. "L" indicates light-labeled peptides while "H" denotes heavy-labeled peptides.}
      \item{GROUP}{Experimental group identifier.}
      \item{RUN}{Identifier for the specific MS run.}
      \item{SUBJECT}{Subject identifier within the experimental group.}
      \item{FRACTION}{Fraction identifier if fractionation was performed.}
      \item{originalRUN}{Original run identifier before any processing.}
      \item{censored}{Logical indicator of whether the intensity value is considered missing or below limit of detection.}
      \item{INTENSITY}{Original intensity measurement of the feature in the given run.}
      \item{ABUNDANCE}{Processed abundance or intensity value after log-transformation and normalization.}
      \item{newABUNDANCE}{The ABUNDANCE column but includes imputed missing values. It is the column that is used for protein summarization.}
      \item{predicted}{Predicted intensity values for censored data, typically derived from a statistical model.}
    }
  }
  \item{ProteinLevelData}{A data frame with run-level summarized information for each protein. Columns include:
    \describe{
      \item{RUN}{Identifier for the specific MS run.}
      \item{Protein}{Identifier for the protein.}
      \item{LogIntensities}{Log-transformed intensities for the protein in each run.}
      \item{originalRUN}{Original run identifier before any processing.}
      \item{GROUP}{Experimental group identifier.}
      \item{SUBJECT}{Subject identifier within the experimental group.}
      \item{TotalGroupMeasurements}{Total number of feature measurements for the protein in the given group.}
      \item{NumMeasuredFeatures}{Number of features measured for the protein in the given run.}
      \item{MissingPercentage}{Percentage of missing feature values for the protein in the given run.}
      \item{more50missing}{Logical indicator of whether more than 50 percent of the feature values are missing for the protein in the given run.}
      \item{NumImputedFeature}{Number of features for which values were imputed due to missing or censored data for the protein in the given run.}
    }
  }
}
}
\description{
Process MS data: clean, normalize and summarize before differential analysis
}
\examples{
# Consider raw data (i.e. SRMRawData) for a label-based SRM experiment from a yeast study
# with ten time points (T1-T10) of interest and three biological replicates.
# It is a time course experiment. The goal is to detect protein abundance changes
# across time points.
head(SRMRawData)
# Log2 transformation and normalization are applied (default)
QuantData<-dataProcess(SRMRawData, use_log_file = FALSE)
head(QuantData$FeatureLevelData)
# Log10 transformation and normalization are applied
QuantData1<-dataProcess(SRMRawData, logTrans=10, use_log_file = FALSE)
head(QuantData1$FeatureLevelData)
# Log2 transformation and no normalization are applied
QuantData2<-dataProcess(SRMRawData,normalization=FALSE, use_log_file = FALSE)
head(QuantData2$FeatureLevelData)

}
