% Generated by roxygen2: do not edit by hand
% Please edit documentation in R/PSM-class.R, R/PSM-reduce.R,
%   R/adjacencyMatrix-accessors.R
\name{PSM}
\alias{PSM}
\alias{PSM-class}
\alias{reducePSMs}
\alias{readPSMs}
\alias{psmVariables}
\alias{PSM,character}
\alias{PSM,data.frame}
\alias{PSM,PSM}
\alias{reduced}
\alias{reduced<-}
\alias{adjacencyMatrix,PSM-method}
\title{A class for peptide-spectrum matches}
\usage{
PSM(
  x,
  spectrum = NA,
  peptide = NA,
  protein = NA,
  decoy = NA,
  rank = NA,
  score = NA,
  fdr = NA,
  parser = c("mzR", "mzID"),
  BPPARAM = SerialParam()
)

reduced(object, spectrum = psmVariables(object)["spectrum"])

reduced(object) <- value

psmVariables(object, which = "all")

reducePSMs(object, k = object[[psmVariables(object)["spectrum"]]])

\S4method{adjacencyMatrix}{PSM}(object)
}
\arguments{
\item{x}{\code{character()} of mzid file names, an instance of class
\code{PSM}, or a \code{data.frame}.}

\item{spectrum}{\code{character(1)} variable name that defines a
spectrum in the PSM data. Defaults are \code{"spectrumID"} (mzR
parser) or \code{"spectrumid"} (mzID parser). It is also used to
calculate the reduced state.}

\item{peptide}{\code{character(1)} variable name that defines a peptide
in the PSM data. Defaults are \code{"sequence"} (mzR parser) or
\code{"pepSeq"} (mzID parser).}

\item{protein}{\code{character(1)} variable name that defines a protein
in the PSM data. Defaults are \code{"DatabaseAccess"} (mzR parser)
or \code{"accession"} (mzID parser).}

\item{decoy}{\code{character(1)} variable name that defines a decoy hit
in the PSM data. Defaults are \code{"isDecoy"} (mzR parser) or
\code{"isdecoy"} (mzID parser).}

\item{rank}{\code{character(1)} variable name that defines the rank of
the peptide spectrum match in the PSM data. Default is \code{"rank"}.}

\item{score}{\code{character(1)} variable name that defines the PSM
score. This value isn't set by default as it depends on the
search engine and application. Default is \code{NA}.}

\item{fdr}{\code{character(1)} variable name that defines
the spectrum FDR (or any similar/relevant metric that can be
used for filtering). This value isn't set by default as it
depends on the search engine and application. Default is \code{NA}.}

\item{parser}{\code{character(1)} defining the parser to be used to
read the \code{mzIdentML} files. One of \code{"mzR"} (default) or
\code{"mzID"}.}

\item{BPPARAM}{an object from the \code{BiocParallel} package to control parallel
processing. The default value is \code{SerialParam()} to read files in series.}

\item{object}{An instance of class \code{PSM}.}

\item{value}{new value to be passed to setter.}

\item{which}{\code{character()} with the PSM variable name to
retrieve. If \code{"all"} (default), all named variables are
returned. See \code{\link[=PSM]{PSM()}} for valid PSM variables.}

\item{k}{A \code{vector} or \code{factor} of length equal to \code{nrow(object)} that
defines the primary key used to reduce \code{object}. This typically
corresponds to the spectrum identifier. The default is to use
the spectrum PSM variable.}
}
\value{
\code{PSM()} returns a \code{PSM} object.

\code{reducePSMs()} returns a reduced version of the \code{object} input.
}
\description{
The \code{PSM} class is a simple class to store and manipulate
peptide-spectrum matches. The class encapsulates PSM data as a
\code{DataFrame} (or more specifically a \code{DFrame}) with additional
lightweight metadata annotation.

There are two types of \code{PSM} objects:
\itemize{
\item Objects with duplicated spectrum identifiers. This holds for
multiple matches to the same spectrum, be it different peptide
sequences or the same sequence with or without a
post-translational modification. Such objects are typically
created with the \code{PSM()} constructor starting from \code{mzIdentML}
files.
\item Reduced objects where the spectrum identifiers (or any
equivalent column) are unique keys within the PSM table. Matches
to the same scan/spectrum are merged into a single PSM data
row. Reduced \code{PSM} objects are created with the \code{reducePSMs()}
function. See examples below.
}

Objects can be checked for their reduced state with the
\code{reduced()} function which returns \code{TRUE} for reduced instances,
\code{FALSE} when the spectrum identifiers are duplicated, or \code{NA} when
unknown. The flag can also be set explicitly with the
\verb{reduced()<-} setter.
}
\section{Creating and using PSM objects}{

\itemize{
\item The \code{\link[=PSM]{PSM()}} constructor uses parsers provided by the \code{mzR} or
\code{mzID} packages to read the \code{mzIdentML} data. The vignette
describes some apparent differences in their outputs. The
constructor input is a character of one or multiple file
names.
\item \code{PSM} objects can also be created from a \code{data.frame} object (or
any variable that can be coerced into a \link{DataFrame}).
\item Finally, \code{\link[=PSM]{PSM()}} can also take a \code{PSM} object as input, which
leaves the PSM data as is and is used to set/update the PSM
variables.
\item The constructor can also initialise variables (called \emph{PSM
variables}) needed for downstream processing, notably filtering
(see \code{\link[=filterPSMs]{filterPSMs()}}) and to generate a peptide-by-protein
adjacency matrix (see \code{\link[=makeAdjacencyMatrix]{makeAdjacencyMatrix()}}). These variables
can be extracted with the \code{\link[=psmVariables]{psmVariables()}} function. They
represent the columns in the PSM table that identify spectra,
peptides, proteins, decoy peptides, hit ranks and, optionally, a
PSM score. The value of these variables will depend on the
backend used to create the object, or left blank (i.e. encoded
as \code{NA}) when building an object by hand from a \code{data.frame}. In
such situation, they need to be passed explicitly by the user as
arguments to \code{\link[=PSM]{PSM()}}.
\item The \code{adjacencyMatrix()} accessor can be used to retrieve the
binary sparse peptide-by-protein adjacency matrix from the PSM
object. It also relies on PSM variables which thus need to be
set beforehand. For more flexibility in the generation of the
adjacency matrix (for non-binary matrices), use
\code{\link[=makeAdjacencyMatrix]{makeAdjacencyMatrix()}}.
}
}

\examples{

## ---------------------------------
## Example with a single mzid file
## ---------------------------------

f <- msdata::ident(full.names = TRUE, pattern = "TMT")
basename(f)

## mzR parser (default)
psm <- PSM(f)
psm

## PSM variables
psmVariables(psm)

## mzID parser
psm_mzid <- PSM(f, parser = "mzID")
psm_mzid

## different PSM variables
psmVariables(psm_mzid)

## Reducing the PSM data
(i <- which(duplicated(psm$spectrumID))[1:2])
(i <- which(psm$spectrumID \%in\% psm$spectrumID[i]))
psm2 <- psm[i, ]
reduced(psm2)

## Peptide sequence CIDRARHVEVQIFGDGKGRVVALGERDCSLQRR with
## Carbamidomethyl modifications at positions 1 and 28.
DataFrame(psm2[, c("sequence", "spectrumID", "modName", "modLocation")])
reduced(psm2) <- FALSE
reduced(psm2)

## uses by default the spectrum PSM variable, as defined during
## the construction - see psmVariables()
rpsm2 <- reducePSMs(psm2)
rpsm2
DataFrame(rpsm2[, c("sequence", "spectrumID", "modName", "modLocation")])
reduced(rpsm2)

## ---------------------------------
## Multiple mzid files
## ---------------------------------

library(rpx)
PXD022816 <- PXDataset("PXD022816")
PXD022816

(mzids <- pxget(PXD022816, grep("mzID", pxfiles(PXD022816))[1:2]))
psm <- PSM(mzids)
psm
psmVariables(psm)

## Here, spectrum identifiers are repeated across files
psm[grep("scan=20000", psm$spectrumID), "spectrumFile"]

## Let's create a new primary identifier composed of the scan
## number and the file name
psm$pkey <- paste(sub("^.+Task\\\\\\\\", "", psm$spectrumFile),
                  sub("^.+scan=", "", psm$spectrumID),
                  sep = "::")
head(psm$pkey)

## the PSM is not reduced
reduced(psm, "pkey")
DataFrame(psm[6:7, ])

## same sequence, same spectrumID, same file
psm$sequence[6:7]
psm$pkey[6:7]

## different modification locations
psm$modLocation[6:7]

## here, we need to *explicitly* set pkey to reduce
rpsm <- reducePSMs(psm, psm$pkey)
rpsm
reduced(rpsm, "pkey")

## the two rows are now merged into a single one; the distinct
## modification locations are preserved.
(i <- which(rpsm$pkey == "QEP2LC6_HeLa_50ng_251120_01-calib.mzML::12894"))
DataFrame(rpsm[i, c("sequence", "pkey", "modName", "modLocation")])

## ---------------------------------
## PSM from a data.frame
## ---------------------------------

psmdf <- data.frame(spectrum = paste0("sp", 1:10),
                    sequence = replicate(10,
                                         paste(sample(getAminoAcids()[-1, "AA"], 10),
                                               collapse = "")),
                    protein = sample(paste0("Prot", LETTERS[1:7]), 10,
                                     replace = TRUE),
                    decoy = rep(FALSE, 10),
                    rank = rep(1, 10),
                    score = runif(10))
psmdf

psm <- PSM(psmdf)
psm
psmVariables(psm)

## no PSM variables set
try(adjacencyMatrix(psm))

## set PSM variables
psm <- PSM(psm, spectrum = "spectrum", peptide = "sequence",
           protein = "protein", decoy = "decoy", rank = "rank")
psm
psmVariables(psm)

adjacencyMatrix(psm)
}
