% Generated by roxygen2: do not edit by hand
% Please edit documentation in R/compare_motifs.R
\name{compare_motifs}
\alias{compare_motifs}
\title{Compare motifs.}
\usage{
compare_motifs(motifs, compare.to, db.scores, use.freq = 1,
  use.type = "PPM", method = "PCC", tryRC = TRUE, min.overlap = 6,
  min.mean.ic = 0.25, min.position.ic = 0, relative_entropy = FALSE,
  normalise.scores = FALSE, max.p = 0.01, max.e = 10, nthreads = 1,
  score.strat = "a.mean", output.report, output.report.max.print = 10)
}
\arguments{
\item{motifs}{See \code{\link[=convert_motifs]{convert_motifs()}} for acceptable motif formats.}

\item{compare.to}{\code{numeric} If missing, compares all motifs to all other motifs.
Otherwise compares all motifs to the specified motif(s).}

\item{db.scores}{\code{data.frame} or \code{DataFrame}. See \code{details}.}

\item{use.freq}{\code{numeric(1)}. For comparing the \code{multifreq} slot.}

\item{use.type}{\code{character(1)} One of \code{'PPM'} and \code{'ICM'}.
The latter allows for taking into account the background
frequencies if \code{relative_entropy = TRUE}. Note that \code{'ICM'} is not
allowed when \code{method} is \code{"ALLR"} or \code{"ALLR_LL"}.}

\item{method}{\code{character(1)} One of PCC, EUCL, SW, KL, ALLR, BHAT, HELL,
SEUCL, MAN, ALLR_LL, WEUCL, WPCC. See details.}

\item{tryRC}{\code{logical(1)} Try the reverse complement of the motifs as well,
report the best score.}

\item{min.overlap}{\code{numeric(1)} Minimum overlap required when aligning the
motifs. Setting this to a number higher than the width of the motifs
will not allow any overhangs. Can also be a number between 0 and 1,
representing the minimum fraction that the motifs must overlap.}

\item{min.mean.ic}{\code{numeric(1)} Minimum mean information content between the
two motifs for an alignment to be scored. This helps prevent scoring
alignments between low information content regions of two motifs. Note that
this can result in some comparisons failing if no alignment passes the
mean IC threshold. Use \code{\link[=average_ic]{average_ic()}} to filter out low IC motifs to get around
this if you want to avoid getting \code{NA}s in your output.}

\item{min.position.ic}{\code{numeric(1)} Minimum information content required between
individual alignment positions for it to be counted in the final alignment
score. It is recommended to use this together with \code{normalise.scores = TRUE},
as this will help punish scores resulting from only a fraction of an
alignment.}

\item{relative_entropy}{\code{logical(1)} Change the ICM calculation affecting
\code{min.position.ic} and \code{min.mean.ic}. See \code{\link[=convert_type]{convert_type()}}.}

\item{normalise.scores}{\code{logical(1)} Favour alignments which leave fewer
unaligned positions, as well as alignments between motifs of similar length.
Similarity scores are multiplied by the ratio of
aligned positions to the total number of positions in the larger motif,
and the inverse for distance scores.}

\item{max.p}{\code{numeric(1)} Maximum P-value allowed in reporting matches.
Only used if \code{compare.to} is set.}

\item{max.e}{\code{numeric(1)} Maximum E-value allowed in reporting matches.
Only used if \code{compare.to} is set. The E-value is the P-value multiplied
by the number of input motifs times two.}

\item{nthreads}{\code{numeric(1)} Run \code{\link[=compare_motifs]{compare_motifs()}} in parallel with \code{nthreads}
threads. \code{nthreads = 0} uses all available threads.}

\item{score.strat}{\code{character(1)} How to handle column scores calculated from
motif alignments. "sum": add up all scores. "a.mean": take the arithmetic
mean. "g.mean": take the geometric mean. "median": take the median.
"wa.mean", "wg.mean": weighted arithmetic/geometric mean. "fzt": Fisher
Z-transform. Weights are the
total information content shared between aligned columns.}

\item{output.report}{\code{character(1)} Provide a filename for \code{\link[=compare_motifs]{compare_motifs()}}
to write an HTML output report to. The top matches are shown alongside
figures of the match alignments. This requires the \code{knitr} and \code{rmarkdown}
packages. (Note: still in development.)}

\item{output.report.max.print}{\code{numeric(1)} Maximum number of top matches to
print.}
}
\value{
\code{matrix} if \code{compare.to} is missing; \code{DataFrame} otherwise. For the
latter, function args are stored in the \code{metadata} slot.
}
\description{
Compare motifs using one of the several available metrics. See the
"Motif comparisons and P-values" vignette for detailed information.
}
\details{
\subsection{Available metrics}{

The following metrics are available:
\itemize{
\item Euclidean distance (\code{EUCL}) (Choi et al. 2004)
\item Weighted Euclidean distance (\code{WEUCL})
\item Kullback-Leibler divergence (\code{KL}) (Kullback and Leibler 1951; Roepcke et al. 2005)
\item Hellinger distance (\code{HELL}) (Hellinger 1909)
\item Squared Euclidean distance (\code{SEUCL})
\item Manhattan distance (\code{MAN})
\item Pearson correlation coefficient (\code{PCC})
\item Weighted Pearson correlation coefficient (\code{WPCC})
\item Sandelin-Wasserman similarity (\code{SW}), or sum of squared distances (Sandelin and Wasserman 2004)
\item Average log-likelihood ratio (\code{ALLR}) (Wang and Stormo 2003)
\item Lower limit ALLR (\code{ALLR_LL}) (Mahony et al. 2007)
\item Bhattacharyya coefficient (\code{BHAT}) (Bhattacharyya 1943)
}

Comparisons are calculated between two motifs at a time. All possible alignments
are scored, and the best score is reported. In an alignment, scores are calculated
individually between columns. How those scores are combined to generate the final
alignment scores depends on \code{score.strat}.

See the "Motif comparisons and P-values" vignette for a description of the
various metrics. Note that \code{PCC}, \code{WPCC}, \code{SW}, \code{ALLR}, \code{ALLR_LL} and \code{BHAT}
are similarities;
higher values mean more similar motifs. For the remaining metrics, values closer
to zero represent more similar motifs.

Small pseudocounts are automatically added when one of the following methods
is used: \code{KL}, \code{ALLR}, \code{ALLR_LL}, \code{IS}. This is to avoid
zeros in the calculations.
}

\subsection{Calculating P-values}{

P-values are pre-computed using the
\code{\link[=make_DBscores]{make_DBscores()}} function. If \code{db.scores} is not given, a set of
internal precomputed P-values from the JASPAR2018 CORE motifs is used. These precalculated
scores are dependent on the length of the motifs being compared. This takes
into account that comparing small motifs with larger motifs leads to higher
scores, since the probability of finding a higher scoring alignment is
higher.

The default P-values have been precalculated for regular DNA motifs. They
are of little use for motifs with a different number of alphabet letters
(or even the \code{multifreq} slot).
}
}
\examples{
motif1 <- create_motif(name = "1")
motif2 <- create_motif(name = "2")
motif1vs2 <- compare_motifs(c(motif1, motif2), method = "PCC")
## To get a dist object:
as.dist(1 - motif1vs2)

motif3 <- create_motif(name = "3")
motif4 <- create_motif(name = "4")
motifs <- c(motif1, motif2, motif3, motif4)
## Compare motif "2" to all the other motifs:
if (R.Version()$arch != "i386") {
compare_motifs(motifs, compare.to = 2, max.p = 1, max.e = Inf)
}

## If you are working with a large list of motifs and the min.mean.ic
## option is not set to zero, you may get a number of failed comparisons
## due to low IC. To filter the list of motifs to avoid these, use
## the average_ic() function to remove motifs with low average IC:
\dontrun{
library(MotifDb)
motifs <- convert_motifs(MotifDb)[1:100]
compare_motifs(motifs)
#> Warning in compare_motifs(motifs) :
#>   Some comparisons failed due to low IC
motifs <- motifs[average_ic(motifs) > 0.5]
compare_motifs(motifs)
}


}
\references{
Bhattacharyya A (1943). “On a measure of divergence between two
statistical populations defined by their probability
distributions.” \emph{Bulletin of the Calcutta Mathematical Society},
\strong{35}, 99-109.

Choi I, Kwon J, Kim S (2004). “Local feature frequency profile: a
method to measure structural similarity in proteins.” \emph{PNAS},
\strong{101}, 3797-3802.

Hellinger E (1909). “Neue Begründung der Theorie quadratischer
Formen von unendlichvielen Veränderlichen.” \emph{Journal für die reine
und angewandte Mathematik}, \strong{136}, 210-271.

Khan A, Fornes O, Stigliani A, Gheorghe M, Castro-Mondragon JA,
van der Lee R, Bessy A, Cheneby J, Kulkarni SR, Tan G, Baranasic
D, Arenillas DJ, Sandelin A, Vandepoele K, Lenhard B, Ballester B,
Wasserman WW, Parcy F, Mathelier A (2018). “JASPAR 2018: update of
the open-access database of transcription factor binding profiles
and its web framework.” \emph{Nucleic Acids Research}, \strong{46}, D260-D266.

Kullback S, Leibler RA (1951). “On information and sufficiency.”
\emph{The Annals of Mathematical Statistics}, \strong{22}, 79-86.

Itakura F, Saito S (1968). “Analysis synthesis telephony based on
the maximum likelihood method.” In \emph{6th International Congress on
Acoustics}, C-17.

Mahony S, Auron PE, Benos PV (2007). “DNA Familial Binding
Profiles Made Easy: Comparison of Various Motif Alignment and
Clustering Strategies.” \emph{PLoS Computational Biology}, \strong{3}.

Pietrokovski S (1996). “Searching databases of conserved sequence
regions by aligning protein multiple-alignments.” \emph{Nucleic Acids
Research}, \strong{24}, 3836-3845.

Roepcke S, Grossmann S, Rahmann S, Vingron M (2005). “T-Reg
Comparator: an analysis tool for the comparison of position weight
matrices.” \emph{Nucleic Acids Research}, \strong{33}, W438-W441.

Sandelin A, Wasserman WW (2004). “Constrained binding site
diversity within families of transcription factors enhances
pattern discovery bioinformatics.” \emph{Journal of Molecular Biology},
\strong{338}, 207-215.

Wang T, Stormo GD (2003). “Combining phylogenetic data with
co-regulated genes to identify motifs.” \emph{Bioinformatics}, \strong{19},
2369-2380.
}
\seealso{
\code{\link[=convert_motifs]{convert_motifs()}}, \code{\link[=motif_tree]{motif_tree()}}, \code{\link[=view_motifs]{view_motifs()}},
\code{\link[=make_DBscores]{make_DBscores()}}
}
\author{
Benjamin Jean-Marie Tremblay, \email{benjamin.tremblay@uwaterloo.ca}
}
