% Generated by roxygen2: do not edit by hand
% Please edit documentation in R/similarity.R
\name{apd_similarity}
\alias{apd_similarity}
\alias{apd_similarity.default}
\alias{apd_similarity.data.frame}
\alias{apd_similarity.matrix}
\alias{apd_similarity.formula}
\alias{apd_similarity.recipe}
\title{Applicability domain methods using binary similarity analysis}
\usage{
apd_similarity(x, ...)

\method{apd_similarity}{default}(x, quantile = NA_real_, ...)

\method{apd_similarity}{data.frame}(x, quantile = NA_real_, ...)

\method{apd_similarity}{matrix}(x, quantile = NA_real_, ...)

\method{apd_similarity}{formula}(formula, data, quantile = NA_real_, ...)

\method{apd_similarity}{recipe}(x, data, quantile = NA_real_, ...)
}
\arguments{
\item{x}{Depending on the context:
\itemize{
\item A \strong{data frame} of binary predictors.
\item A \strong{matrix} of binary predictors.
\item A \strong{recipe} specifying a set of preprocessing steps
created from \code{\link[recipes:recipe]{recipes::recipe()}}.
}}

\item{...}{Options to pass to \code{proxyC::simil()}, such as \code{method}. If no
options are specified, \code{method = "jaccard"} is used.}

\item{quantile}{A real number between 0 and 1, or \code{NA}, specifying how the
similarity values for each sample versus the training set should be
summarized. A value of \code{NA} specifies that the mean similarity is computed.
Otherwise, the appropriate quantile is computed.}

\item{formula}{A formula specifying the predictor terms on the right-hand
side. No outcome should be specified.}

\item{data}{When a \strong{recipe} or \strong{formula} is used, \code{data} is specified as:
\itemize{
\item A \strong{data frame} containing the binary predictors. Any predictors with
no 1's will be removed (with a warning).
}}
}
\value{
An \code{apd_similarity} object.
}
\description{
\code{apd_similarity()} is used to analyze samples in terms of similarity scores
for binary data. All features in the data should be binary (i.e. zero or
one).
}
\details{
The function computes measures of similarity for different sample
points. For example, suppose samples \code{A} and \code{B} both contain \emph{p} binary
variables. First, a 2x2 table is constructed between \code{A} and \code{B} \emph{across
their elements}. The table will contain \emph{p} entries across the four cells
(see the example below). From this, different measures of likeness are
computed.

For a training set of \emph{n} samples, a new sample is compared to each,
resulting in \emph{n} similarity scores. These can be summarized into a single
value; the mean similarity is used by default (\code{quantile = NA}), and a
quantile of the scores is used when \code{quantile} is given.

For this method, the computational methods are fairly taxing for large data
sets. The training set must be stored (albeit in a sparse matrix format) so
object sizes may become large.

By default, the computations are run in parallel using \emph{all possible
cores}. To change this, call the \code{setThreadOptions} function in the
\code{RcppParallel} package.
}
\examples{
\donttest{
data(qsar_binary)

# Build the similarity model from the binary training set. No method is
# passed, so the default Jaccard similarity is used.
jacc_sim <- apd_similarity(binary_tr)
jacc_sim

# plot the empirical cumulative distribution function (ECDF) for the training set:
library(ggplot2)
autoplot(jacc_sim)

# Example calculations for two samples:
A <- as.matrix(binary_tr[1,])
B <- as.matrix(binary_tr[2,])

# 2x2 cross-tabulation of the two samples across their elements. Assuming both
# 0 and 1 occur in each sample, the cells are:
#   xtab[1, 1]: feature absent from both samples
#   xtab[2, 2]: feature present in both samples
#   xtab[1, 2], xtab[2, 1]: feature present in only one sample
xtab <- table(A, B)
xtab

# Jaccard statistic: joint presences over features present in either sample
xtab[2, 2] / (xtab[1, 2] + xtab[2, 1] + xtab[2, 2])

# Hamman statistic: (matches - mismatches) over all features
( ( xtab[1, 1] + xtab[2, 2] ) - ( xtab[1, 2] + xtab[2, 1] ) ) / sum(xtab)

# Faith statistic: joint presences count fully, joint absences count half
( xtab[2, 2] + xtab[1, 1]/2 ) / sum(xtab)

# Summarize across all training set similarities
mean_sim <- score(jacc_sim, new_data = binary_unk)
mean_sim
}
}
\references{
Leach, A. and Gillet, V. (2007). \emph{An Introduction to
Chemoinformatics}. Springer, New York.
}
