% Generated by roxygen2: do not edit by hand
% Please edit documentation in R/jaccard_logical_joins.R
\name{jaccard_full_join}
\alias{jaccard_full_join}
\title{Fuzzy full-join using minhashing}
\usage{
jaccard_full_join(
  a,
  b,
  by = NULL,
  block_by = NULL,
  n_gram_width = 2,
  n_bands = 50,
  band_width = 8,
  threshold = 0.7,
  progress = FALSE,
  clean = FALSE,
  similarity_column = NULL
)
}
\arguments{
\item{a}{the first dataframe you wish to join.}

\item{b}{the second dataframe you wish to join.}

\item{by}{a named vector indicating which columns to join on. Format should
be the same as dplyr: \code{by = c("column_name_in_df_a" = "column_name_in_df_b")}, but
two columns must be specified in each dataset (x column and y column). Specifications
made with \code{dplyr::join_by()} are also accepted.}

\item{block_by}{a named vector indicating which column to block on, such that
rows that disagree on this field cannot be considered a match. Format should
be the same as dplyr: \code{block_by = c("column_name_in_df_a" =
"column_name_in_df_b")}.}

\item{n_gram_width}{the length of the n_grams used in calculating the
jaccard similarity. For best performance, set this large enough that the
chance any string has a specific n_gram is low (e.g. \code{n_gram_width} = 2
or 3 when matching on first names, 5 or 6 when matching on entire
sentences).}

\item{n_bands}{the number of bands used in the minhash algorithm (default
is 50). Use this in conjunction with the \code{band_width} to determine the
performance of the hashing. The default settings are for a
(.2,.8,.001,.999)-sensitive hash i.e. that pairs with a similarity of less
than .2 have a <.1\% chance of being compared, while pairs with a similarity
of greater than .8 have a >99.9\% chance of being compared.}

\item{band_width}{the length of each band used in the minhashing algorithm
(default is 8). Use this in conjunction with the \code{n_bands} to determine
the performance of the hashing. The default settings are for a
(.2,.8,.001,.999)-sensitive hash i.e. that pairs with a similarity of less
than .2 have a <.1\% chance of being compared, while pairs with a similarity
of greater than .8 have a >99.9\% chance of being compared.}

\item{threshold}{the jaccard similarity threshold above which two strings
should be considered a match (default is .7). The similarity is equal to 1
minus the jaccard distance between the two strings, so 1 implies the strings
are identical, while a similarity of zero implies the strings are completely
dissimilar.}

\item{progress}{set to \code{TRUE} to print progress}

\item{clean}{should the strings that you fuzzy join on be cleaned (coerced
to lower-case, stripped of punctuation and spaces)? Default is FALSE}

\item{similarity_column}{an optional character vector. If provided, the data
frame will contain a column with this name giving the jaccard similarity
between the two fields. Extra column will not be present if anti-joining.}
}
\value{
a tibble fuzzily-joined on the basis of the variables in \code{by}. Tries
to adhere to the same standards as the dplyr-joins, and uses the same
logical joining patterns (e.g. an inner-join joins and keeps only observations in both datasets).
}
\description{
Fuzzy full-join using minhashing
}
\examples{
# Load the baby names data set
# (install it first with install.packages("babynames") if necessary)
library(babynames)

# Build the two data frames to be joined: the first 500 unique lower-cased
# names, and those same names with the characters matching [aeiouy] removed
lower_case_names <- tolower(unique(babynames$name))[1:500]
baby_names <- data.frame(name = lower_case_names)
baby_names_sans_vowels <- data.frame(
  name_wo_vowels = gsub("[aeiouy]", "", baby_names$name)
)

# Use the `jaccard_probability()` function to check the probability that two
# strings with similarity .8 will be matched under the banding parameters
# passed to it (30 and 8 here):
jaccard_probability(.8, 30, 8)

# Run the join:
joined_names <- jaccard_full_join(
  a = baby_names,
  b = baby_names_sans_vowels,
  by = c("name" = "name_wo_vowels"),
  threshold = .8,
  n_bands = 20,
  band_width = 6,
  n_gram_width = 1,
  clean = FALSE # default
)
joined_names
}
