% Generated by roxygen2 (4.1.1): do not edit by hand
% Please edit documentation in R/fuzzy_join.R
\name{fuzzy_join}
\alias{fuzzy_join}
\title{Fuzzy join two data.tables together}
\usage{
fuzzy_join(x, y, exact = NULL, exact.or.NA = NULL, fuzzy = NULL,
  gen = "distance", suffixes = c(".x", ".y"), which = FALSE, w = rep(1,
  length(fuzzy)), na.score = 1/3, method = "jw", p = 0.1, ...)
}
\arguments{
\item{x}{The master data.table}

\item{y}{The using data.table}

\item{exact}{Character vector specifying variables on which to match exactly.}

\item{exact.or.NA}{Character vector specifying variables that should not differ if both are non missing.}

\item{fuzzy}{Character vector specifying columns on which to match in a fuzzy way}

\item{gen}{Name of the new variable holding the distance between matched observations. Defaults to "distance".}

\item{suffixes}{A character vector of length 2 specifying the suffixes of overlapping columns. Defaults to ".x" and ".y".}

\item{which}{With \code{which = TRUE}, returns a three-column data.table where the first column corresponds to \code{x}'s row number, the second column corresponds to \code{y}'s row number and the third column corresponds to the score of the match. Default is \code{FALSE}, which returns a join with the rows in \code{y}.}

\item{w}{Numeric vector of the same length as \code{fuzzy} specifying the weights to use when summing across the different columns of \code{fuzzy}. Defaults to \code{rep(1, length(fuzzy))}.}

\item{na.score}{Numeric that specifies the distance between NA and another string. Defaults to \code{1/3}.}

\item{method}{See the \code{\link[stringdist]{stringdist}} documentation. Defaults to \code{"jw"}.}

\item{p}{See the \code{\link[stringdist]{stringdist}} documentation. Defaults to \code{0.1}.}

\item{...}{Other arguments to pass to \code{stringdist}. See the \code{\link[stringdist]{stringdist}} documentation.}
}
\description{
\code{fuzzy_join} uses record linkage methods to match observations between two datasets where no perfect key fields exist.  For each row in x, \code{fuzzy_join} finds the closest row(s) in y. The distance is a weighted average of the string distances defined in \code{method} over multiple columns.
}
\details{
Typically, \code{x} is a dataset with dirty names, while \code{y} is the dataset with true names. When \code{exact} or \code{exact.or.NA} is specified, rows without matches are returned with distance NA.
}
\examples{
library(data.table)
# Fuzzy match on two string columns, using the default weights
x <- data.table(a = c("france", "franc"), b = c("arras", "dijon"))
y <- data.table(a = c("franc", "france"), b = c("arvars", "dijjon"))
fuzzy_join(x, y, fuzzy = c("a", "b"))
# Same data, weighting column a more heavily than column b
fuzzy_join(x, y, fuzzy = c("a", "b"), w = c(0.9, 0.1))
# Same data, with a zero weight on column a
fuzzy_join(x,y, fuzzy = c("a", "b"), w = c(0, 0.9))
# Require an exact match on a and match fuzzily on b
x <- data.table(a = c(1, 1), b = c("arras", "dijon"))
y <- data.table(a = c(1, 1), b = c("arvars", "dijjon"))
fuzzy_join(x, y, exact = "a", fuzzy = "b")
# Here the second row of x has a = 2, a value that does not appear in y
x <- data.table(a = c(1, 2), b = c("arras", "dijon"))
y <- data.table(a = c(1, 1), b = c("arvars", "dijjon"))
fuzzy_join(x, y, exact = "a", fuzzy = "b")
# Here a is NA in the second row of x and a is passed as exact.or.NA
x <- data.table(a = c(1, NA), b = c("arras", "dijon"))
y <- data.table(a = c(1, 1), b = c("arvars", "dijjon"))
fuzzy_join(x, y, exact.or.NA = "a", fuzzy = "b")
}

