% Generated by roxygen2: do not edit by hand
% Please edit documentation in R/predefined_tests.R
\name{predefined_tests}
\alias{predefined_tests}
\alias{exact_match}
\alias{range_match}
\alias{range_match_legacy}
\alias{prob_link}
\alias{true}
\alias{false}
\title{Predefined logical tests in \bold{\code{diyar}}}
\usage{
exact_match(x, y)

range_match(x, y, range = 10)

range_match_legacy(x, y)

prob_link(
  x,
  y,
  cmp_func,
  attr_threshold,
  score_threshold,
  probabilistic,
  return_weights = FALSE
)

true(x, y)

false(x, y)
}
\arguments{
\item{x}{Attribute(s) to be compared against.}

\item{y}{Attribute(s) to be compared by.}

\item{range}{Difference between \code{y} and \code{x}.}

\item{cmp_func}{Logical tests such as string comparators. See \code{\link{links_wf_probabilistic}}.}

\item{attr_threshold}{Matching set of weight thresholds for each result of \code{cmp_func}. See \code{\link{links_wf_probabilistic}}.}

\item{score_threshold}{Score threshold determining matched or linked records. See \code{\link{links_wf_probabilistic}}.}

\item{probabilistic}{If \code{TRUE}, matches are determined using a score based on the Fellegi-Sunter model for probabilistic linkage. See \code{\link{links_wf_probabilistic}}.}

\item{return_weights}{If \code{TRUE}, returns the match-weights and score-thresholds for record pairs.}
}
\description{
A collection of predefined logical tests used with \bold{\code{\link{sub_criteria}}} objects
}
\details{
\bold{\code{exact_match()}} - test that \code{x == y}

\bold{\code{range_match()}} - test that \code{x} \eqn{\le} \code{y} \eqn{\le} \code{(x + range)}

\bold{\code{range_match_legacy()}} - test that \code{overlap(as.number_line(x@gid), y)} is \code{TRUE}.

\bold{\code{prob_link()}} - Test that a record-pair relates to the same entity, based on the Fellegi and Sunter (1969) model for deciding if two records belong to the same entity.

In summary, record-pairs are created and categorised as matches and non-matches (\code{attr_threshold}) with user-defined functions (\code{cmp_func}).
If \code{probabilistic} is \code{TRUE}, two probabilities (\code{m} and \code{u}) are used to calculate weights for matches and non-matches.
The \code{m}-probability is the probability that matched records are actually from the same entity i.e. a true match,
while \code{u}-probability is the probability that matched records are not from the same entity i.e. a false match.
Record-pairs whose total score is above a certain threshold (\code{score_threshold}) are assumed to belong to the same entity.

Agreement (match) and disagreement (non-match) scores are calculated as described by Asher et al. (2020).

For each record pair, an agreement score for attribute \eqn{i} is calculated as:

\deqn{\log_{2}(m_{i}/u_{i})}{log_2 (m_i / u_i)}

For each record pair, a disagreement score for attribute \eqn{i} is calculated as:

\deqn{\log_{2}((1-m_{i})/(1-u_{i}))}{log_2 ((1-m_i) / (1-u_i))}

where \eqn{m_{i}}{m_i} and \eqn{u_{i}}{u_i} are the \code{m} and \code{u}-probabilities for each value of attribute \eqn{i}.

Note that each probability is calculated as a combined probability for the record pair.
For example, if the values of the record-pair have \code{u}-probabilities of \code{0.1} and \code{0.2} respectively,
then the \code{u}-probability for the pair will be \code{0.02}.

Missing data (\code{NA}) are considered non-matches and assigned a \code{u}-probability of \code{0}.
}
\examples{
`exact_match`
exact_match(x = 1, y = 1)
exact_match(x = 1, y = 2)

`range_match`
range_match(x = 10, y = 16, range = 6)
range_match(x = 16, y = 10, range = 6)

`range_match_legacy`
x_nl <- number_line(10, 16, gid = 10)
y_nl1 <- number_line(16, 10)
y_nl2 <- number_line(16, 10)

range_match_legacy(x = x_nl, y = y_nl1)
range_match_legacy(x = x_nl, y = y_nl2)

}
