% Generated by roxygen2: do not edit by hand
% Please edit documentation in R/fuzzy_w.R
\docType{class}
\name{FuzzExtract}
\alias{FuzzExtract}
\title{Fuzzy extraction from a sequence}
\usage{
# init <- FuzzExtract$new(decoding = NULL)
}
\description{
Fuzzy extraction from a sequence

Fuzzy extraction from a sequence
}
\details{
the \emph{decoding} parameter is useful in case of non-ASCII character strings. If this parameter is not NULL then the \emph{force_ascii} parameter (if applicable) is internally set to FALSE. Decoding applies only to Python 2 configurations, as in Python 3 character strings are decoded to Unicode by default.

the \emph{Extract} method selects the best match of a character string vector. It returns a list with the match and its score.

the \emph{ExtractBests} method returns a list of the best matches for a sequence of character strings.

the \emph{ExtractWithoutOrder} method returns the best match of a character string vector (in Python it returns a generator of tuples containing the match and its score).

the \emph{ExtractOne} method finds the single best match above a score for a character string vector. This is a convenience method which returns the single best choice.

the \emph{Dedupe} method is a convenience method which takes a character string vector containing duplicates and uses fuzzy matching to identify and remove duplicates. Specifically, it uses the \emph{Extract} method
to identify duplicates that score greater than a user defined threshold. Then, it looks for the longest item in the duplicate vector since we assume this item contains the most entity information and returns that.
It breaks string length ties on an alphabetical sort. Note: as the threshold DECREASES the number of duplicates that are found INCREASES. This means that the returned deduplicated list will likely be shorter.
Raise the threshold for \emph{Dedupe} to be less sensitive.
}
\section{Methods}{


\describe{
 \item{\code{FuzzExtract$new(decoding = NULL)}}{}

 \item{\code{--------------}}{}

 \item{\code{Extract(string = NULL, sequence_strings = NULL, processor = NULL, scorer = NULL, limit = 5L)}}{}

 \item{\code{--------------}}{}

 \item{\code{ExtractBests(string = NULL, sequence_strings = NULL, processor = NULL, scorer = NULL, score_cutoff = 0L, limit = 5L)}}{}

 \item{\code{--------------}}{}

 \item{\code{ExtractWithoutOrder(string = NULL, sequence_strings = NULL, processor = NULL, scorer = NULL, score_cutoff = 0L)}}{}

 \item{\code{--------------}}{}

 \item{\code{ExtractOne(string = NULL, sequence_strings = NULL, processor = NULL, scorer = NULL, score_cutoff = 0L)}}{}

 \item{\code{--------------}}{}

 \item{\code{Dedupe(contains_dupes = NULL, threshold = 70L, scorer = NULL)}}{}

 }
}

\examples{

try({
  if (reticulate::py_available(initialize = FALSE)) {

    if (check_availability()) {

      library(fuzzywuzzyR)

      word = "new york jets"

      choices = c("Atlanta Falcons", "New York Jets", "New York Giants", "Dallas Cowboys")

      duplicat = c('Frodo Baggins', 'Tom Sawyer', 'Bilbo Baggin', 'Samuel L. Jackson',

                   'F. Baggins', 'Frody Baggins', 'Bilbo Baggins')

      #------------
      # processor :
      #------------

      init_proc = FuzzUtils$new()

      PROC = init_proc$Full_process    # class process-method

      PROC1 = tolower                  # base R function

      #---------
      # scorer :
      #---------

      init_scor = FuzzMatcher$new()

      SCOR = init_scor$WRATIO


      init <- FuzzExtract$new()

      init$Extract(string = word, sequence_strings = choices, processor = PROC, scorer = SCOR)

      init$ExtractBests(string = word, sequence_strings = choices, processor = PROC1,

                        scorer = SCOR, score_cutoff = 0L, limit = 2L)

      init$ExtractWithoutOrder(string = word, sequence_strings = choices, processor = PROC,

                               scorer = SCOR, score_cutoff = 0L)

      init$ExtractOne(string = word, sequence_strings = choices, processor = PROC,

                      scorer = SCOR, score_cutoff = 0L)

      init$Dedupe(contains_dupes = duplicat, threshold = 70L, scorer = SCOR)

    }
  }
}, silent=TRUE)
}
\references{
\url{https://github.com/seatgeek/fuzzywuzzy/blob/master/fuzzywuzzy/process.py}, \url{https://docs.python.org/3/library/codecs.html#standard-encodings}
}
\section{Methods}{
\subsection{Public methods}{
\itemize{
\item \href{#method-new}{\code{FuzzExtract$new()}}
\item \href{#method-Extract}{\code{FuzzExtract$Extract()}}
\item \href{#method-ExtractBests}{\code{FuzzExtract$ExtractBests()}}
\item \href{#method-ExtractWithoutOrder}{\code{FuzzExtract$ExtractWithoutOrder()}}
\item \href{#method-ExtractOne}{\code{FuzzExtract$ExtractOne()}}
\item \href{#method-Dedupe}{\code{FuzzExtract$Dedupe()}}
\item \href{#method-clone}{\code{FuzzExtract$clone()}}
}
}
\if{html}{\out{<hr>}}
\if{html}{\out{<a id="method-new"></a>}}
\if{latex}{\out{\hypertarget{method-new}{}}}
\subsection{Method \code{new()}}{
\subsection{Usage}{
\if{html}{\out{<div class="r">}}\preformatted{FuzzExtract$new(decoding = NULL)}\if{html}{\out{</div>}}
}

\subsection{Arguments}{
\if{html}{\out{<div class="arguments">}}
\describe{
\item{\code{decoding}}{either NULL or a character string. If not NULL then the \emph{decoding} parameter takes one of the standard python encodings (such as 'utf-8'). See the \emph{details} and \emph{references} link for more information.}
}
\if{html}{\out{</div>}}
}
}
\if{html}{\out{<hr>}}
\if{html}{\out{<a id="method-Extract"></a>}}
\if{latex}{\out{\hypertarget{method-Extract}{}}}
\subsection{Method \code{Extract()}}{
\subsection{Usage}{
\if{html}{\out{<div class="r">}}\preformatted{FuzzExtract$Extract(
  string = NULL,
  sequence_strings = NULL,
  processor = NULL,
  scorer = NULL,
  limit = 5L
)}\if{html}{\out{</div>}}
}

\subsection{Arguments}{
\if{html}{\out{<div class="arguments">}}
\describe{
\item{\code{string}}{a character string.}

\item{\code{sequence_strings}}{a character string vector}

\item{\code{processor}}{either NULL or a function of the form f(a) -> b, where a is the query or individual choice and b is the choice to be used in matching. See the examples for more details.}

\item{\code{scorer}}{a function for scoring matches between the query and an individual processed choice. This should be a function of the form f(query, choice) -> int. By default, FuzzMatcher.WRATIO() is used and expects both query and choice to be strings. See the examples for more details.}

\item{\code{limit}}{An integer value for the maximum number of elements to be returned. Defaults to 5L}
}
\if{html}{\out{</div>}}
}
}
\if{html}{\out{<hr>}}
\if{html}{\out{<a id="method-ExtractBests"></a>}}
\if{latex}{\out{\hypertarget{method-ExtractBests}{}}}
\subsection{Method \code{ExtractBests()}}{
\subsection{Usage}{
\if{html}{\out{<div class="r">}}\preformatted{FuzzExtract$ExtractBests(
  string = NULL,
  sequence_strings = NULL,
  processor = NULL,
  scorer = NULL,
  score_cutoff = 0L,
  limit = 5L
)}\if{html}{\out{</div>}}
}

\subsection{Arguments}{
\if{html}{\out{<div class="arguments">}}
\describe{
\item{\code{string}}{a character string.}

\item{\code{sequence_strings}}{a character string vector}

\item{\code{processor}}{either NULL or a function of the form f(a) -> b, where a is the query or individual choice and b is the choice to be used in matching. See the examples for more details.}

\item{\code{scorer}}{a function for scoring matches between the query and an individual processed choice. This should be a function of the form f(query, choice) -> int. By default, FuzzMatcher.WRATIO() is used and expects both query and choice to be strings. See the examples for more details.}

\item{\code{score_cutoff}}{an integer value for the score threshold. No matches with a score less than this number will be returned. Defaults to 0}

\item{\code{limit}}{An integer value for the maximum number of elements to be returned. Defaults to 5L}
}
\if{html}{\out{</div>}}
}
}
\if{html}{\out{<hr>}}
\if{html}{\out{<a id="method-ExtractWithoutOrder"></a>}}
\if{latex}{\out{\hypertarget{method-ExtractWithoutOrder}{}}}
\subsection{Method \code{ExtractWithoutOrder()}}{
\subsection{Usage}{
\if{html}{\out{<div class="r">}}\preformatted{FuzzExtract$ExtractWithoutOrder(
  string = NULL,
  sequence_strings = NULL,
  processor = NULL,
  scorer = NULL,
  score_cutoff = 0L
)}\if{html}{\out{</div>}}
}

\subsection{Arguments}{
\if{html}{\out{<div class="arguments">}}
\describe{
\item{\code{string}}{a character string.}

\item{\code{sequence_strings}}{a character string vector}

\item{\code{processor}}{either NULL or a function of the form f(a) -> b, where a is the query or individual choice and b is the choice to be used in matching. See the examples for more details.}

\item{\code{scorer}}{a function for scoring matches between the query and an individual processed choice. This should be a function of the form f(query, choice) -> int. By default, FuzzMatcher.WRATIO() is used and expects both query and choice to be strings. See the examples for more details.}

\item{\code{score_cutoff}}{an integer value for the score threshold. No matches with a score less than this number will be returned. Defaults to 0}
}
\if{html}{\out{</div>}}
}
}
\if{html}{\out{<hr>}}
\if{html}{\out{<a id="method-ExtractOne"></a>}}
\if{latex}{\out{\hypertarget{method-ExtractOne}{}}}
\subsection{Method \code{ExtractOne()}}{
\subsection{Usage}{
\if{html}{\out{<div class="r">}}\preformatted{FuzzExtract$ExtractOne(
  string = NULL,
  sequence_strings = NULL,
  processor = NULL,
  scorer = NULL,
  score_cutoff = 0L
)}\if{html}{\out{</div>}}
}

\subsection{Arguments}{
\if{html}{\out{<div class="arguments">}}
\describe{
\item{\code{string}}{a character string.}

\item{\code{sequence_strings}}{a character string vector}

\item{\code{processor}}{either NULL or a function of the form f(a) -> b, where a is the query or individual choice and b is the choice to be used in matching. See the examples for more details.}

\item{\code{scorer}}{a function for scoring matches between the query and an individual processed choice. This should be a function of the form f(query, choice) -> int. By default, FuzzMatcher.WRATIO() is used and expects both query and choice to be strings. See the examples for more details.}

\item{\code{score_cutoff}}{an integer value for the score threshold. No matches with a score less than this number will be returned. Defaults to 0}
}
\if{html}{\out{</div>}}
}
}
\if{html}{\out{<hr>}}
\if{html}{\out{<a id="method-Dedupe"></a>}}
\if{latex}{\out{\hypertarget{method-Dedupe}{}}}
\subsection{Method \code{Dedupe()}}{
\subsection{Usage}{
\if{html}{\out{<div class="r">}}\preformatted{FuzzExtract$Dedupe(contains_dupes = NULL, threshold = 70L, scorer = NULL)}\if{html}{\out{</div>}}
}

\subsection{Arguments}{
\if{html}{\out{<div class="arguments">}}
\describe{
\item{\code{contains_dupes}}{a vector of strings that we would like to dedupe}

\item{\code{threshold}}{a numeric value between 0 and 100: the score above which items are considered duplicates. Defaults to 70 out of 100}

\item{\code{scorer}}{a function for scoring matches between the query and an individual processed choice. This should be a function of the form f(query, choice) -> int. By default, FuzzMatcher.WRATIO() is used and expects both query and choice to be strings. See the examples for more details.}
}
\if{html}{\out{</div>}}
}
}
\if{html}{\out{<hr>}}
\if{html}{\out{<a id="method-clone"></a>}}
\if{latex}{\out{\hypertarget{method-clone}{}}}
\subsection{Method \code{clone()}}{
The objects of this class are cloneable with this method.
\subsection{Usage}{
\if{html}{\out{<div class="r">}}\preformatted{FuzzExtract$clone(deep = FALSE)}\if{html}{\out{</div>}}
}

\subsection{Arguments}{
\if{html}{\out{<div class="arguments">}}
\describe{
\item{\code{deep}}{Whether to make a deep clone.}
}
\if{html}{\out{</div>}}
}
}
}
