% Generated by roxygen2: do not edit by hand
% Please edit documentation in R/strj-tokenize.R
\name{strj_tokenize}
\alias{strj_tokenize}
\title{Split text into tokens}
\usage{
strj_tokenize(
  text,
  format = c("list", "data.frame"),
  engine = c("stringi", "budoux", "tinyseg", "mecab", "sudachipy"),
  rcpath = NULL,
  mode = c("C", "B", "A"),
  split = FALSE
)
}
\arguments{
\item{text}{Character vector to be tokenized.}

\item{format}{Output format. Choose \code{list} or \code{data.frame}.}

\item{engine}{Tokenizer name. Choose one of 'stringi', 'budoux',
'tinyseg', 'mecab', or 'sudachipy'.
Note that the specified tokenizer must be installed and available when you use
'mecab' or 'sudachipy'.}

\item{rcpath}{Path to a settings file for 'MeCab' or 'sudachipy', if any.}

\item{mode}{Splitting mode for 'sudachipy'.}

\item{split}{Logical. If \code{TRUE}, the function splits the vector
into sentences using \code{stringi::stri_split_boundaries(type = "sentence")}
before tokenizing.}
}
\value{
A list or data.frame.
}
\description{
Splits text into tokens using the specified tokenizer.
}
\examples{
strj_tokenize(
  paste0(
    "\u3042\u306e\u30a4\u30fc\u30cf\u30c8",
    "\u30fc\u30f4\u30a9\u306e\u3059\u304d",
    "\u3068\u304a\u3063\u305f\u98a8"
  )
)
strj_tokenize(
  paste0(
    "\u3042\u306e\u30a4\u30fc\u30cf\u30c8",
    "\u30fc\u30f4\u30a9\u306e\u3059\u304d",
    "\u3068\u304a\u3063\u305f\u98a8"
  ),
  format = "data.frame"
)
}
