% Generated by roxygen2: do not edit by hand
% Please edit documentation in R/utils_transformer.R
\name{calc_tokenizer_statistics}
\alias{calc_tokenizer_statistics}
\title{Estimate tokenizer statistics}
\usage{
calc_tokenizer_statistics(dataset, step = "creation")
}
\arguments{
\item{dataset}{Object of class \code{datasets.arrow_dataset.Dataset}. The data set must contain a column \code{"length"}
containing the number of tokens for every sequence and a column \code{"word_ids"} containing the word ids within every
sequence.}

\item{step}{\code{string} indicating to which step the statistics belong. Recommended values are
\itemize{
\item \code{"creation"} for the creation of the tokenizer.
\item \code{"initial_training"} for the first training of the transformer.
\item \code{"fine_tuning"} for all following trainings of the transformer.
\item \code{"training"} for a training run of the transformer.
}}
}
\value{
Returns a \code{list} with the following entries:
\itemize{
\item n_sequences: Number of sequences
\item n_words: Number of words in the whole corpus
\item n_tokens: Number of tokens in the whole corpus
\item mu_t: \eqn{n\_tokens / n\_sequences}{n_tokens / n_sequences}
\item mu_w: \eqn{n\_words / n\_sequences}{n_words / n_sequences}
\item mu_g: \eqn{n\_tokens / n\_words}{n_tokens / n_words}
}
}
\description{
Function for estimating the tokenizer statistics described by Kaya & Tantuğ (2024).
}
\references{
Kaya, Y. B., & Tantuğ, A. C. (2024). Effect of tokenization granularity for Turkish large language
models. Intelligent Systems with Applications, 21, 200335. \url{https://doi.org/10.1016/j.iswa.2024.200335}
}
\concept{Utils Transformers Developers}
