% Generated by roxygen2: do not edit by hand
% Please edit documentation in R/NesPrInDT.R
\name{NesPrInDT}
\alias{NesPrInDT}
\title{Nested \code{\link{PrInDT}} with additional undersampling of a factor with two unbalanced levels}
\usage{
NesPrInDT(datain,classname,ctestv=NA,N,plarge,psmall=1.0,conf.level=0.95,
       thres=0.5,stratvers=0,strat=NA,seedl=TRUE,nesvar,nesunder,repin,
       minsplit=NA,minbucket=NA)
}
\arguments{
\item{datain}{Input data frame with class factor variable 'classname' and the\cr
influential variables, which need to be factors or numericals (transform logicals and character variables to factors)}

\item{classname}{Name of class variable (character)}

\item{ctestv}{Vector of character strings of forbidden split results;\cr
(see function \code{\link{PrInDT}} for details.)\cr
If no restrictions exist, the default = NA is used.}

\item{N}{Number of repetitions (integer > 0)}

\item{plarge}{Undersampling percentage of larger class (numerical, > 0 and <= 1)}

\item{psmall}{Undersampling percentage of smaller class (numerical, > 0 and <= 1);\cr
default = 1}

\item{conf.level}{(1 - significance level) in function \code{ctree} (numerical, > 0 and <= 1);\cr
default = 0.95}

\item{thres}{Probability threshold for prediction of smaller class; default = 0.5}

\item{stratvers}{Version of stratification;\cr
= 0: none (default),\cr
= 1: stratification according to the percentages of the values of the factor variable 'strat',\cr
> 1: stratification with minimum number 'stratvers' of observations per value of 'strat'}

\item{strat}{Name of one (!) stratification variable for undersampling (character);\cr
default = NA (no stratification)}

\item{seedl}{Should the seed for random numbers be set (TRUE / FALSE)?\cr
default = TRUE}

\item{nesvar}{Name of factor to be undersampled (character)}

\item{nesunder}{Data of factor to be undersampled (integer)}

\item{repin}{Number of repetitions (integer) for undersampling of 'nesvar'}

\item{minsplit}{Minimum number of elements in a node to be split;\cr
default = 20}

\item{minbucket}{Minimum number of elements in a node;\cr
default = 7}
}
\value{
\describe{  
\item{undba}{balanced accuracies on undersamples}
\item{imax}{indices of best trees on undersamples}
\item{undba3en}{balanced accuracies of ensembles of 3 best trees on undersamples}
\item{accF}{balanced accuracies on full sample}
\item{accE}{balanced accuracy on full sample of best ensemble of 3 trees from undersampling}
\item{maxt}{indices of best trees on full sample}
\item{treesb}{3 best trees of all undersamples of 'nesunder'; refer to an individual tree as \code{treesb[[k]]}, k = 1, ..., 3*repin}
}
}
\description{
Function for additional undersampling of the factor 'nesvar' with two unbalanced levels to avoid dominance of the level with higher frequency.
The factor 'nesvar' does not need to be part of the input data frame 'datain'. The data of this factor is given in the vector 'nesunder'. 
The observations in 'nesunder' have to represent the same cases as in 'datain' in the same ordering.\cr
\code{\link{PrInDT}} is called 'repin' times with subsamples of the original data so that the level with the larger frequency in the vector 'nesunder' has 
approximately the same number of values as the level with the smaller frequency.\cr
Only the arguments 'nesvar', 'nesunder', and 'repin' relate to the additional undersampling; all the other arguments relate to the standard 
\code{\link{PrInDT}} procedure. \cr As in \code{\link{PrInDT}}, the aim is to optimally model the relationship between the two-class factor variable 'classname' and all other factor and  
numerical variables in the data frame 'datain' by means of 'N' repetitions of undersampling. The trees generated by \code{\link{PrInDT}} can be
restricted by excluding unacceptable trees which include split results specified in the character strings of the vector 'ctestv'.\cr
The probability threshold 'thres' for the prediction of the smaller class may be specified (default = 0.5).\cr
Undersampling may be stratified in two ways by the feature 'strat'.\cr
The results are evaluated on the full sample and on the subsamples of 'nesunder'.
The parameters 'conf.level', 'minsplit', and 'minbucket' can be used to control the size of the trees.\cr

\strong{Reference} \cr Weihs, C., Buschfeld, S. 2021b. NesPrInDT: Nested undersampling in PrInDT. 
arXiv:2103.14931
}
\details{
Standard output can be produced by means of \code{print(name)} or just \code{ name } as well as \code{plot(name)}  where 'name' is the output data 
frame of the function.\cr
The plot function will produce a series of more than one plot. If you use R, you might want to specify \code{windows(record=TRUE)} before 
\code{plot(name)} to save the whole series of plots. In RStudio this functionality is provided automatically.
}
\examples{
# Prepare the input: a data frame that holds the class variable together
#   with factors and numericals only (character variables are not allowed)!!
data <- na.omit(PrInDT::data_speaker)
# The factor to be undersampled is handed over as a separate vector and
#   is then removed from the data frame itself
nesvar <- "SPEAKER"
nesunder <- data$SPEAKER
data[,nesvar] <- list(NULL)
# Settings of the inner PrInDT runs on each nesunder-subsample
N <- 49  # number of repetitions in the inner loop
plarge <- 0.06 # sampling percentage for the larger class
psmall <- 1 # sampling percentage for the smaller class
# Nested undersampling with 5 subsamples of nesunder
outNes <- NesPrInDT(data,"class",ctestv=NA,N,plarge,psmall,conf.level=0.95,nesvar=nesvar,
  nesunder=nesunder,repin=5)
# Standard output, the series of plots, and a histogram of the accuracies
outNes
plot(outNes)
hist(outNes$undba,main=" ",xlab = "balanced accuracies of 3 best trees of all undersamples")

}
