% Generated by roxygen2: do not edit by hand
% Please edit documentation in R/twophase.R
\name{twophase}
\alias{twophase}
\title{twophase}
\usage{
twophase(
  formula,
  data,
  phase_id,
  cluster = NA,
  small_area = list(sa.col = NA, areas = NA, unbiased = TRUE),
  boundary_weights = NA,
  exhaustive = NA,
  progressbar = FALSE,
  psmall = FALSE
)
}
\arguments{
\item{formula}{an object of class "\code{\link[stats]{formula}}" as would be used in the function \code{\link[stats]{lm}}}

\item{data}{a data frame containing all variables contained in \code{formula} and a column indexing
phase membership.  Additional columns designating small-area membership, cluster ID and
boundary weights should also be contained in the data frame if they are
requested in the function.}

\item{phase_id}{an object of class "\code{\link[base]{list}}" containing two elements:
\itemize{
     \item \code{phase.col}: the column name in \code{data} that specifies the
                             phase membership of each observation
     \item \code{terrgrid.id}: the indicator identifying the terrestrial
                               (a.k.a. "ground truth") phase for that column
                                (must be of type "\code{\link[base]{numeric}}")
        }}

\item{cluster}{(\emph{Optional}) Specifies the column name in \code{data}
containing the cluster ID. Only used in case of
cluster sampling.}

\item{small_area}{(\emph{Optional}) a list containing three elements:
            \itemize{
                 \item \code{sa.col}: the column name in \code{data} containing
                                      domain identification
                 \item \code{areas}: vector of desired small-area domain identifiers
                 \item \code{unbiased}: an object of type "\code{\link[base]{logical}}"
                                        that when FALSE designates that the estimator is allowed to be
                                        biased (i.e. the synthetic estimator) and when TRUE forces
                                        it to be design-unbiased. See \emph{'Details'}.
                    }

            \strong{Note}: If \code{small_area} is left unchanged then \code{twophase} defaults to global estimation.}

\item{boundary_weights}{(\emph{Optional}) Specifies the column name in \code{data}
containing the weights for boundary adjustment.  See \emph{'Details'}.}

\item{exhaustive}{(\emph{Optional}) For global estimation, a vector of true auxiliary means corresponding to
an exhaustive first phase.
The vector must be input in the same order that \code{lm} processes a \code{formula} object
and include the intercept term.
For small area estimation, \code{exhaustive} is a \code{data.frame} containing column names
(\code{\link[base]{colnames}}) for every variable appearing in the parameter \code{formula} including
the variable "Intercept". Row names (\code{\link[base]{row.names}}) have to be used and must correspond
to the names of the small areas. See \emph{'Details'}.}

\item{progressbar}{(\emph{Optional}) an object of type "\code{\link[base]{logical}}" that when TRUE prints
the progress of the calculation in the console (recommended for a large number of small areas).  Defaults to FALSE.}

\item{psmall}{(\emph{Optional}) an object of type "\code{\link[base]{logical}}" used for small area estimations
that only works when \code{unbiased} in the parameter \code{small_area} is set to TRUE. See \emph{'Details'}.}
}
\value{
\code{twophase} returns an object of class \code{"twophase"}.

An object of class \code{"twophase"} is a \code{list} containing the following components:

 \item{input}{a \code{list} containing the function's inputs}
 \item{estimation}{a data frame containing the following components:
                  \itemize{
                   \item \code{area:} the domain (only present if argument \code{areas} has been used)
                   \item \code{estimate:} the point estimate
                   \item \code{ext_variance:} the external variance of the point estimate that doesn't account for
                                              fitting the model from the current inventory
                   \item \code{g_variance:} the internal (g-weight) variance that accounts for
                                              fitting the model from the current inventory
                   \item \code{n1:} the first phase sample size of plots
                   \item \code{n2:} the second phase (i.e. terrestrial) sample size of plots
                   \item \code{n1G:} the first phase sample size in the small area
                   \item \code{n2G:} the second phase (i.e. terrestrial) sample size in the small area
                   \item \code{r.squared:} the R squared of the linear model
                   }}
 \item{samplesizes}{a \code{\link[base]{data.frame}} summarizing all sample sizes: in case of cluster sampling,
                    both the number of individual plots and the number of clusters are reported.}
 \item{coefficients}{the linear model coefficients}
 \item{cov_coef}{the design-based covariance matrix of the model coefficients}
 \item{Z_bar_1G}{the estimated auxiliary means of \code{formula} based on the first phase.
                 If the first phase is exhaustive, these are the true auxiliary means specified in the input-argument \code{exhaustive}.}
 \item{cov_Z_bar_1G}{the covariance matrix of \code{Z_bar_1G}}
 \item{Rc_x_hat_G}{the small-area residuals at either the plot level or cluster level depending on the call}
 \item{Rc_x_hat}{the residuals at either the plot level or cluster level depending on the call}
 \item{Yx_s2G}{the local densities in the small area}
 \item{Mx_s2G}{the cluster weights in the small area}
 \item{mean_Rc_x_hat_G}{the mean residual (weighted mean in the case of cluster sampling) in the small area}
 \item{mean_Rc_x_hat}{the mean residual (weighted mean in the case of cluster sampling)}
 \item{warn.messages}{logical indicating if warning messages were issued}
}
\description{
\code{twophase} is used to calculate estimations based on double sampling under the
\emph{model-assisted Monte Carlo approach}. A \emph{first phase} of auxiliary information
(e.g. taken from remote sensing data) is used to generate model predictions based on multiple linear
regression  using the method of ordinary least squares. A subsample of the first phase comprises
the \emph{second phase} which contains terrestrial observations (i.e. the \emph{local densities}
of the ground truth) that is used to correct for bias in the design-based sense.
The estimation method is available for \emph{simple} and \emph{cluster sampling} and includes
the special case where the first phase is based on an \emph{exhaustive} sample (i.e. a census).
\emph{Small-area applications} are supported for synthetic estimation as well as two varieties
of bias-corrected estimators: the traditional small-area estimator and an asymptotically
equivalent version derived under Mandallaz' extended model approach.
}
\details{
If estimations for multiple small-area domains should be computed, the domains have to be
         defined within a \code{character} vector using \code{c()}. Using \code{small_area = list(..., unbiased = FALSE)}
         calculates design-based estimates with the synthetic estimator, which may be design-biased if
         the model is biased in that small area.  The default, \code{small_area = list(..., unbiased = TRUE)}, allows for a residual
         correction by one of two asymptotically equivalent methods to create design-unbiased estimates:
         \itemize{
             \item Mandallaz' extended model approach calculates the residual correction by extending the
                   model formula with an indicator variable in the small area.  It is the default method
                   (\code{psmall = FALSE}).
             \item the traditional small area estimator calculates the residual correction by taking the
                   synthetic estimator and adding the mean residual observed in the small area.  It is activated
                   when \code{psmall}=TRUE.
                 }

         Missing values (\code{NA}) in the auxiliary variables (i.e. at least one auxiliary variable cannot be observed at
         an inventory location) are automatically removed from the dataset \emph{before} the estimations are computed.
         Note that missingness in the auxiliary variables is only allowed if we assume that they are \emph{missing at random},
         since the unbiasedness of the estimates is based on the sampling design.

         The boundary weight adjustment is pertinent for auxiliary information derived from remote sensing and
         is equal to the percentage of forested area (e.g. as defined by a forest mask) in the interpretation area.

         Exhaustive estimation refers to the case where the true means of certain auxiliary variables are known
         from an exhaustive first phase (i.e. a census).  For global estimation, the \code{exhaustive} vector must be input
         in the same order that \code{lm} processes a \code{formula} object including the intercept term whose
         true mean will always be one.  For small area estimation, \code{exhaustive} is a \code{data.frame} containing column names for every variable appearing in
         the parameter \code{formula} including the variable "Intercept".  The observations of the data.frame
         must represent the true auxiliary means in the same order as was presented in \code{areas} from the
         parameter \code{small_area}.  See \emph{'Examples'}.
}
\note{
In the special case of cluster sampling, the reported sample sizes in \code{estimation} are the number of clusters.
The \code{samplesizes} object also provides the respective number of single plot units for cluster sampling.
The reported \code{r.squared} describes the model fit of the applied linear regression
model (i.e. on \emph{plot-level}, not on \emph{cluster level}).
}
\examples{

## load datasets:
data(grisons)
data(zberg)

# ------------------------------------------------#
# ----------- GLOBAL ESTIMATION ------------------#

#----
## 1) -- Design-based estimation with non-exhaustive auxiliary information
#----

# 1.1) non-cluster-sampling:
summary(twophase(formula = tvol ~mean + stddev + max + q75,
                 data = grisons,
                 phase_id = list(phase.col = "phase_id_2p", terrgrid.id = 2)))

# 1.2) cluster-sampling (see eqns. [57] and [58] in Mandallaz, Hill, Massey 2016):
summary(twophase(formula = basal ~ stade + couver + melange,
                data = zberg,
                phase_id = list(phase.col = "phase_id_2p", terrgrid.id = 2),
                cluster = "cluster"))

# 1.3) example for boundary weight adjustment (non-cluster example):
summary(twophase(formula=tvol ~ mean + stddev + max + q75,
                 data=grisons,
                 phase_id=list(phase.col = "phase_id_2p", terrgrid.id = 2),
                 boundary_weights = "boundary_weights"))

#----
## 2) -- Design-based estimation with exhaustive auxiliary information
#----

# establish order for vector of true auxiliary means:
colnames(lm(formula = tvol ~ mean + stddev + max + q75, data = grisons, x = TRUE)$x)
true.means <- c(1, 11.39, 8.84, 32.68, 18.03)

# 2.1) non-cluster-sampling:
summary(twophase(formula = tvol ~ mean + stddev + max + q75,
                 data = grisons,
                 phase_id = list(phase.col = "phase_id_2p", terrgrid.id = 2),
                 exhaustive = true.means))

# 2.2) cluster-sampling:
summary(twophase(formula = stem ~ stade + couver + melange,
                 data = zberg,
                 phase_id = list(phase.col = "phase_id_2p", terrgrid.id = 2),
                 cluster = "cluster",
                 exhaustive = c(1, 0.10, 0.7, 0.10, 0.6, 0.8)))


# ----------------------------------------------------#
# ----------- SMALL AREA ESTIMATION ------------------#

#----
## 1) -- Design-based estimation with non-exhaustive auxiliary information
#----

# 1.1) Mandallaz's extended pseudo small area estimator (see eqns. [35] and [36] in Mandallaz 2013):
summary(twophase(formula = tvol ~ mean + stddev + max + q75, data = grisons,
                 phase_id = list(phase.col = "phase_id_2p", terrgrid.id = 2),
                 small_area = list(sa.col = "smallarea", areas = c("A", "B","C", "D"),
                                   unbiased = TRUE)))

summary(twophase(formula = basal ~ stade + couver + melange, data=zberg,
                 phase_id = list(phase.col = "phase_id_2p", terrgrid.id = 2),
                 cluster = "cluster",
                 small_area = list(sa.col = "ismallg23", areas = c("2", "3"),
                                   unbiased = TRUE)))


# 1.2) pseudo small area estimator (see eqns. [25] and [26] in Mandallaz 2013):
summary(twophase(formula = tvol ~ mean + stddev + max + q75, data = grisons,
                 phase_id = list(phase.col = "phase_id_2p", terrgrid.id = 2),
                 small_area = list(sa.col = "smallarea", areas = c("A", "B"),
                                   unbiased = TRUE),
                 psmall = TRUE))

summary(twophase(formula = basal ~ stade + couver + melange, data=zberg,
                 phase_id = list(phase.col = "phase_id_2p", terrgrid.id = 2),
                 cluster = "cluster",
                 small_area = list(sa.col = "ismallg23", areas = c("2", "3"),
                                   unbiased = TRUE),
                 psmall = TRUE))


# 1.3) pseudosynthetic small area estimator (see eqns. [35] and [36] in Mandallaz 2013):
summary(twophase(formula = tvol ~ mean + stddev + max + q75, data=grisons,
                 phase_id = list(phase.col = "phase_id_2p", terrgrid.id = 2),
                 small_area = list(sa.col = "smallarea", areas = c("B", "A"),
                                   unbiased = FALSE)))

summary(twophase(formula = basal ~ stade + couver + melange, data=zberg,
                 phase_id = list(phase.col = "phase_id_2p", terrgrid.id = 2),
                 cluster = "cluster",
                 small_area = list(sa.col = "ismallg23", areas = c("2", "3"),
                                   unbiased = FALSE)))


#----
## 2) -- Design-based estimation with exhaustive auxiliary information
#----

# establish order for vector of true auxiliary means:
colnames(lm(formula = tvol ~ mean + stddev + max + q75, data = grisons, x = TRUE)$x)
colnames(lm(formula = basal ~ stade + couver + melange, data = zberg, x = TRUE)$x)

# true auxiliary means taken from Mandallaz et al. (2013):
truemeans.G <- data.frame(Intercept = rep(1, 4),
                         mean = c(12.85, 12.21, 9.33, 10.45),
                         stddev = c(9.31, 9.47, 7.90, 8.36),
                         max = c(34.92, 35.36, 28.81, 30.22),
                         q75 = c(19.77, 19.16, 15.40, 16.91))
rownames(truemeans.G) <- c("A", "B", "C", "D")

# true auxiliary means taken from Mandallaz (1991):
truemeans.G.clust <- data.frame(Intercept = 1,
                               stade400 = 0.175,
                               stade500 = 0.429,
                               stade600 = 0.321,
                               couver2 = 0.791,
                               melange2 = 0.809)
rownames(truemeans.G.clust) <- c("1")


# 2.1) Mandallaz's extended small area estimator (see eqns. [31] and [33] in Mandallaz 2013):
summary(twophase(formula = tvol ~ mean + stddev + max + q75, data = grisons,
                 phase_id = list(phase.col = "phase_id_2p", terrgrid.id = 2),
                 small_area = list(sa.col ="smallarea", areas = c("A", "B"),
                                   unbiased = TRUE),
                 exhaustive = truemeans.G))

summary(twophase(formula = basal ~ stade + couver + melange, data=zberg,
                 phase_id = list(phase.col = "phase_id_2p", terrgrid.id = 2),
                 cluster = "cluster",
                 small_area = list(sa.col = "ismallold", areas = c("1"),
                                   unbiased = TRUE),
                 exhaustive = truemeans.G.clust))


# 2.2) small area estimator (see eqns. [20] and [21] in Mandallaz 2013):
summary(twophase(formula = tvol ~ mean + stddev + max + q75, data = grisons,
                 phase_id = list(phase.col = "phase_id_2p", terrgrid.id = 2),
                 small_area = list(sa.col = "smallarea", areas = c("A"),
                                   unbiased = TRUE),
                 exhaustive = truemeans.G, psmall = TRUE))

summary(twophase(formula = basal ~ stade + couver + melange, data = zberg,
                 phase_id = list(phase.col = "phase_id_2p", terrgrid.id = 2),
                 cluster = "cluster",
                 small_area = list(sa.col ="ismallold", areas = c("1"),
                                   unbiased = TRUE),
                 psmall = TRUE,
                 exhaustive = truemeans.G.clust))


# 2.3) synthetic small area estimator (see eqns. [18] and [19] in Mandallaz 2013):
summary(twophase(formula=tvol ~ mean + stddev + max + q75, data=grisons,
                 phase_id=list(phase.col = "phase_id_2p", terrgrid.id = 2),
                 small_area=list(sa.col = "smallarea", areas = c("A", "B"),
                                 unbiased = FALSE),
                 exhaustive = truemeans.G))

summary(twophase(formula = basal ~ stade + couver + melange, data = zberg,
                 phase_id = list(phase.col = "phase_id_2p", terrgrid.id = 2),
                 cluster = "cluster",
                 small_area = list(sa.col = "ismallold", areas = c("1"),
                                   unbiased = FALSE),
                 exhaustive = truemeans.G.clust))

}
\references{
Hill, A., Massey, A. F. (2021). \emph{The R Package forestinventory: Design-Based Global and Small Area Estimations for Multiphase Forest Inventories.} Journal of Statistical Software, 97(4), 1-40.

Mandallaz, D. (2007). \emph{Sampling techniques for forest inventories.} Chapter 4. CRC Press.

Mandallaz, D. (2013). \emph{Design-based properties of some small-area estimators in forest inventory with two-phase sampling.} Can. J. For. Res. 43: 441-449.

Mandallaz, D., Hill, A., Massey, A. (2016). \emph{Design-based properties of some small-area estimators in forest inventory with two-phase sampling.} ETH Zurich, Department of Environmental Systems Science, Tech. rep. Available from \url{http://e-collection.library.ethz.ch}.
}
