% Generated by roxygen2: do not edit by hand
% Please edit documentation in R/threephase.R
\name{threephase}
\alias{threephase}
\title{threephase}
\usage{
threephase(formula.s0, formula.s1, data, phase_id, cluster = NA,
  small_area = list(sa.col = NA, areas = NA, unbiased = TRUE),
  boundary_weights = NA, exhaustive = NA, progressbar = FALSE,
  psmall = FALSE)
}
\arguments{
\item{formula.s0}{an object of class "\code{\link[stats]{formula}}" as would be used in the function \code{\link[stats]{lm}}
that contains a reduced set of auxiliary variables available for all first phase plots}

\item{formula.s1}{an object of class "\code{\link[stats]{formula}}" as would be used in the function \code{\link[stats]{lm}}
that contains the predictors from \code{formula.s0} as well as further ancillary predictors available
for all second phase plots (i.e. \code{formula.s0} is \strong{nested} in \code{formula.s1})}

\item{data}{a data frame containing all variables contained in \code{formula.s0} and \code{formula.s1} as well as a column indexing
phase membership.  Additional columns designating small-area membership, cluster ID and
boundary weights should also be contained in the data frame if they are
requested in the function.}

\item{phase_id}{an object of class "\code{\link[base]{list}}" containing three elements:
\itemize{
     \item \code{phase.col}: the column name in \code{data} that specifies the
                             phase membership of each observation
     \item \code{s1.id}: the indicator identifying the "second phase only" plots
                               for that column
     \item \code{terrgrid.id}: the indicator identifying the terrestrial
                               (a.k.a. "ground truth") phase for that column
        }}

\item{cluster}{(\emph{Optional}) Specifies the column name in \code{data}
containing the cluster ID. Only used in case of
cluster sampling.}

\item{small_area}{(\emph{Optional}) a list containing three elements:
            \itemize{
                 \item \code{sa.col}: the column name in \code{data} containing
                                      domain identification
                 \item \code{areas}: vector of desired small-area domain identifiers
                 \item \code{unbiased}: an object of type "\code{\link[base]{logical}}"
                                        that when FALSE designates that the estimator is allowed to be
                                        biased (i.e. the synthetic estimator) and when TRUE forces
                                        it to be design-unbiased. See \emph{'Details'}.
                    }

            \strong{Note}: If \code{small_area} is left unchanged then \code{threephase} defaults to global estimation.}

\item{boundary_weights}{(\emph{Optional}) Specifies the column name in \code{data}
containing the weights for boundary adjustment.  See \emph{'Details'}}

\item{exhaustive}{(\emph{Optional}) For global estimation, a vector of true auxiliary means corresponding to
an exhaustive first phase.
The vector must be input in the same order that \code{lm} processes a \code{formula} object
and include the intercept term.
For small area estimation, \code{exhaustive} is a \code{data.frame} containing column names
(\code{\link[base]{colnames}}) for every variable appearing in the parameter \code{formula.s0} including
the variable "Intercept". Rownames (\code{\link[base]{row.names}}) have to be used and must correspond
to the names of the small areas. See \emph{'Details'}.}

\item{progressbar}{(\emph{Optional}) an object of type "\code{\link[base]{logical}}" that when TRUE prints
the progress of the calculation in the console (recommended for a large number of small areas).  Defaults to FALSE.}

\item{psmall}{(\emph{Optional}) an object of type "\code{\link[base]{logical}}" used for small area estimations
that only works when \code{unbiased} in the parameter \code{small_area} is set to TRUE. See \emph{'Details'}.}
}
\value{
\code{threephase} returns an object of class \code{"threephase"}.

An object of class \code{"threephase"} is a \code{list} of the following components:

 \item{input}{a \code{list} containing the function's inputs}
 \item{estimation}{a data frame containing the following components:
                  \itemize{
                   \item \code{area:} the domain (only present if argument \code{areas} has been used)
                   \item \code{estimate:} the point estimate
                   \item \code{ext_variance:} the external variance of the point estimate that doesn't account for
                                              fitting the model from the current inventory
                   \item \code{g_variance:} the internal (g-weight) variance that accounts for
                                              fitting the model from the current inventory
                   \item \code{n0:} the first phase sample size of plots
                   \item \code{n1:} the second phase sample size of plots
                   \item \code{n2:} the third phase (i.e. terrestrial) sample size of plots
                   \item \code{n0G:} the first phase sample size in the small area
                   \item \code{n1G:} the second phase sample size in the small area
                   \item \code{n2G:} the third phase (i.e. terrestrial) sample size in the small area
                   \item \code{r.squared_reduced:} the R-squared of the linear model based on \code{formula.s0} (i.e. the reduced model)
                   \item \code{r.squared_full:} the R-squared of the linear model based on \code{formula.s1} (i.e. the full model)
                   }}
 \item{samplesizes}{a \code{\link[base]{data.frame}} summarizing all sample sizes: in case of cluster sampling,
                    both the number of individual plots and the number of clusters are reported.}
 \item{coefficients}{the coefficients of the two linear models:
                  \itemize{
                    \item \code{alpha:} the reduced model coefficients
                   \item \code{beta:} the full model coefficients
                   }}
 \item{cov_alpha_s2}{the design-based covariance matrix of the reduced model coefficients}
 \item{cov_beta_s2}{the design-based covariance matrix of the full model coefficients}
 \item{Z_bar_1_s0}{the estimated auxiliary means of \code{formula.s0} based on the first phase.
                   If the first phase is exhaustive, these are the true auxiliary means specified in the input-argument \code{exhaustive}.}
 \item{Z1_bar_s1}{the estimated auxiliary means of \code{formula.s0} based on the second phase}
 \item{Z_bar_s1}{the estimated auxiliary means of \code{formula.s1} based on the second phase}
 \item{cov_Z_bar_1_s0}{the covariance matrix for \code{Z_bar_1_s0}}
 \item{resid_reduced}{the reduced model residuals at either the plot level or cluster level depending on the call}
 \item{resid_full}{the full model residuals at either the plot level or cluster level depending on the call}
 \item{warn.messages}{logical indicating if warning messages were issued}
}
\description{
\code{threephase} is used to calculate estimations based on triple sampling under the
\emph{model-assisted Monte Carlo approach}. A \emph{first phase} of auxiliary information
(e.g. taken from remote sensing data) is used to generate model predictions based on multiple linear
regression using the method of ordinary least squares. A subsample of the first phase comprises
a \emph{second phase} which contains further auxiliary information that produces another set of model predictions.
A further subsample produces a \emph{third final phase} based on terrestrial observations
(i.e. the \emph{local densities} of the ground truth) and is used to correct for bias in the design-based sense.
The estimation method is available for \emph{simple} and \emph{cluster sampling} and includes
the special case where the first phase is based on an \emph{exhaustive} sample (i.e. a census).
\emph{Small-area applications} are supported for synthetic estimation as well as two varieties
of bias-corrected estimators: the traditional small-area estimator and an asymptotically
equivalent version derived under Mandallaz's extended model approach.
}
\details{
\code{s1.id} identifies "second phase only" plots because the terrestrial phase is
         known to be part of the second phase by the construction of the subsampling.

         If estimations for multiple small-area domains should be computed, the domains have to be
         defined within a \code{character} vector using \code{c()}. Using \code{small_area = list(..., unbiased = FALSE)}
         calculates design-based estimates with the synthetic estimator and may be design-biased if
         the model is biased in that small area.  The default, \code{small_area = list(..., unbiased = TRUE)}, allows for a residual
         correction by one of two asymptotically equivalent methods to create design-unbiased estimates:
         \itemize{
             \item Mandallaz's extended model approach calculates the residual correction by extending the
                   model formula with an indicator variable in the small area.  It is the default method
                   \code{psmall}=FALSE.
             \item the traditional small area estimator calculates the residual correction by taking the
                   synthetic estimator and adding the mean residual observed in the small area.  It is activated
                   when \code{psmall}=TRUE.
                 }

         Missing values (\code{NA}) in the auxiliary variables (i.e. at least one auxiliary variable cannot be observed at
         an inventory location) are automatically removed from the dataset \emph{before} the estimations are computed.
         Note that missingness in the auxiliary variables is only allowed if we assume that they are \emph{missing at random},
         since the unbiasedness of the estimates is based on the sampling design.

         The boundary weight adjustment is pertinent for auxiliary information derived from remote sensing and
         is equal to the percentage of forested area (e.g. as defined by a forest mask) in the interpretation area.

         Exhaustive estimation refers to the case where the true means of certain auxiliary variables are known
         from an exhaustive first phase (i.e. a census).  For global estimation, the vector must be input
         in the same order that \code{lm} processes a \code{formula} object including the intercept term whose
         true mean will always be one.  For small area estimation, \code{exhaustive} is a \code{data.frame} containing column names for every variable appearing in
         the parameter \code{formula.s0} including the variable "Intercept".  The observations of the data.frame
         must represent the true auxiliary means in the same order as was presented in \code{areas} from the
         parameter \code{small_area}.  See \emph{'Examples'}.
}
\note{
In the special case of cluster sampling, the reported sample sizes in \code{estimation} are the number of clusters.
The \code{samplesizes}-object also provides the respective number of single plot units for cluster sampling.
The reported \code{r.squared_reduced} and \code{r.squared_full} describe the model fit of the applied linear regression
models (i.e. on \emph{plot-level}, not on \emph{cluster level}).
}
\examples{

## load the example datasets used below
## (grisons: non-cluster examples, zberg: cluster-sampling examples):
data(grisons)
data(zberg)


## define regression models for simple and cluster sampling;
## the predictors of each reduced model (s0) are nested in its full model (s1):
formula.s0 <- tvol ~ mean # reduced model
formula.s1 <- tvol ~ mean + stddev + max + q75 # full model
formula.clust.s0 <- basal ~ stade # reduced model (cluster sampling)
formula.clust.s1 <- basal ~ stade + couver + melange # full model (cluster sampling)


# ------------------------------------------------#
# ----------- GLOBAL ESTIMATION ------------------#

# In all calls below, phase_id names the column coding phase membership
# (phase.col), the value marking "second phase only" plots (s1.id) and the
# value marking terrestrial plots (terrgrid.id).

#----
## 1) -- Design-based estimation with non-exhaustive auxiliary information
#----

# 1.1) non-cluster-sampling (see eqns. [11], [14] and [16] in Mandallaz 2014):
summary(threephase(formula.s0, formula.s1, data = grisons,
                   phase_id = list(phase.col = "phase_id_3p", s1.id=1, terrgrid.id = 2)))


# 1.2) cluster-sampling (see eqns. [49] and [50] in Mandallaz 2013);
#      cluster names the column in the data holding the cluster ID:
summary(threephase(formula.clust.s0, formula.clust.s1, data = zberg,
                   phase_id = list(phase.col="phase_id_3p", s1.id = 1, terrgrid.id = 2),
                   cluster = "cluster"))


# 1.3) example for boundary weight adjustment (non-cluster example);
#      boundary_weights names the column in the data holding the weights:
summary(threephase(formula.s0, formula.s1, data = grisons,
                   phase_id = list(phase.col="phase_id_3p", s1.id = 1, terrgrid.id = 2),
                   boundary_weights = "boundary_weights"))

#----
## 2) -- Design-based estimation with exhaustive auxiliary information
#----

# For global estimation, exhaustive is a vector of true auxiliary means given
# in the order in which lm processes the formula, starting with the intercept
# (whose true mean is always 1).

# 2.1) non-cluster-sampling (see eqns. [7], [9] and [10] in Mandallaz 2014):
summary(threephase(formula.s0, formula.s1, data = grisons,
                   phase_id = list(phase.col = "phase_id_3p", s1.id = 1, terrgrid.id = 2),
                   exhaustive = c(1,11.39)))


# 2.2) cluster-sampling
#      (the four values are presumably the intercept followed by the dummy
#      variables that lm creates for stade -- verify against the zberg data):
summary(threephase(formula.clust.s0, formula.clust.s1, data = zberg,
                   phase_id = list(phase.col = "phase_id_3p", s1.id = 1, terrgrid.id = 2),
                   cluster = "cluster", exhaustive = c(1, 0.10, 0.7, 0.10)))


# ----------------------------------------------------#
# ----------- SMALL AREA ESTIMATION ------------------#

# In all calls below, small_area names the column holding the domain
# identification (sa.col), the domains to estimate (areas) and whether the
# estimator is forced to be design-unbiased (unbiased).

#----
## 1) --  Design-based estimation with non-exhaustive auxiliary information
#----

# 1.1) Mandallaz's extended pseudo small area estimator
#      (unbiased = TRUE with the default psmall = FALSE):

# non-cluster sampling
summary(threephase(formula.s0,
                   formula.s1,
                   data = grisons,
                   phase_id = list(phase.col = "phase_id_3p", s1.id = 1, terrgrid.id = 2),
                   small_area=list(sa.col = "smallarea", areas = c("A", "B", "C", "D"),
                                   unbiased = TRUE)))

# cluster sampling
summary(threephase(formula.clust.s0,
                   formula.clust.s1,
                   data = zberg,
                   phase_id = list(phase.col = "phase_id_3p", s1.id = 1, terrgrid.id = 2),
                   cluster = "cluster",
                   small_area = list(sa.col = "ismallold", areas = c("1"), unbiased = TRUE)))


# 1.2) pseudo small area estimator
#      (unbiased = TRUE together with psmall = TRUE):

# non-cluster sampling
summary(threephase(formula.s0,
                   formula.s1,
                   data = grisons,
                   phase_id = list(phase.col = "phase_id_3p", s1.id = 1, terrgrid.id = 2),
                   small_area = list(sa.col = "smallarea", areas = c("A", "B", "C", "D"),
                                   unbiased = TRUE),
                   psmall = TRUE))

# cluster sampling
summary(threephase(formula.clust.s0,
                   formula.clust.s1,
                   data=zberg,
                   phase_id=list(phase.col="phase_id_3p", s1.id=1, terrgrid.id=2),
                   cluster="cluster",
                   small_area=list(sa.col="ismallold", areas=c("1"), unbiased=TRUE),
                   psmall = TRUE))


# 1.3) pseudosynthetic small area estimator
#      (unbiased = FALSE, i.e. the estimator is allowed to be design-biased):

# non-cluster sampling (formulas passed inline; same models as formula.s0 and formula.s1)
summary(threephase(formula.s0  = tvol ~ mean,
                   formula.s1 = tvol ~ mean + stddev + max + q75,
                   data = grisons,
                   phase_id = list(phase.col = "phase_id_3p", s1.id = 1, terrgrid.id = 2),
                   small_area = list(sa.col = "smallarea", areas = c("A", "B", "C", "D"),
                                   unbiased = FALSE)))

# cluster sampling
summary(threephase(formula.clust.s0,
                   formula.clust.s1,
                   data = zberg,
                   phase_id = list(phase.col = "phase_id_3p", s1.id = 1, terrgrid.id = 2),
                   cluster = "cluster",
                   small_area = list(sa.col = "ismallold", areas = c("1"), unbiased = FALSE)))


#----
## 2) --  Design-based estimation with exhaustive auxiliary information
#----

# For small area estimation, exhaustive is a data.frame with one column per
# variable of the reduced model (including "Intercept") and one row per small
# area; the rownames must correspond to the names of the small areas.

# true auxiliary mean for variable "mean" taken from Mandallaz et al. (2013):
truemeans.G <- data.frame(Intercept = rep(1, 4),
                         mean = c(12.85, 12.21, 9.33, 10.45))
rownames(truemeans.G) <- c("A", "B", "C", "D")

# true auxiliary means taken from Mandallaz (1991):
truemeans.G.clust <- data.frame(Intercept = 1, stade400 = 0.175, stade500 = 0.429,
                               stade600 = 0.321)
rownames(truemeans.G.clust) <- c("1")


# 2.1) Mandallaz's extended small area estimator
#      (unbiased = TRUE with the default psmall = FALSE):

# non-cluster sampling
summary(threephase(formula.s0,
                   formula.s1,
                   data = grisons,
                   phase_id = list(phase.col = "phase_id_3p", s1.id = 1, terrgrid.id = 2),
                   small_area = list(sa.col = "smallarea", areas = c("A", "B", "C", "D"),
                                unbiased = TRUE),
                   exhaustive = truemeans.G))

# cluster sampling
summary(threephase(formula.clust.s0,
                   formula.clust.s1,
                   data = zberg,
                   phase_id = list(phase.col = "phase_id_3p", s1.id = 1, terrgrid.id = 2),
                   cluster = "cluster",
                   small_area = list(sa.col = "ismallold", areas = c("1"), unbiased = TRUE),
                   exhaustive = truemeans.G.clust))


# 2.2) small area estimator
#      (unbiased = TRUE together with psmall = TRUE):

# non-cluster sampling
summary(threephase(formula.s0,
                   formula.s1,
                   data = grisons,
                   phase_id = list(phase.col = "phase_id_3p", s1.id = 1, terrgrid.id = 2),
                   small_area = list(sa.col = "smallarea", areas = c("A", "B", "C", "D"),
                                   unbiased = TRUE),
                   exhaustive = truemeans.G,
                   psmall = TRUE))

# cluster sampling
summary(threephase(formula.clust.s0,
                   formula.clust.s1,
                   data = zberg,
                   phase_id = list(phase.col = "phase_id_3p", s1.id = 1, terrgrid.id = 2),
                   cluster = "cluster",
                   small_area = list(sa.col = "ismallold", areas = c("1"), unbiased = TRUE),
                   exhaustive = truemeans.G.clust,
                   psmall = TRUE))


# 2.3) synthetic small area estimator
#      (unbiased = FALSE, i.e. the estimator is allowed to be design-biased):

# non-cluster sampling
summary(threephase(formula.s0,
                   formula.s1,
                   data = grisons,
                   phase_id = list(phase.col="phase_id_3p", s1.id = 1, terrgrid.id = 2),
                   small_area = list(sa.col = "smallarea", areas = c("A", "B", "C", "D"),
                                   unbiased = FALSE),
                   exhaustive = truemeans.G))

# cluster sampling
summary(threephase(formula.clust.s0,
                   formula.clust.s1,
                   data = zberg,
                   phase_id = list(phase.col = "phase_id_3p", s1.id = 1, terrgrid.id = 2),
                   cluster = "cluster",
                   small_area = list(sa.col = "ismallold", areas = c("1"), unbiased = FALSE),
                   exhaustive = truemeans.G.clust))

}
\references{
Mandallaz, D., Breschan, J., & Hill, A. (2013). \emph{New regression estimators in forest inventories
with two-phase sampling and partially exhaustive information: a design-based Monte Carlo approach
with applications to small-area estimation.} Canadian Journal of Forest Research, 43(11), 1023-1031.

Mandallaz, D. (2014). \emph{A three-phase sampling extension of the generalized regression estimator with partially exhaustive information.} Can. J. For. Res. 44: 383-388

Massey, A., Mandallaz, D., & Lanz, A. (2014). \emph{Integrating remote sensing and past inventory data under the new annual design of the Swiss National Forest Inventory using three-phase design-based regression estimation.} Can. J. For. Res. 44(10): 1177-1186

Mandallaz, D. (2013). \emph{Regression estimators in forest inventories with three-phase sampling and two multivariate components of auxiliary information.} ETH Zurich, Department of Environmental Systems Science, Tech. rep. Available from \url{http://e-collection.library.ethz.ch}.
}

