% Generated by roxygen2: do not edit by hand
% Please edit documentation in R/logistic2ph.R
\name{logistic2ph}
\alias{logistic2ph}
\title{Sieve maximum likelihood estimator (SMLE) for two-phase logistic regression problems}
\usage{
logistic2ph(
  Y_unval = NULL,
  Y = NULL,
  X_unval = NULL,
  X = NULL,
  Z = NULL,
  Bspline = NULL,
  data = NULL,
  theta_pred = NULL,
  gamma_pred = NULL,
  initial_lr_params = "Zeros",
  hn_scale = 1,
  noSE = FALSE,
  TOL = 1e-04,
  MAX_ITER = 1000,
  verbose = FALSE
)
}
\arguments{
\item{Y_unval}{Column name of the error-prone or unvalidated binary outcome. Subjects with missing values of \code{Y_unval} are omitted from the analysis. If \code{Y_unval} is \code{NULL}, the outcome is assumed to be error-free.}

\item{Y}{Column name that stores the validated value of \code{Y_unval} in the second phase. Subjects with missing values of \code{Y} are considered as those not selected in the second phase. This argument is required.}

\item{X_unval}{Column name(s) with the unvalidated predictors. If \code{X_unval} and \code{X} are \code{NULL}, all predictors are assumed to be error-free.}

\item{X}{Column name(s) with the validated predictors. If \code{X_unval} and \code{X} are \code{NULL}, all predictors are assumed to be error-free.}

\item{Z}{(Optional) Column name(s) with additional error-free covariates.}

\item{Bspline}{Vector of column names containing the B-spline basis functions.}

\item{data}{A dataframe with one row per subject containing columns: \code{Y_unval}, \code{Y}, \code{X_unval}, \code{X}, \code{Z}, and \code{Bspline}.}

\item{theta_pred}{Vector of columns in \code{data} that pertain to the predictors in the analysis model.}

\item{gamma_pred}{Vector of columns in \code{data} that pertain to the predictors in the outcome error model.}

\item{initial_lr_params}{Initial values for parametric model parameters. Choices include (1) \code{"Zeros"} (non-informative starting values) or (2) \code{"Complete-data"} (estimated based on validated subjects only).}

\item{hn_scale}{Size of the perturbation used in estimating the standard errors via profile likelihood. If none is supplied, default is \code{hn_scale = 1}.}

\item{noSE}{Indicator for whether standard error estimation should be skipped. Defaults to \code{noSE = FALSE}, i.e., standard errors are estimated.}

\item{TOL}{Tolerance between iterations in the EM algorithm used to define convergence.}

\item{MAX_ITER}{Maximum number of iterations in the EM algorithm. The default number is \code{1000}. This argument is optional.}

\item{verbose}{If \code{TRUE}, then show details of the analysis. The default value is \code{FALSE}.}
}
\value{
\item{coeff}{dataframe with final coefficient and standard error estimates (where applicable) for the analysis model.}
\item{outcome_err_coeff}{dataframe with final coefficient estimates for the outcome error model.}
\item{Bspline_coeff}{dataframe with final B-spline coefficient estimates (where applicable).}
\item{vcov}{variance-covariance matrix for \code{coeff} (where applicable).}
\item{converged}{indicator of EM algorithm convergence for parameter estimates.}
\item{se_converged}{indicator of standard error estimate convergence.}
\item{converged_msg}{(where applicable) description of non-convergence.}
\item{iterations}{number of iterations completed by EM algorithm to find parameter estimates.}
\item{od_loglik_at_conv}{value of the observed-data log-likelihood at convergence.}
}
\description{
This function returns the sieve maximum likelihood estimators (SMLE) for the logistic regression model from Lotspeich et al. (2021).
}
\examples{
 set.seed(918)
 
 # Sample sizes ---------------------------------------------
 N <- 1000 # number of Phase-I subjects
 n <- 250 # number of Phase-II (audited) subjects
 
 # Inverse-logit link shared by every data-generating model below
 expit <- function(eta) (1 + exp(- eta)) ^ (- 1)
 
 # True covariates Xa, Xb and true binary outcome Y ---------
 Xa <- rbinom(n = N, size = 1, prob = 0.25)
 Xb <- rbinom(n = N, size = 1, prob = 0.5)
 Y <- rbinom(n = N, size = 1, prob = expit(- 0.65 - 0.2 * Xb - 0.1 * Xa))
 
 # Error-prone Xb* drawn from the error model P(Xb*|Xb,Xa) --
 # Intercept and slope are derived from the target sensitivity/specificity
 sens_x <- spec_x <- 0.75
 xerr_int <- - log(spec_x / (1 - spec_x))
 xerr_slope <- - xerr_int - log((1 - sens_x) / sens_x)
 Xbstar <- rbinom(n = N, size = 1,
                  prob = expit(xerr_int + xerr_slope * Xb + 0.5 * Xa))
 
 # Error-prone Y* drawn from the error model P(Y*|Xb*,Y,Xb,Xa)
 sens_y <- 0.95
 spec_y <- 0.90
 yerr_int <- - log(spec_y / (1 - spec_y))
 yerr_slope <- - yerr_int - log((1 - sens_y) / sens_y)
 Ystar <- rbinom(n = N, size = 1,
                 prob = expit(yerr_int - 0.2 * Xbstar + yerr_slope * Y - 0.2 * Xb - 0.1 * Xa))
 
 
 ## Draw the audit: V is TRUE for the n validated subjects, FALSE otherwise
 audited_rows <- sample(x = seq_len(N), size = n, replace = FALSE)
 V <- seq_len(N) \%in\% audited_rows
 
 
 # Build dataset (a matrix with one named column per variable)
 sdat <- cbind(Y, Xb, Ystar, Xbstar, Xa)
 # Phase-II variables Y and Xb are unobserved (NA) for unaudited subjects
 not_audited <- !V
 sdat[not_audited, c("Y", "Xb")] <- NA
 
 # Fit models -----------------------------------------------
 ## Naive model: regress the error-prone outcome on the error-prone covariate
 naive <- glm(formula = Ystar ~ Xbstar + Xa,
              family = "binomial",
              data = data.frame(sdat))
 
 
 ## Generalized raking ----------------------------------
 ### Influence function for logistic regression
 ### Taken from: https://github.com/T0ngChen/multiwave/blob/master/sim.r
 inf.fun <- function(fit) {
   # fit: a fitted logistic regression (glm) object
   # Returns a matrix with one row per observation and one column per coefficient
   X <- model.matrix(fit)
   mu <- fit$fitted.values
   ## information matrix, averaged over the rows of the design matrix
   info <- (t(X) \%*\% (X * mu * (1 - mu))) / nrow(X)
   ## influence function: per-row score (covariates times response residual)
   ## multiplied by the inverse of the averaged information
   (X * residuals(fit, type = "response")) \%*\% solve(info)
 }
 # Error-prone influence functions, computed from the naive model fit
 naive_infl <- inf.fun(naive)
 colnames(naive_infl) <- paste0("if", seq_len(3))
 
 # Prepend a subject id and append the naive influence functions to sdat
 sdat <- cbind(id = seq_len(N), sdat, naive_infl)
 
 ### Construct B-spline basis -------------------------------
 ### Since Xb* and Xa are both binary, reduces to indicators --
 ### (one indicator column per combination of Xa and Xb*)
 nsieve <- 4
 B <- matrix(0, nrow = N, ncol = nsieve)
 B[which(Xa == 0 & Xbstar == 0), 1] <- 1
 B[which(Xa == 0 & Xbstar == 1), 2] <- 1
 B[which(Xa == 1 & Xbstar == 0), 3] <- 1
 B[which(Xa == 1 & Xbstar == 1), 4] <- 1
 colnames(B) <- paste0("bs", seq(1, nsieve))
 sdat <- cbind(sdat, B)
 
 ## SMLE -------------------------------------------------
 ### sdat was assembled with cbind() and is therefore a matrix; it is
 ### converted here so that the data argument receives a dataframe, as
 ### documented (the glm() call above wraps it the same way)
 smle <- logistic2ph(Y_unval = "Ystar",
   Y = "Y",
   X_unval = "Xbstar",
   X = "Xb",
   Z = "Xa",
   Bspline = colnames(B),
   data = data.frame(sdat),
   noSE = FALSE,
   MAX_ITER = 1000,
   TOL = 1E-4)
}
\references{
Lotspeich, S. C., Shepherd, B. E., Amorim, G. G. C., Shaw, P. A., & Tao, R. (2021). Efficient odds ratio estimation under two-phase sampling using error-prone data from a multi-national HIV research cohort. \emph{Biometrics, biom.13512.} \url{https://doi.org/10.1111/biom.13512}
}
