The HOIFCar package implements covariate adjustment methods for estimating treatment effects in randomized experiments. It provides point estimates and variance estimates of treatment effects, together with the oracle bias, oracle variance, and the corresponding approximated variance of the adjusted estimators derived from higher-order influence functions (HOIF).
The complete derivation of the exact variance of \(\hat{\tau}_{\mathsf{adj}, 2}^{\dagger}\) under the randomization-based framework can be found here.
devtools::install_github("Cinbo-Wang/HOIF-Car")

We first consider the Completely Randomized Experiment (CRE) with moderately high-dimensional covariates under the randomization-based (design-based) framework.
# Oracle Estimation
## Linear setting
set.seed(100)
n <- 500
p <- 50
beta <- rt(p, 3)                                   # regression coefficients
X <- mvtnorm::rmvt(n, sigma = diag(1, p), df = 3)  # covariates from a multivariate t distribution
Y1 <- as.numeric(X %*% beta)                       # potential outcomes under treatment (linear model)
pi1 <- 0.5                                         # treated proportion
n1 <- ceiling(n * pi1)                             # number of treated units
library(HOIFCar)
result_adj_db <- get_oracle_bias_var_adj_db(X = X, Y1 = Y1, n1 = n1)  # debiased adjustment of Lu, Yang and Wang (2023)
result_adj2c <- get_oracle_bias_var_adj2c(X = X, Y1 = Y1, n1 = n1)
result_adj2_3 <- get_oracle_bias_var_adj_2_3(X = X, Y1 = Y1, n1 = n1)
unlist(result_adj_db)
unlist(result_adj2c)
unlist(result_adj2_3)
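For a side-by-side comparison, the three oracle summaries can be gathered into a single list; this is a minimal sketch that relies only on the list outputs already computed above.

# Collect the oracle summaries in one named list for side-by-side inspection.
oracle_ls <- list(adj_db = result_adj_db, adj2c = result_adj2c, adj2_3 = result_adj2_3)
lapply(oracle_ls, unlist)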
## Nonlinear setting
n <- 500
alpha <- 0.2
set.seed(1000)
p <- ceiling(n * alpha)
# AR(1)-type covariance: Sigma[i, j] = 0.1^|i - j|
Sigma_true <- matrix(0, nrow = p, ncol = p)
for (i in 1:p) {
  for (j in 1:p) {
    Sigma_true[i, j] <- 0.1^(abs(i - j))
  }
}
X <- mvtnorm::rmvt(n, sigma = Sigma_true, df = 3)
beta <- rt(p,3)
or_baseline <- sign(X %*% beta) * abs(X %*% beta)^(1/2) + sin(X %*% beta)
epsilon1 <- epsilon0 <- rt(n, 3)
Y1 <- 1 + as.numeric(or_baseline) + epsilon1
pi1 <- 2/3
n1 <- ceiling(n * pi1)
result_adj_db <- get_oracle_bias_var_adj_db(X = X, Y1 = Y1, n1 = n1)  # debiased adjustment of Lu, Yang and Wang (2023)
result_adj2c <- get_oracle_bias_var_adj2c(X = X, Y1 = Y1, n1 = n1)
result_adj2_3 <- get_oracle_bias_var_adj_2_3(X = X, Y1 = Y1, n1 = n1)
unlist(result_adj_db)
unlist(result_adj2c)
unlist(result_adj2_3)
# Realistic Estimation
set.seed(100)
n <- 500
p <- n * 0.3
beta <- runif(p, -1 / sqrt(p), 1 / sqrt(p))
X <- mvtnorm::rmvt(n, sigma = diag(1, p), df = 3)
Y1 <- as.numeric(X %*% beta)
Y0 <- rep(0, n)
pi1 <- 2/3
n1 <- ceiling(n * pi1)
ind <- sample(n, size = n1)  # complete randomization: n1 treated units
A <- rep(0, n)
A[ind] <- 1
Y <- Y1 * A + Y0 * (1 - A)   # observed outcomes
Xc <- scale(X, scale = FALSE)                   # centered covariates
H <- Xc %*% MASS::ginv(t(Xc) %*% Xc) %*% t(Xc)  # projection (hat) matrix of the centered covariates
## Estimate the mean outcome in the treatment arm
result_ls <- esti_mean_treat(X, Y, A, H)
point_est <- result_ls$point_est
var_est <- result_ls$var_est
print(paste0('True mean of Y(1): ', round(mean(Y1), digits = 3), '.'))
print('Absolute bias:')
print(abs(point_est - mean(Y1)))
print('Estimated variance:')
print(var_est)
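Assuming the adjusted estimator is asymptotically normal, a Wald-type 95% confidence interval can be formed from the point and variance estimates; the sketch below only assumes that point_est and var_est are numeric and of matching length.

# Wald-type 95% confidence interval(s); works elementwise if several estimators are returned.
ci_lower <- point_est - qnorm(0.975) * sqrt(var_est)
ci_upper <- point_est + qnorm(0.975) * sqrt(var_est)
cbind(lower = ci_lower, upper = ci_upper)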
## Estimate ATE using adj2 and adj2c estimators
Xc <- cbind(1, scale(X, scale = FALSE))
result.adj2.adj2c.random.ls <- fit.adj2.adj2c.Random(Y, Xc, A)
result.adj2.adj2c.random.ls

We next consider simple randomization experiments with moderately high-dimensional covariates under the super-population framework.
set.seed(120)
alpha0 <- 0.1;
n <- 400;
p0 <- ceiling(n * alpha0)
beta0_full <- 1 / (1:p0) ^ (1 / 2) * (-1) ^ c(1:p0)
beta <- beta0_full / norm(beta0_full,type='2')
Sigma_true <- matrix(0, nrow = p0, ncol = p0)
for (i in 1:p0) {
  for (j in 1:p0) {
    Sigma_true[i, j] <- 0.1^(abs(i - j))
  }
}
X <- mvtnorm::rmvt(n, sigma = Sigma_true, df = 3)
lp0 <- X %*% beta
delta_X <- 1 - 1/4 * X[, 2] - 1/8 * X[, 3]
lp1 <- lp0 + delta_X
Y0 <- lp0 + rnorm(n)
Y1 <- lp1 + rnorm(n)
pi1 <- 1 / 2
A <- rbinom(n, size = 1, prob = pi1)
Y <- A * Y1 + (1 - A) * Y0
## Estimate ATE, EY1 and EY0 using adj2 and adj2c estimators
Xc <- cbind(1, scale(X, scale = FALSE))
result.adj2.adj2c.sp.ate.ls <- fit.adj2.adj2c.Super(Y, Xc, A, intercept = TRUE,
target = 'ATE', lc = TRUE)
result.adj2.adj2c.sp.ate.ls
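As a reference point, the adjusted ATE estimates can be compared with the sample average treatment effect of the simulated data; this sketch uses only the simulated potential outcomes.

# Sample average treatment effect in this simulation (benchmark for the estimates above).
mean(Y1 - Y0)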
result.adj2.adj2c.sp.treat.ls <- fit.adj2.adj2c.Super(Y, Xc, A, intercept = TRUE,
target = 'EY1', lc = TRUE)
result.adj2.adj2c.sp.treat.ls
result.adj2.adj2c.sp.control.ls <- fit.adj2.adj2c.Super(Y, Xc, A, intercept = TRUE,
target = 'EY0', lc = TRUE)
result.adj2.adj2c.sp.control.ls

We next consider covariate-adaptive randomization (CAR) with moderately high-dimensional covariates under the super-population framework.
set.seed(120)
alpha0 <- 0.1;
n <- 400;
S <- as.factor(sample(c("0-30", "31-50", ">50"), n, replace = TRUE, prob = c(0.2, 0.4, 0.4)))  # stratification variable (e.g., age group)
ns_min <- min(table(S))
p0 <- ceiling(ns_min * alpha0)
beta0_full <- 1 / (1:p0) ^ (1 / 2) * (-1) ^ c(1:p0)
beta <- beta0_full / norm(beta0_full,type='2')
Sigma_true <- matrix(0, nrow = p0, ncol = p0)
for (i in 1:p0) {
  for (j in 1:p0) {
    Sigma_true[i, j] <- 0.1^(abs(i - j))
  }
}
X <- mvtnorm::rmvt(n, sigma = Sigma_true, df = 3)
lp0 <- X %*% beta
delta_X <- 1 - 1/4 * X[, 2] - 1/8 * X[, 3]
lp1 <- lp0 + delta_X
Y0 <- lp0 + rnorm(n)
Y1 <- lp1 + rnorm(n)
pi1 <- 1 / 2
# We use stratified block randomization as an example; simple randomization is
# also valid by setting S = rep(1, n) and A = rbinom(n, 1, pi1).
# Stratified block randomization: within each stratum, treatments are assigned in
# permuted blocks of size `block_size`; leftover units are randomized separately.
sbr <- function(S, nA, p, block_size = 10) {
  N <- length(S)
  B <- block_size
  A <- rep(0, N)
  nS <- length(unique(S))
  for (s in 1:nS) {
    ind_s <- which(S == s)
    n_s <- length(ind_s)
    A_s <- rep(0, n_s)
    numB <- floor(n_s / B)  # number of complete blocks in stratum s
    rem <- n_s - numB * B   # units left over after the complete blocks
    size_A <- B * p[s]      # copies of each arm label per complete block (assumes equal allocation)
    if (numB == 0) {
      # stratum smaller than one block: randomize all of its units
      size_rem <- floor(rem * p[s])
      size_rem[1] <- rem - sum(size_rem[-1])
      A_s[(B * numB + 1):n_s] <- sample(rep(0:(nA - 1), size_rem), size = rem, replace = FALSE)
    } else {
      for (i in 1:numB) {
        A_s[(B * (i - 1) + 1):(B * i)] <- sample(rep(0:(nA - 1), size_A), size = B, replace = FALSE)
      }
      if (rem > 0) {
        # randomize the remaining units after the complete blocks
        size_rem <- floor(rem * p[s])
        size_rem[1] <- rem - sum(size_rem[-1])
        A_s[(B * numB + 1):n_s] <- sample(rep(0:(nA - 1), size_rem), size = rem, replace = FALSE)
      }
    }
    A[ind_s] <- A_s
  }
  return(A)
}
A <- sbr(as.numeric(S),2,rep(pi1,3),block_size = 4)
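As a quick sanity check on the randomization, the stratum-by-arm cross-tabulation should show a roughly 1:1 split within each stratum.

# Allocation should be close to 1:1 within every stratum under pi1 = 1/2.
table(S, A)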
Y <- A * Y1 + (1 - A) * Y0
Xc <- cbind(1, scale(X, scale = FALSE))
result.adj2.adj2c.car.ate.ls <- fit.adj2.adj2c.CAR(Y, Xc, S, A, intercept = TRUE,
target = 'ATE')
result.adj2.adj2c.car.ate.ls
result.adj2.adj2c.car.treat.ls <- fit.adj2.adj2c.CAR(Y, Xc, S, A, intercept = TRUE,
target = 'EY1')
result.adj2.adj2c.car.treat.ls
result.adj2.adj2c.car.control.ls <- fit.adj2.adj2c.CAR(Y, Xc, S, A, intercept = TRUE,
target = 'EY0')
result.adj2.adj2c.car.control.ls

The last example considers simple randomization with continuous, count, and binary outcomes; the helper function below generates such data.

generate_data_SR <- function(n, family, pi1, p_n_ratio = 0.05, seed = 123) {
  set.seed(seed)
  alpha0 <- 0.15
  p0 <- ceiling(round(n * alpha0))
  beta0_full <- 1 / (1:p0)^(1/4) * (-1)^c(1:p0)
  # AR(1)-type covariance: Sigma[i, j] = 0.1^|i - j|
  Sigma_true <- matrix(0, nrow = p0, ncol = p0)
  for (i in 1:p0) {
    for (j in 1:p0) {
      Sigma_true[i, j] <- 0.1^(abs(i - j))
    }
  }
  if (family != 'poisson') {
    X <- mvtnorm::rmvt(n, sigma = Sigma_true, df = 3)
  } else {
    # truncate the covariates so the Poisson means stay bounded
    X0 <- mvtnorm::rmvt(n, sigma = Sigma_true, df = 3)
    X <- pmin(pmax(X0, -3), 3)
    rm(X0)
  }
  beta <- beta0_full / norm(beta0_full, type = '2')
  lp0 <- X %*% beta
  delta_X <- 1 - 1/2 * pmin(X[, 1]^2, 5) + 1/4 * X[, 1:10] %*% beta[1:10]
  lp1 <- lp0 + delta_X
  if (family == 'binomial') {
    r0 <- plogis(2 * lp0)
    r1 <- plogis(2 * lp1)
    Y1 <- rbinom(n, size = 1, prob = r1)
    Y0 <- rbinom(n, size = 1, prob = r0)
  } else if (family == 'poisson') {
    # cap the linear predictors so the Poisson means stay moderate
    lp1_tran <- pmin(lp1, 4)
    lp0_tran <- pmin(lp0, 4)
    r1 <- exp(lp1_tran)
    r0 <- exp(lp0_tran)
    Y1 <- rpois(n, r1)
    Y0 <- rpois(n, r0)
  } else if (family == 'gaussian') {
    r1 <- lp1
    r0 <- lp0
    Y1 <- r1 + rnorm(n)
    Y0 <- r0 + rnorm(n)
  }
  A <- rbinom(n, size = 1, prob = pi1)  # simple randomization
  Y <- A * Y1 + (1 - A) * Y0            # observed outcomes
  # Observed covariates: pad with noise covariates or truncate to p columns.
  p <- ceiling(round(n * p_n_ratio))
  if (p > ncol(X)) {
    if (family != 'poisson') {
      X_noise <- mvtnorm::rmvt(n, sigma = diag(p - ncol(X)), df = 3)
    } else {
      X0_noise <- mvtnorm::rmvt(n, sigma = diag(p - ncol(X)), df = 3)
      X_noise <- pmin(pmax(X0_noise, -3), 3)
    }
    X_obs <- cbind(X, X_noise)
  } else {
    X_obs <- X[, 1:p, drop = FALSE]
  }
  data_ls <- list(
    X = X_obs, Y = Y, A = A,
    Y1 = Y1, Y0 = Y0,
    r1 = r1, r0 = r0
  )
  return(data_ls)
}
n <- 400; pi1 <- 1/3
## Continuous outcomes
family <- 'gaussian'; p_n_ratio <- 0.05
data_ls <- generate_data_SR(n, family, pi1, p_n_ratio)
X <- data_ls$X;
A <- data_ls$A
Y <- data_ls$Y
Xc <- scale(X, scale = FALSE)
Xc_aug <- cbind(1, Xc)
result.jasa.ls <- fit.JASA(Y, Xc_aug, A, family, opt_obj = 'beta')
result.jasa.ls$tau_vec
result.jasa.ls$var_tau_vec
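Because generate_data_SR() also returns the simulated potential outcomes, the estimates can be checked against the sample average treatment effect; a minimal sketch:

# Compare each adjusted estimate with the simulated sample average treatment effect.
tau_sample <- mean(data_ls$Y1 - data_ls$Y0)
result.jasa.ls$tau_vec - tau_sample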
## Count outcomes
family <- 'poisson'; p_n_ratio <- 0.05
data_ls <- generate_data_SR(n, family, pi1, p_n_ratio)
X <- data_ls$X;
A <- data_ls$A
Y <- data_ls$Y
Xc <- scale(X, scale = FALSE)
Xc_aug <- cbind(1, Xc)
result.jasa.ls <- fit.JASA(Y, Xc_aug, A, family, opt_obj = 'mu')
result.jasa.ls$tau_vec
result.jasa.ls$var_tau_vec
## Binary outcomes
family <- 'binomial'; p_n_ratio <- 0.05
data_ls <- generate_data_SR(n, family, pi1, p_n_ratio)
X <- data_ls$X;
A <- data_ls$A
Y <- data_ls$Y
Xc <- scale(X, scale = FALSE)
Xc_aug <- cbind(1, Xc)
result.jasa.ls <- fit.JASA(Y, Xc_aug, A, family, opt_obj = 'beta')
result.jasa.ls$tau_vec
result.jasa.ls$var_tau_vec

# References

Zhao, S., Wang, X., Liu, L., & Zhang, X. (2024). Covariate adjustment in randomized experiments motivated by higher-order influence functions. arXiv preprint arXiv:2411.08491. https://arxiv.org/abs/2411.08491

Gu, Y., Liu, L., & Ma, W. (2025). Assumption-lean covariate adjustment under covariate adaptive randomization when \(p = o(n)\). arXiv preprint arXiv:2512.20046.

Lu, X., Yang, F., & Wang, Y. (2023). Debiased regression adjustment in completely randomized experiments with moderately high-dimensional covariates. arXiv preprint arXiv:2309.02073.

Bannick, M. S., Shao, J., Liu, J., Du, Y., Yi, Y., & Ye, T. (2025). A general form of covariate adjustment in randomized clinical trials. Biometrika.