Creating Baseline Characteristics Tables

library(clinpubr)
library(dplyr)
library(survival)

Introduction

Baseline characteristics tables (Table 1) summarize patient demographics and clinical features at study entry. The clinpubr package automates the key decisions: variable type classification, normality assessment, statistical test selection, and missing data reporting.

Loading and Preparing Data

We’ll use the NCCTG Lung Cancer dataset from the survival package:

# Load the `cancer` data frame shipped with the survival package and inspect
# its column types (all ten columns are stored as numeric, see output below)
data(cancer, package = "survival")
str(cancer)
#> 'data.frame':    228 obs. of  10 variables:
#>  $ inst     : num  3 3 3 5 1 12 7 11 1 7 ...
#>  $ time     : num  306 455 1010 210 883 ...
#>  $ status   : num  2 2 1 2 2 1 2 2 2 2 ...
#>  $ age      : num  74 68 56 57 60 74 68 71 53 61 ...
#>  $ sex      : num  1 1 1 1 1 1 2 2 1 1 ...
#>  $ ph.ecog  : num  1 0 0 1 0 1 2 2 1 2 ...
#>  $ ph.karno : num  90 90 90 90 100 50 70 60 70 70 ...
#>  $ pat.karno: num  100 90 90 60 90 80 60 80 80 70 ...
#>  $ meal.cal : num  1175 1225 NA 1150 NA ...
#>  $ wt.loss  : num  NA 15 15 11 0 0 10 1 16 34 ...
knitr::kable(head(cancer), caption = "Raw Data Preview")
Raw Data Preview
inst time status age sex ph.ecog ph.karno pat.karno meal.cal wt.loss
3 306 2 74 1 1 90 100 1175 NA
3 455 2 68 1 0 90 90 1225 15
3 1010 1 56 1 0 90 90 NA 15
5 210 2 57 1 1 90 60 1150 11
1 883 2 60 1 0 100 90 NA 0
12 1022 1 74 1 1 50 80 513 0

Create derived variables for demonstration:

# Bin age into four groups. cut() builds right-closed intervals by default, so
# the bins are (0, 50], (50, 60], (60, 70] and (70, 100]: a patient aged exactly
# 60 is labelled "50-60" and a patient aged exactly 50 is labelled "<50".
cancer$age_group <- cut(cancer$age,
  breaks = c(0, 50, 60, 70, 100),
  labels = c("<50", "50-60", "60-70", ">70")
)

# Combine sparse ECOG categories: giving levels 2 and 3 the same label merges
# them into a single ">=2" level
cancer$ph.ecog_cat <- factor(cancer$ph.ecog,
  levels = 0:3,
  labels = c("0", "1", ">=2", ">=2")
)

# Add missing values for demonstration. The seed makes the chosen rows
# reproducible; seq_len() is used instead of 1:nrow() so the index vector is
# well-defined even for a zero-row data frame.
set.seed(123)
cancer$meal.cal[sample(seq_len(nrow(cancer)), 30)] <- NA
cancer$wt.loss[sample(seq_len(nrow(cancer)), 20)] <- NA

knitr::kable(head(cancer), caption = "Data After Preparation")
Data After Preparation
inst time status age sex ph.ecog ph.karno pat.karno meal.cal wt.loss age_group ph.ecog_cat
3 306 2 74 1 1 90 100 1175 NA >70 1
3 455 2 68 1 0 90 90 1225 15 60-70 0
3 1010 1 56 1 0 90 90 NA 15 50-60 0
5 210 2 57 1 1 90 60 1150 11 50-60 1
1 883 2 60 1 0 100 90 NA 0 50-60 0
12 1022 1 74 1 1 50 80 513 0 >70 1

Automatic Variable Type Determination

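Before creating a baseline table, get_var_types() sorts the variables into the groups shown in the output below: factor_vars (categorical variables), exact_vars (categorical variables flagged for an exact test), nonnormal_vars (continuous variables summarized as median [IQR]), and omit_vars (variables left out of the table). The strata element records the grouping variable: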

# Classify the columns of `cancer`, using sex as the grouping (strata) variable
var_types <- get_var_types(cancer, strata = "sex")

# Print the classification for review
var_types
#> $factor_vars
#> [1] "status"      "sex"         "ph.ecog"     "age_group"   "ph.ecog_cat"
#> 
#> $exact_vars
#> [1] "ph.ecog"
#> 
#> $nonnormal_vars
#> [1] "inst"      "time"      "ph.karno"  "pat.karno" "meal.cal"  "wt.loss"  
#> 
#> $omit_vars
#> NULL
#> 
#> $strata
#> [1] "sex"
#> 
#> attr(,"class")
#> [1] "var_types"

Customizing Classification

Adjust thresholds for automatic classification:

# Re-run the classification with explicit thresholds. With num_to_factor = 10,
# ph.karno and pat.karno move from nonnormal_vars to factor_vars (compare the
# output below with the default classification above).
var_types_custom <- get_var_types(
  cancer,
  strata = "sex",
  num_to_factor = 10, # Numeric vars with <=10 unique values treated as factor
  omit_factor_above = 15, # Omit factors with >15 levels
  norm_test_by_group = TRUE # Test normality within each stratum
)

var_types_custom
#> $factor_vars
#> [1] "status"      "sex"         "ph.ecog"     "ph.karno"    "pat.karno"  
#> [6] "age_group"   "ph.ecog_cat"
#> 
#> $exact_vars
#> [1] "ph.ecog"   "ph.karno"  "pat.karno"
#> 
#> $nonnormal_vars
#> [1] "inst"     "time"     "meal.cal" "wt.loss" 
#> 
#> $omit_vars
#> NULL
#> 
#> $strata
#> [1] "sex"
#> 
#> attr(,"class")
#> [1] "var_types"

Save QQ plots for manual review of normality tests (optional):

# var_types_with_plots <- get_var_types(
#   cancer, strata = "sex",
#   save_qqplots = TRUE, folder_name = "qqplots_review"
# )

Creating Baseline Tables

Basic Baseline Table

baseline_table() automatically selects summary statistics (mean/SD vs median/IQR) and statistical tests (t-test vs Mann-Whitney vs Chi-square vs Fisher):

# Build the table from the classification computed above. The grouping
# variable ("sex") is carried in var_types$strata, so it is not passed again.
baseline_result <- baseline_table(
  cancer,
  var_types = var_types,
  save_table = FALSE
)

# The `baseline` element of the result holds the main table
knitr::kable(baseline_result$baseline, caption = "Baseline Characteristics by Sex")
Baseline Characteristics by Sex
Overall sex: 1 sex: 2 p test
n 228 138 90
inst (median [IQR]) 11.00 [3.00, 16.00] 11.00 [3.00, 15.00] 11.00 [3.25, 16.00] 0.416 nonnorm
time (median [IQR]) 255.50 [166.75, 396.50] 224.00 [144.75, 369.25] 292.50 [195.25, 448.50] 0.013 nonnorm
status = 2 (%) 165 (72.4) 112 (81.2) 53 (58.9) <0.001
age (mean (SD)) 62.45 (9.07) 63.34 (9.14) 61.08 (8.85) 0.064
ph.ecog (%) 0.823 exact
0 63 (27.8) 36 (26.3) 27 (30.0)
1 113 (49.8) 71 (51.8) 42 (46.7)
2 50 (22.0) 29 (21.2) 21 (23.3)
3 1 (0.4) 1 (0.7) 0 (0.0)
ph.karno (median [IQR]) 80.00 [75.00, 90.00] 80.00 [70.00, 90.00] 80.00 [80.00, 90.00] 0.882 nonnorm
pat.karno (median [IQR]) 80.00 [70.00, 90.00] 80.00 [70.00, 90.00] 80.00 [70.00, 90.00] 0.332 nonnorm
meal.cal (median [IQR]) 1025.00 [630.00, 1137.50] 1025.00 [730.25, 1175.00] 925.00 [588.00, 1060.00] 0.062 nonnorm
wt.loss (median [IQR]) 6.50 [0.00, 15.00] 9.00 [0.25, 20.00] 4.00 [0.00, 11.00] 0.031 nonnorm
age_group (%) 0.188
<50 26 (11.4) 14 (10.1) 12 (13.3)
50-60 68 (29.8) 35 (25.4) 33 (36.7)
60-70 88 (38.6) 58 (42.0) 30 (33.3)
>70 46 (20.2) 31 (22.5) 15 (16.7)
ph.ecog_cat (%) 0.737
0 63 (27.8) 36 (26.3) 27 (30.0)
1 113 (49.8) 71 (51.8) 42 (46.7)
>=2 51 (22.5) 30 (21.9) 21 (23.3)

Multi-Group Comparisons

With more than 2 groups, pairwise comparisons are automatically generated with optional multiple testing correction:

# Reload the original data. This replaces the modified `cancer`, so the
# age_group column and the missing values injected earlier are discarded.
data(cancer, package = "survival")
# Rebuild the 3-level ECOG factor (levels 2 and 3 share the ">=2" label)
cancer$ph.ecog_cat <- factor(cancer$ph.ecog,
  levels = c(0:3),
  labels = c("0", "1", ">=2", ">=2")
)

# NOTE(review): ph.ecog is the raw variable behind the strata ph.ecog_cat, so
# its row in the resulting table is tautological (each stratum contains only
# its own ECOG values); consider excluding it from the table.
var_types_ecog <- get_var_types(cancer, strata = "ph.ecog_cat")

# Three strata, so pairwise comparisons are requested explicitly; their
# p-values are returned in the `pairwise` element of the result.
baseline_multi <- baseline_table(
  cancer,
  var_types = var_types_ecog,
  save_table = FALSE,
  multiple_comparison_test = TRUE,
  p_adjust_method = "BH"
)

knitr::kable(baseline_multi$baseline, caption = "Baseline Characteristics by ECOG Status")
Baseline Characteristics by ECOG Status
Overall ph.ecog_cat: 0 ph.ecog_cat: 1 ph.ecog_cat: >=2 p test
n 228 63 113 51
inst (median [IQR]) 11.00 [3.00, 16.00] 7.00 [3.00, 13.00] 11.00 [5.00, 15.00] 11.50 [3.25, 16.00] 0.254 nonnorm
time (median [IQR]) 255.50 [166.75, 396.50] 303.00 [224.50, 437.50] 243.00 [177.00, 426.00] 180.00 [100.00, 301.00] 0.001 nonnorm
status = 2 (%) 165 (72.4) 37 (58.7) 82 (72.6) 45 (88.2) 0.002
age (median [IQR]) 63.00 [56.00, 69.00] 61.00 [56.50, 68.00] 63.00 [55.00, 68.00] 68.00 [60.50, 73.00] 0.002 nonnorm
sex = 2 (%) 90 (39.5) 27 (42.9) 42 (37.2) 21 (41.2) 0.737
ph.ecog (%) <0.001 exact
0 63 (27.8) 63 (100.0) 0 (0.0) 0 (0.0)
1 113 (49.8) 0 (0.0) 113 (100.0) 0 (0.0)
2 50 (22.0) 0 (0.0) 0 (0.0) 50 (98.0)
3 1 (0.4) 0 (0.0) 0 (0.0) 1 (2.0)
ph.karno (median [IQR]) 80.00 [75.00, 90.00] 90.00 [90.00, 100.00] 80.00 [80.00, 90.00] 70.00 [60.00, 70.00] <0.001 nonnorm
pat.karno (median [IQR]) 80.00 [70.00, 90.00] 90.00 [80.00, 90.00] 80.00 [70.00, 90.00] 60.00 [60.00, 70.00] <0.001 nonnorm
meal.cal (median [IQR]) 975.00 [635.00, 1150.00] 1000.00 [653.75, 1175.00] 1025.00 [825.00, 1150.00] 796.50 [472.00, 1075.00] 0.037 nonnorm
wt.loss (median [IQR]) 7.00 [0.00, 15.75] 4.00 [0.00, 10.00] 6.00 [0.00, 15.00] 10.50 [3.50, 22.75] 0.009 nonnorm
knitr::kable(baseline_multi$pairwise, caption = "Pairwise Comparison P-values")
Pairwise Comparison P-values
ph.ecog_cat: 0_ph.ecog_cat: 1 ph.ecog_cat: 0_ph.ecog_cat: >=2 ph.ecog_cat: 1_ph.ecog_cat: >=2
inst 0.2225074 0.2225074 0.7774247
time 0.1457843 0.0008158 0.0101101
status 0.0868062 0.0031545 0.0650124
age 0.8454807 0.0029606 0.0029606
sex 1.0000000 1.0000000 1.0000000
ph.ecog 0.0001000 0.0001000 0.0001000
ph.karno 0.0000000 0.0000000 0.0000000
pat.karno 0.0136468 0.0000000 0.0000000
meal.cal 0.5094785 0.1265793 0.0316249
wt.loss 0.0907856 0.0062612 0.0907856

Customizing the Table

Select specific variables, add SMD, handle missing strata:

# NOTE: `cancer` is the copy reloaded in the multi-group section (no injected
# NAs, no age_group column), while `var_types` is still the sex-stratified
# classification computed earlier. This is why the wt.loss and meal.cal
# summaries differ from the basic table.
baseline_custom <- baseline_table(
  cancer,
  var_types = var_types,
  vars = c("age", "wt.loss", "meal.cal", "ph.ecog"), # restrict the table to these columns
  smd = TRUE, # adds the SMD column shown in the output
  omit_missing_strata = TRUE,
  seed = 123 # presumably fixes the random state used by the exact test -- TODO confirm
)

knitr::kable(baseline_custom$baseline, caption = "Customized Baseline Table")
Customized Baseline Table
Overall sex: 1 sex: 2 p test SMD
n 228 138 90
age (mean (SD)) 62.45 (9.07) 63.34 (9.14) 61.08 (8.85) 0.064 0.252
wt.loss (median [IQR]) 7.00 [0.00, 15.75] 8.00 [0.75, 18.50] 4.00 [0.00, 11.00] 0.029 nonnorm 0.264
meal.cal (median [IQR]) 975.00 [635.00, 1150.00] 1025.00 [768.00, 1175.00] 925.00 [588.00, 1067.50] 0.022 nonnorm 0.357
ph.ecog (%) 0.826 exact 0.165
0 63 (27.8) 36 (26.3) 27 (30.0)
1 113 (49.8) 71 (51.8) 42 (46.7)
2 50 (22.0) 29 (21.2) 21 (23.3)
3 1 (0.4) 1 (0.7) 0 (0.0)

Missing Data Summary

# `baseline_result` was built on the data with the extra NAs injected for
# demonstration, so the meal.cal and wt.loss counts here are higher than in
# the original dataset.
knitr::kable(baseline_result$missing, caption = "Missing Data Summary")
Missing Data Summary
Overall sex: 1 sex: 2 p test
n 228 138 90
inst = TRUE (%) 1 (0.4) 1 (0.7) 0 (0.0) 1.000
time = TRUE (%) 0 (0.0) 0 (0.0) 0 (0.0) NaN
status = TRUE (%) 0 (0.0) 0 (0.0) 0 (0.0) NaN
age = TRUE (%) 0 (0.0) 0 (0.0) 0 (0.0) NaN
ph.ecog = TRUE (%) 1 (0.4) 1 (0.7) 0 (0.0) 1.000
ph.karno = TRUE (%) 1 (0.4) 1 (0.7) 0 (0.0) 1.000
pat.karno = TRUE (%) 3 (1.3) 2 (1.4) 1 (1.1) 1.000
meal.cal = TRUE (%) 69 (30.3) 36 (26.1) 33 (36.7) 0.121
wt.loss = TRUE (%) 34 (14.9) 24 (17.4) 10 (11.1) 0.267
age_group = TRUE (%) 0 (0.0) 0 (0.0) 0 (0.0) NaN
ph.ecog_cat = TRUE (%) 1 (0.4) 1 (0.7) 0 (0.0) 1.000

Manual Override

Override automatic classification based on clinical knowledge or manual review:

# Pass the strata and variable types directly instead of a var_types object.
# NOTE(review): variables not named in any of these vectors are summarised as
# mean (SD) in the output below -- including the binary `status` code; add such
# variables to factor_vars if that is not intended.
baseline_manual <- baseline_table(
  cancer,
  strata = "sex",
  factor_vars = c("ph.ecog", "pat.karno"),
  nonnormal_vars = c("age"),
  exact_vars = c("ph.ecog")
)

knitr::kable(baseline_manual$baseline, caption = "Baseline Table with Manual Overrides")
Baseline Table with Manual Overrides
Overall sex: 1 sex: 2 p test
n 228 138 90
inst (mean (SD)) 11.09 (8.30) 10.56 (7.80) 11.89 (9.00) 0.254
time (mean (SD)) 305.23 (210.65) 283.23 (213.05) 338.97 (203.47) 0.049
status (mean (SD)) 1.72 (0.45) 1.81 (0.39) 1.59 (0.49) <0.001
age (median [IQR]) 63.00 [56.00, 69.00] 64.00 [57.00, 70.00] 61.00 [55.00, 68.00] 0.057 nonnorm
ph.ecog (%) 0.831 exact
0 63 (27.8) 36 (26.3) 27 (30.0)
1 113 (49.8) 71 (51.8) 42 (46.7)
2 50 (22.0) 29 (21.2) 21 (23.3)
3 1 (0.4) 1 (0.7) 0 (0.0)
ph.karno (mean (SD)) 81.94 (12.33) 81.82 (12.38) 82.11 (12.32) 0.864
pat.karno (%) 0.636
30 2 (0.9) 1 (0.7) 1 (1.1)
40 2 (0.9) 1 (0.7) 1 (1.1)
50 4 (1.8) 2 (1.5) 2 (2.2)
60 30 (13.3) 18 (13.2) 12 (13.5)
70 41 (18.2) 30 (22.1) 11 (12.4)
80 51 (22.7) 32 (23.5) 19 (21.3)
90 60 (26.7) 31 (22.8) 29 (32.6)
100 35 (15.6) 21 (15.4) 14 (15.7)
meal.cal (mean (SD)) 928.78 (402.17) 980.54 (413.26) 840.70 (369.08) 0.020
wt.loss (mean (SD)) 9.83 (13.14) 11.22 (12.98) 7.77 (13.18) 0.060
ph.ecog_cat (%) 0.737
0 63 (27.8) 36 (26.3) 27 (30.0)
1 113 (49.8) 71 (51.8) 42 (46.7)
>=2 51 (22.5) 30 (21.9) 21 (23.3)

Saving Results

Save all tables to CSV files:

# baseline_saved <- baseline_table(
#   cancer, var_types = var_types,
#   save_table = TRUE, filename = "baseline_characteristics.csv"
# )

Complete Workflow

A streamlined 5-step workflow from data preparation to final table:

# Step 1: Prepare data
# Start again from the unmodified dataset and derive all columns in one pipeline.
data(cancer, package = "survival")
cancer_clean <- cancer %>%
  mutate(
    # cut() uses right-closed bins: (0, 50], (50, 60], (60, 70], (70, 100]
    age_group = cut(age,
      breaks = c(0, 50, 60, 70, 100),
      labels = c("<50", "50-60", "60-70", ">70")
    ),
    # Levels 2 and 3 share a label, which merges them into one ">=2" level
    ph.ecog_cat = factor(ph.ecog,
      levels = 0:3,
      labels = c("0", "1", ">=2", ">=2")
    ),
    # Explicit levels pin the mapping 1 -> "Male", 2 -> "Female" instead of
    # relying on the sorted unique values present in the data
    sex = factor(sex, levels = c(1, 2), labels = c("Male", "Female"))
  )

# Step 2: Determine variable types
# This overwrites the `var_types` object created earlier in the document.
var_types <- get_var_types(cancer_clean, strata = "sex", num_to_factor = 5)

# Step 3: Review classification
# Collapse each group of variable names into one comma-separated string so the
# classification fits in a three-row table.
knitr::kable(data.frame(
  Variable_Type = c("Factor", "Non-normal", "Exact"),
  Variables = c(
    paste(var_types$factor_vars, collapse = ", "),
    paste(var_types$nonnormal_vars, collapse = ", "),
    paste(var_types$exact_vars, collapse = ", ")
  )
), caption = "Variable Type Review")
Variable Type Review
Variable_Type Variables
Factor status, sex, ph.ecog, age_group, ph.ecog_cat
Non-normal inst, time, ph.karno, pat.karno, meal.cal, wt.loss
Exact ph.ecog

# Step 4: Create baseline table
# The strata ("sex") comes from var_types; smd = TRUE adds the SMD column.
baseline_final <- baseline_table(
  cancer_clean,
  var_types = var_types,
  smd = TRUE
)

# Step 5: Review results
# `baseline` is the main table; the missing-data summary is in `missing`.
knitr::kable(baseline_final$baseline, caption = "Final Baseline Characteristics Table")
Final Baseline Characteristics Table
Overall sex: Male sex: Female p test SMD
n 228 138 90
inst (median [IQR]) 11.00 [3.00, 16.00] 11.00 [3.00, 15.00] 11.00 [3.25, 16.00] 0.416 nonnorm 0.158
time (median [IQR]) 255.50 [166.75, 396.50] 224.00 [144.75, 369.25] 292.50 [195.25, 448.50] 0.013 nonnorm 0.268
status = 2 (%) 165 (72.4) 112 (81.2) 53 (58.9) <0.001 0.501
age (mean (SD)) 62.45 (9.07) 63.34 (9.14) 61.08 (8.85) 0.064 0.252
ph.ecog (%) 0.826 exact 0.165
0 63 (27.8) 36 (26.3) 27 (30.0)
1 113 (49.8) 71 (51.8) 42 (46.7)
2 50 (22.0) 29 (21.2) 21 (23.3)
3 1 (0.4) 1 (0.7) 0 (0.0)
ph.karno (median [IQR]) 80.00 [75.00, 90.00] 80.00 [70.00, 90.00] 80.00 [80.00, 90.00] 0.882 nonnorm 0.023
pat.karno (median [IQR]) 80.00 [70.00, 90.00] 80.00 [70.00, 90.00] 80.00 [70.00, 90.00] 0.332 nonnorm 0.093
meal.cal (median [IQR]) 975.00 [635.00, 1150.00] 1025.00 [768.00, 1175.00] 925.00 [588.00, 1067.50] 0.022 nonnorm 0.357
wt.loss (median [IQR]) 7.00 [0.00, 15.75] 8.00 [0.75, 18.50] 4.00 [0.00, 11.00] 0.029 nonnorm 0.264
age_group (%) 0.188 0.298
<50 26 (11.4) 14 (10.1) 12 (13.3)
50-60 68 (29.8) 35 (25.4) 33 (36.7)
60-70 88 (38.6) 58 (42.0) 30 (33.3)
>70 46 (20.2) 31 (22.5) 15 (16.7)
ph.ecog_cat (%) 0.737 0.106
0 63 (27.8) 36 (26.3) 27 (30.0)
1 113 (49.8) 71 (51.8) 42 (46.7)
>=2 51 (22.5) 30 (21.9) 21 (23.3)
knitr::kable(baseline_final$missing, caption = "Final Missing Data Summary")
Final Missing Data Summary
Overall sex: Male sex: Female p test SMD
n 228 138 90
inst = TRUE (%) 1 (0.4) 1 (0.7) 0 (0.0) 1.000 0.121
time = TRUE (%) 0 (0.0) 0 (0.0) 0 (0.0) NaN <0.001
status = TRUE (%) 0 (0.0) 0 (0.0) 0 (0.0) NaN <0.001
age = TRUE (%) 0 (0.0) 0 (0.0) 0 (0.0) NaN <0.001
ph.ecog = TRUE (%) 1 (0.4) 1 (0.7) 0 (0.0) 1.000 0.121
ph.karno = TRUE (%) 1 (0.4) 1 (0.7) 0 (0.0) 1.000 0.121
pat.karno = TRUE (%) 3 (1.3) 2 (1.4) 1 (1.1) 1.000 0.030
meal.cal = TRUE (%) 47 (20.6) 24 (17.4) 23 (25.6) 0.186 0.200
wt.loss = TRUE (%) 14 (6.1) 10 (7.2) 4 (4.4) 0.562 0.120
age_group = TRUE (%) 0 (0.0) 0 (0.0) 0 (0.0) NaN <0.001
ph.ecog_cat = TRUE (%) 1 (0.4) 1 (0.7) 0 (0.0) 1.000 0.121

Summary

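Key Functions

  - get_var_types(): classifies variables as factor, exact-test, non-normal, or omitted before a table is built
  - baseline_table(): builds the baseline table and returns it together with the pairwise-comparison and missing-data summaries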

Best Practices

  1. Review automatic classifications — clinical knowledge should override statistical defaults when appropriate
  2. Include SMD for observational studies — standardized mean differences help assess group balance
  3. Handle missing data transparently — report missing patterns in your tables
  4. Use BH correction for multi-group pairwise comparisons