Programming environment
library(tidyverse)
## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ dplyr     1.1.4     ✔ readr     2.1.5
## ✔ forcats   1.0.0     ✔ stringr   1.5.1
## ✔ ggplot2   3.5.1     ✔ tibble    3.2.1
## ✔ lubridate 1.9.3     ✔ tidyr     1.3.1
## ✔ purrr     1.0.2
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag() masks stats::lag()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
library(caret)
## Loading required package: lattice
##
## Attaching package: 'caret'
##
## The following object is masked from 'package:purrr':
##
## lift
library(iml)
library(rmlnomogram)
Binary outcome (or class-wise multinomial outcome)
1 - Categorical predictors and binary outcome without probability
# Load dataset
data("mtcars")
# Preprocess for training a classifier
mtcars$am <- factor(mtcars$am, levels = c(0, 1), labels = c("Auto", "Manual"))
# Convert some numerical features to categorical for demonstration
mtcars$cyl <- factor(mtcars$cyl)
mtcars$vs <- factor(mtcars$vs)
mtcars$qsec <- factor(as.integer(mtcars$qsec >= 18))
# Keep only the factor columns using tidyverse
mtcars <- select(mtcars, where(is.factor))
# Create dummy variables for all factor variables, excluding the target variable
dummy_vars <- dummyVars(am ~ ., data = mtcars) |>
  predict(newdata = mtcars)
# Combine binarized predictors with the target variable
mtcars_binarized <- as.data.frame(dummy_vars) |>
  mutate_all(as.factor) |>
  mutate(am = mtcars$am) |>
  select(-vs.0, -cyl.4, -qsec.0) # drop the reference-level dummies
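As a quick check (not part of the original workflow), the structure of the binarized dataset can be inspected with base R to confirm that every predictor is now a two-level factor alongside the am outcome:
# Optional sanity check: all predictors should be two-level factors plus am
str(mtcars_binarized)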
# Train a random forest model using the full dataset
set.seed(1)
model <- train(
  am ~ .,
  data = mtcars_binarized,
  method = "rf",
  trControl = trainControl(method = "none", classProbs = TRUE)
)
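Before building the nomogram, a rough sanity check of the fitted classifier is possible with caret's confusionMatrix(); note this is apparent (training-set) performance and is optimistic by construction:
# Apparent performance on the training data; a sanity check, not validation
confusionMatrix(predict(model, mtcars_binarized), mtcars_binarized$am)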
# Extract feature names used in the trained model from caret model object
caret_features <- str_remove_all(model$finalModel$xNames, "1$") # strip the appended factor level
# Modify the training dataset to include only the model features
mtcars_selected <- mtcars_binarized[, caret_features]
# Generate all possible feature combinations for the nomogram
nomogram_features <- expand.grid(lapply(mtcars_selected, unique))
head(nomogram_features)
##   cyl.6 cyl.8 qsec.1 vs.1
## 1     1     0      0    0
## 2     0     0      0    0
## 3     1     1      0    0
## 4     0     1      0    0
## 5     1     0      1    0
## 6     0     0      1    0
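Because each of the four predictors is binary, expand.grid() enumerates 2^4 = 16 combinations (only the first six are shown above); the count can be verified directly:
# Four binary features yield 2^4 = 16 rows
nrow(nomogram_features)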
nomogram_outputs <- predict(model, nomogram_features, type = "prob") |>
  select(output = all_of(levels(mtcars_binarized$am)[2]))
head(nomogram_outputs)
##   output
## 1  0.884
## 2  0.818
## 3  0.246
## 4  0.002
## 5  0.498
## 6  0.700
nomogram <- create_nomogram(nomogram_features, nomogram_outputs)
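Assuming create_nomogram() returns a ggplot object, the nomogram can be displayed and exported with standard ggplot2 tooling; the file name below is only illustrative:
# If the nomogram is a ggplot object, save it like any other plot
print(nomogram)
ggsave("nomogram.png", nomogram, width = 8, height = 6, dpi = 300)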
# Prepare data and model for SHAP value calculation using iml
X <- mtcars_binarized[, -which(names(mtcars_binarized) == "am")]
# Create a predictor object
predictor <- Predictor$new(model = model, data = X)
# Calculate SHAP values
nomogram_shaps <- list()
for(i in seq(nrow(nomogram_features))){
  shapley <- Shapley$new(predictor, x.interest = nomogram_features[i, ])
  nomogram_shaps[[i]] <- shapley$results
}
names(nomogram_shaps) <- seq(nrow(nomogram_features))
nomogram_shaps <- reduce(imap(nomogram_shaps, ~ mutate(.x, i = .y)), rbind) |>
  filter(class == levels(mtcars_binarized$am)[2]) |>
  select(i, feature, phi) |>
  spread(feature, phi) |>
  arrange(as.numeric(i)) |>
  column_to_rownames(var = "i")
head(round(nomogram_shaps, 2))
##   cyl.6 cyl.8 qsec.1 vs.1
## 1 -0.09  0.53   0.19    0
## 2  0.03  0.50   0.04    0
## 3 -0.05 -0.42   0.06    0
## 4  0.06 -0.50   0.09    0
## 5 -0.43  0.33  -0.25    0
## 6  0.13  0.38  -0.05    0
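iml estimates Shapley values by Monte Carlo sampling, which makes this loop the slowest step of the workflow. The sample.size argument of Shapley$new() (default 100) trades precision for speed, as in this sketch:
# Faster but noisier SHAP estimates via a smaller Monte Carlo sample
shapley_fast <- Shapley$new(predictor, x.interest = nomogram_features[1, ], sample.size = 50)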
2 - Categorical predictors and binary outcome with probability
nomogram2 <- create_nomogram(nomogram_features, nomogram_outputs, prob = TRUE)
nomogram2_with_shap <-
  create_nomogram(
    nomogram_features, nomogram_outputs, nomogram_shaps
    , prob = TRUE
  )
3 - Categorical with single numerical predictors and binary outcome with probability
# Reload the original dataset
data("mtcars")
# Round qsec to zero decimal places to reduce the number of possible combinations later
mtcars <- mutate(mtcars, qsec = round(qsec, 0))
# Combine the single numerical predictor with the binarized predictors and the target variable
mtcars_mixed <- cbind(mtcars["qsec"], select(mtcars_binarized, -qsec.1))
# Train a random forest model using the full dataset
set.seed(1)
model2 <- train(
  am ~ .,
  data = mtcars_mixed,
  method = "rf",
  trControl = trainControl(method = "none", classProbs = TRUE)
)
# Extract feature names used in the trained model from caret model object
caret_features2 <- str_remove_all(model2$finalModel$xNames, "1$")
# Modify the training dataset to include only the model features
mtcars_selected2 <- mtcars_mixed[, caret_features2]
# Generate all possible feature combinations for the nomogram
nomogram_features2 <-
  select_if(mtcars_selected2, is.numeric) |>
  lapply(\(x) seq(min(x), max(x))) |>
  c(lapply(select_if(mtcars_selected2, is.factor), unique)) |>
  expand.grid()
head(nomogram_features2)
##   qsec cyl.6 cyl.8 vs.1
## 1   14     1     0    0
## 2   15     1     0    0
## 3   16     1     0    0
## 4   17     1     0    0
## 5   18     1     0    0
## 6   19     1     0    0
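The grid now grows multiplicatively: one row per integer qsec value times every combination of the three binary features, which is why qsec was rounded earlier. The total can be checked with:
# One row per qsec value times each binary-feature combination
nrow(nomogram_features2)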
nomogram_outputs2 <- predict(model2, nomogram_features2, type = "prob") |>
  select(output = all_of(levels(mtcars_mixed$am)[2]))
nomogram3 <- create_nomogram(nomogram_features2, nomogram_outputs2, prob = TRUE)
# Prepare data and model for SHAP value calculation using iml
X2 <- mtcars_mixed[, -which(names(mtcars_mixed) == "am")]
# Create a predictor object
predictor2 <- Predictor$new(model = model2, data = X2)
# Calculate SHAP values
nomogram_shaps2 <- list()
for(i in seq(nrow(nomogram_features2))){
  shapley2 <- Shapley$new(predictor2, x.interest = nomogram_features2[i, ])
  nomogram_shaps2[[i]] <- shapley2$results
}
names(nomogram_shaps2) <- seq(nrow(nomogram_features2))
nomogram_shaps2 <- reduce(imap(nomogram_shaps2, ~ mutate(.x, i = .y)), rbind) |>
  filter(class == levels(mtcars_mixed$am)[2]) |>
  select(i, feature, phi) |>
  spread(feature, phi) |>
  arrange(as.numeric(i)) |>
  column_to_rownames(var = "i")
nomogram3_with_shap <-
  create_nomogram(
    nomogram_features2, nomogram_outputs2, nomogram_shaps2
    , prob = TRUE
  )
Continuous outcome
4 - Categorical predictors and continuous outcome
# Load dataset
data("mtcars")
# Preprocess for training a regressor
outcome <- mtcars$wt
# Convert some numerical features to categorical for demonstration
mtcars$cyl <- factor(mtcars$cyl)
mtcars$vs <- factor(mtcars$vs)
mtcars$qsec <- factor(as.integer(mtcars$qsec >= 18))
# Keep the factor columns plus the outcome column using tidyverse
mtcars <- cbind(select(mtcars, where(is.factor)), select(mtcars, wt))
# Create dummy variables for all factor variables, excluding the target variable
dummy_vars2 <- dummyVars(wt ~ ., data = mtcars) |>
  predict(newdata = mtcars)
# Combine binarized predictors with the target variable
mtcars_binarized2 <- as.data.frame(dummy_vars2) |>
  mutate_all(as.factor) |>
  mutate(wt = outcome) |>
  select(-vs.0, -cyl.4, -qsec.0) # drop the reference-level dummies
# Train a random forest model using the full dataset
set.seed(1)
model3 <- train(
  wt ~ .,
  data = mtcars_binarized2,
  method = "rf",
  trControl = trainControl(method = "none")
)
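As with the classifier, an apparent-fit check (again optimistic because it uses the training data) can be done with caret's postResample(), which returns RMSE, R-squared, and MAE:
# Apparent fit on the training data; a sanity check only
postResample(predict(model3, mtcars_binarized2), mtcars_binarized2$wt)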
# Extract feature names used in the trained model from caret model object
caret_features3 <- str_remove_all(model3$finalModel$xNames, "1$")
# Modify the training dataset to include only the model features
mtcars_selected3 <- mtcars_binarized2[, caret_features3]
# Generate all possible feature combinations for the nomogram
nomogram_features3 <- expand.grid(lapply(mtcars_selected3, unique))
nomogram_outputs3 <- data.frame(output = predict(model3, nomogram_features3))
head(nomogram_outputs3)
##     output
## 1 2.958282
## 2 3.010966
## 3 3.537469
## 4 3.878820
## 5 3.037775
## 6 2.971140
nomogram4 <- create_nomogram(nomogram_features3, nomogram_outputs3, est = TRUE)
# Prepare data and model for SHAP value calculation using iml
X3 <- mtcars_binarized2[, -which(names(mtcars_binarized2) == "wt")]
# Create a predictor object
predictor3 <- Predictor$new(model = model3, data = X3)
# Calculate SHAP values
nomogram_shaps3 <- list()
for(i in seq(nrow(nomogram_features3))){
  shapley3 <- Shapley$new(predictor3, x.interest = nomogram_features3[i, ])
  nomogram_shaps3[[i]] <- shapley3$results
}
names(nomogram_shaps3) <- seq(nrow(nomogram_features3))
nomogram_shaps3 <- reduce(imap(nomogram_shaps3, ~ mutate(.x, i = .y)), rbind) |>
  select(i, feature, phi) |>
  spread(feature, phi) |>
  arrange(as.numeric(i)) |>
  column_to_rownames(var = "i")
nomogram4_with_shap <-
  create_nomogram(
    nomogram_features3, nomogram_outputs3, nomogram_shaps3
    , est = TRUE
  )
5 - Categorical with single numerical predictors and continuous outcome
# Reload the original dataset
data("mtcars")
# Round qsec to zero decimal places to reduce the number of possible combinations later
mtcars <- mutate(mtcars, qsec = round(qsec, 0))
# Combine the single numerical predictor with the binarized predictors and the target variable
mtcars_mixed2 <- cbind(mtcars["qsec"], select(mtcars_binarized2, -qsec.1))
# Train a random forest model using the full dataset
set.seed(1)
model4 <- train(
  wt ~ .,
  data = mtcars_mixed2,
  method = "rf",
  trControl = trainControl(method = "none")
)
# Extract feature names used in the trained model from caret model object
caret_features4 <- str_remove_all(model4$finalModel$xNames, "1$")
# Modify the training dataset to include only the model features
mtcars_selected4 <- mtcars_mixed2[, caret_features4]
# Generate all possible feature combinations for the nomogram
nomogram_features4 <-
  select_if(mtcars_selected4, is.numeric) |>
  lapply(\(x) seq(min(x), max(x))) |>
  c(lapply(select_if(mtcars_selected4, is.factor), unique)) |>
  expand.grid()
nomogram_outputs4 <- data.frame(output = predict(model4, nomogram_features4))
nomogram5 <- create_nomogram(nomogram_features4, nomogram_outputs4, est = TRUE)
# Prepare data and model for SHAP value calculation using iml
X4 <- mtcars_mixed2[, -which(names(mtcars_mixed2) == "wt")]
# Create a predictor object
predictor4 <- Predictor$new(model = model4, data = X4)
# Calculate SHAP values
nomogram_shaps4 <- list()
for(i in seq(nrow(nomogram_features4))){
shapley4 <- Shapley$new(predictor4, x.interest = nomogram_features4[i, ])
nomogram_shaps4[[i]] <- shapley4$results
}
names(nomogram_shaps4) <- seq(nrow(nomogram_features4))
nomogram_shaps4 <- reduce(imap(nomogram_shaps4, ~ mutate(.x, i = .y)), rbind) |>
  select(i, feature, phi) |>
  spread(feature, phi) |>
  arrange(as.numeric(i)) |>
  column_to_rownames(var = "i")
nomogram5_with_shap <-
  create_nomogram(
    nomogram_features4, nomogram_outputs4, nomogram_shaps4
    , est = TRUE
  )
sessionInfo()
## R version 4.4.1 (2024-06-14)
## Platform: aarch64-unknown-linux-gnu
## Running under: Ubuntu 22.04.5 LTS
##
## Matrix products: default
## BLAS: /usr/lib/aarch64-linux-gnu/openblas-pthread/libblas.so.3
## LAPACK: /usr/lib/aarch64-linux-gnu/openblas-pthread/libopenblasp-r0.3.20.so; LAPACK version 3.10.0
##
## locale:
## [1] LC_CTYPE=en_US.UTF-8 LC_NUMERIC=C
## [3] LC_TIME=en_US.UTF-8 LC_COLLATE=C
## [5] LC_MONETARY=en_US.UTF-8 LC_MESSAGES=en_US.UTF-8
## [7] LC_PAPER=en_US.UTF-8 LC_NAME=C
## [9] LC_ADDRESS=C LC_TELEPHONE=C
## [11] LC_MEASUREMENT=en_US.UTF-8 LC_IDENTIFICATION=C
##
## time zone: Etc/UTC
## tzcode source: system (glibc)
##
## attached base packages:
## [1] stats graphics grDevices utils datasets methods base
##
## other attached packages:
## [1] rmlnomogram_0.1.2 iml_0.11.3 caret_6.0-94 lattice_0.22-6
## [5] lubridate_1.9.3 forcats_1.0.0 stringr_1.5.1 dplyr_1.1.4
## [9] purrr_1.0.2 readr_2.1.5 tidyr_1.3.1 tibble_3.2.1
## [13] ggplot2_3.5.1 tidyverse_2.0.0
##
## loaded via a namespace (and not attached):
## [1] tidyselect_1.2.1 timeDate_4041.110 farver_2.1.2
## [4] fastmap_1.2.0 pROC_1.18.5 digest_0.6.37
## [7] rpart_4.1.23 timechange_0.3.0 lifecycle_1.0.4
## [10] survival_3.6-4 magrittr_2.0.3 compiler_4.4.1
## [13] rlang_1.1.4 sass_0.4.9 tools_4.4.1
## [16] utf8_1.2.4 yaml_2.3.10 data.table_1.16.2
## [19] ggsignif_0.6.4 knitr_1.48 labeling_0.4.3
## [22] plyr_1.8.9 abind_1.4-8 withr_3.0.2
## [25] Metrics_0.1.4 nnet_7.3-19 grid_4.4.1
## [28] stats4_4.4.1 fansi_1.0.6 ggpubr_0.6.0
## [31] colorspace_2.1-1 future_1.34.0 globals_0.16.3
## [34] scales_1.3.0 iterators_1.0.14 MASS_7.3-60.2
## [37] cli_3.6.3 rmarkdown_2.28 generics_0.1.3
## [40] rstudioapi_0.17.1 future.apply_1.11.3 reshape2_1.4.4
## [43] tzdb_0.4.0 cachem_1.1.0 splines_4.4.1
## [46] parallel_4.4.1 vctrs_0.6.5 hardhat_1.4.0
## [49] Matrix_1.7-0 carData_3.0-5 jsonlite_1.8.9
## [52] car_3.1-3 hms_1.1.3 rstatix_0.7.2
## [55] Formula_1.2-5 listenv_0.9.1 foreach_1.5.2
## [58] gower_1.0.1 jquerylib_0.1.4 recipes_1.1.0
## [61] glue_1.8.0 parallelly_1.38.0 codetools_0.2-20
## [64] cowplot_1.1.3 stringi_1.8.4 gtable_0.3.6
## [67] munsell_0.5.1 pillar_1.9.0 htmltools_0.5.8.1
## [70] ipred_0.9-15 lava_1.8.0 R6_2.5.1
## [73] evaluate_1.0.1 highr_0.11 backports_1.5.0
## [76] broom_1.0.7 bslib_0.8.0 class_7.3-22
## [79] Rcpp_1.0.13 nlme_3.1-164 prodlim_2024.06.25
## [82] checkmate_2.3.2 xfun_0.48 pkgconfig_2.0.3
## [85] ModelMetrics_1.2.2.2