---
title: "GeoTox Package Data"
vignette: >
  %\VignetteIndexEntry{GeoTox Package Data}
  %\VignetteEngine{quarto::html}
  %\VignetteEncoding{UTF-8}
format:
  html:
    toc: true
    toc-expand: true
execute:
  eval: false
---

The GeoTox package includes example data `geo_tox_data`. This article provides a description of the data and example code for how it was gathered.

:::{.callout-note}
The websites linked in this vignette were last accessed on March 20, 2026. The latest R package versions were used at that time.
:::

:::{.callout-warning}
The package data are being pulled from various sources and are connected using FIPS codes. FIPS codes are a simple way to connect data, but they can change. For example, Connecticut began the process of going from 8 legacy counties to 9 planning regions starting in 2022 and became effective in 2024.
:::

# Setup

Load packages and create an empty list to store data.

```{r}
#| label: setup
library(dplyr)
library(httk)
library(httr2)
library(purrr)
library(readr)
library(readxl)
library(sf)
library(stringr)
library(tibble)
library(tidyr)

geo_tox_data <- list()
```

# Chemicals

## External exposure

Download modeled exposure data from [AirToxScreen](https://www.epa.gov/AirToxScreen){target="_blank" rel="noopener noreferrer"}. Results from 2020 for a subset of chemicals in North Carolina counties are included in the GeoTox package data as `geo_tox_data$exposure`.

:::{.callout-note}
The [2020](https://www.epa.gov/AirToxScreen/2020-airtoxscreen-assessment-results){target="_blank" rel="noopener noreferrer"} version has census block level data, which is provided in several large files separated by region. The [2019](https://www.epa.gov/AirToxScreen/2019-airtoxscreen-assessment-results){target="_blank" rel="noopener noreferrer"} version has census tract level data, which is provided as a single file and is much smaller to download.
:::

```{r}
#| label: exposure
filename <- "Region4b_2020ATS_Exposure_Concentrations.xlsx"
tmp <- tempfile(filename)
download.file(
  paste0(
    "https://gaftp.epa.gov/rtrmodeling_public/AirToxScreen/2020/",
    "Exposure%20Concentrations/",
    filename
  ),
  tmp
)
exposure <- read_xlsx(tmp)

# Normalization function
min_max_norm = function(x) {
  min_x <- min(x, na.rm = TRUE)
  max_x <- max(x, na.rm = TRUE)
  if (min_x == max_x) {
    rep(0, length(x))
  } else {
    (x - min_x) / (max_x - min_x)
  }
}

geo_tox_data$exposure <- exposure |>
  # North Carolina counties
  filter(State == "NC", !grepl("0$", FIPS)) |>
  # Aggregate chemicals by county
  summarize(
    across(
      -c(State:Population), # Note: use "-c(State:Tract)" for 2019 data
      list(mean = mean, sd = sd),
      .names = "{col}|||{fn}"
    ),
    .by = FIPS
  ) |>
  pivot_longer(-FIPS) |>
  separate_wider_delim(name, "|||", names = c("chemical", "stat")) |>
  pivot_wider(names_from = stat) |>
  # Normalize concentrations
  mutate(norm = min_max_norm(mean), .by = chemical)
```

### Get chemical identifiers from CompTox Dashboard

The exposure data is labeled with chemical names, but names can vary across resources and a more reliable way to connect datasets is by using Chemical Abstracts Service Registry Number (CAS-RN) identifiers. Use the batch search feature of the [CompTox Dashboard](https://comptox.epa.gov/dashboard/batch-search){target="_blank" rel="noopener noreferrer"} to retrieve CAS-RN and PREFERRED_NAME for the chemicals in the exposure dataset.

```{r}
#| label: comptox
# Copy/paste the chemical names into the batch search field
# https://comptox.epa.gov/dashboard/batch-search
cat(geo_tox_data$exposure |> distinct(chemical) |> pull(), sep = "\n")
# Export results with "CAS-RN" identifiers as a csv file, then process in R

# Remove rows without clear chemical matches, investigate manually if desired
exposure_casrn <- read_csv("CCD-Batch-Search.csv") |>
  filter(DTXSID != "N/A", !grepl("WARNING", FOUND_BY))

# Update exposure data with CompTox Dashboard data
geo_tox_data$exposure <- geo_tox_data$exposure |>
  inner_join(exposure_casrn, by = join_by(chemical == INPUT)) |>
  select(FIPS, casn = CASRN, chnm = PREFERRED_NAME, mean, sd, norm)
```

## Active chemicals

Use the Integrated Chemical Environment ([ICE](https://ice.ntp.niehs.nih.gov/){target="_blank" rel="noopener noreferrer"})
curated high-throughput screening ([cHTS](https://ice.ntp.niehs.nih.gov/DATASETDESCRIPTION?section=cHTS){target="_blank" rel="noopener noreferrer"}) data to identify active chemicals for a given set of assays.

:::{.callout-note}
The following can be used to get a list of available assays for a given chemical set.

```{r}
#| label: available-assays
# Get all supported assays
help_text <- request("https://ice.ntp.niehs.nih.gov/api/v1/search/help") |>
  req_perform() |>
  resp_body_string()
supported_assays <- str_split_i(help_text, "Supported assay\\(s\\):", 2) |>
  str_split_1("\",\"") |>
  str_replace_all(c("\"" = "")) |>
  str_trim()

# Search for assays available for given chemids
chemids <- unique(geo_tox_data$exposure$casn)
resp <- request("https://ice.ntp.niehs.nih.gov/api/v1/search") |>
  req_body_json(list(chemids = chemids, assays = supported_assays)) |>
  req_perform()
result <- resp |> resp_body_json() |> pluck("endPoints")
fields <- c("assay", "casrn", "dtxsid", "substanceName", "endpoint", "value")
df <- map(fields, \(x) map_chr(result, x)) |>
  set_names(fields) |>
  bind_cols()
available_assays <- df |> distinct(assay) |> pull() |> sort()
```
:::

```{r}
#| label: cHTS-hits
get_cHTS_hits <- function(assays = NULL, chemids = NULL) {
  if (is.null(assays) & is.null(chemids)) {
    stop("Must provide at least one of 'assays' or 'chemids'")
  }

  # Format query parameters
  req_params <- list()

  if (!is.null(assays)) {
    if (!is.list(assays)) assays <- as.list(assays)
    req_params$assays <- assays
  }

  if (!is.null(chemids)) {
    if (!is.list(chemids)) chemids <- as.list(chemids)
    req_params$chemids <- chemids
  }

  # Query ICE API
  resp <- request("https://ice.ntp.niehs.nih.gov/api/v1/search") |>
    req_body_json(req_params) |>
    req_perform()

  if (resp$status_code != 200) {
    stop("Failed to retrieve data from ICE API")
  }

  # Return active chemicals
  result <- resp |> resp_body_json() |> pluck("endPoints")

  fields <- c("assay", "casrn", "dtxsid", "substanceName", "endpoint", "value")

  map(fields, \(x) map_chr(result, x)) |>
    set_names(fields) |>
    bind_cols() |>
    filter(endpoint == "Call", value == "Active") |>
    select(-c(endpoint, value)) |>
    distinct()
}

assays <- c(
  "APR_HepG2_p53Act_1hr",
  "APR_HepG2_p53Act_24hr",
  "APR_HepG2_p53Act_72hr",
  "ATG_p53_CIS",
  "TOX21_DT40_LUC",
  "TOX21_DT40_100_LUC",
  "TOX21_DT40_657_LUC",
  "TOX21_ELG1_LUC_Agonist",
  "TOX21_H2AX_HTRF_CHO_Agonist_ratio",
  "TOX21_p53_BLA_p1_ratio",
  "TOX21_p53_BLA_p2_ratio",
  "TOX21_p53_BLA_p3_ratio",
  "TOX21_p53_BLA_p4_ratio",
  "TOX21_p53_BLA_p5_ratio"
)

chemids <- unique(geo_tox_data$exposure$casn)

cHTS_hits <- get_cHTS_hits(assays = assays, chemids = chemids)
```

## Dose-response

Use the ICE API to retrieve dose-response data for selected assays and chemicals, which is included in the GeoTox package data as `geo_tox_data$dose_response`.

```{r}
#| label: dose-response
get_ICE_dose_resp <- function(assays = NULL, chemids = NULL) {
  if (is.null(assays) & is.null(chemids)) {
    stop("Must provide at least one of 'assays' or 'chemids'")
  }

  # Format query parameters
  req_params <- list()

  if (!is.null(assays)) {
    if (!is.list(assays)) assays <- as.list(assays)
    req_params$assays <- assays
  }

  if (!is.null(chemids)) {
    if (!is.list(chemids)) chemids <- as.list(chemids)
    req_params$chemids <- chemids
  }

  # Query ICE API
  resp <- request("https://ice.ntp.niehs.nih.gov/api/v1/curves") |>
    req_body_json(req_params) |>
    req_perform()

  if (resp$status_code != 200) {
    stop("Failed to retrieve data from ICE API")
  }

  # Return dose-response data
  result <- resp |> resp_body_json() |> pluck("curves")

  map(result, function(x) {
    tibble(
      endp = x[["endpoint"]],
      casn = x[["casrn"]],
      chnm = x[["substance"]],
      call = x[["call"]],
      logc = map_dbl(x[["concentrationResponses"]], "concentration") |> log10(),
      resp = map_dbl(x[["concentrationResponses"]], "response")
    )
  }) |>
    bind_rows()
}

assays <- unique(cHTS_hits$assay)

chemids <- intersect(cHTS_hits$casrn, geo_tox_data$exposure$casn)

dose_response <- get_ICE_dose_resp(assays = assays, chemids = chemids)

# Only keep active calls for assay/chemical combinations
geo_tox_data$dose_response <- dose_response |>
  filter(call == "Active") |>
  select(-call)

# Update dose-response chemical names using CompTox Dashboard data
geo_tox_data$dose_response <- geo_tox_data$dose_response |>
  inner_join(exposure_casrn, by = join_by(casn == CASRN)) |>
  select(endp, casn, chnm = PREFERRED_NAME, logc, resp)
```

# Population

## Age

Download age data from the [U.S. Census Bureau](https://www.census.gov/){target="_blank" rel="noopener noreferrer"} by searching for "County Population by Characteristics". A subset of data for North Carolina from [2020](https://www.census.gov/programs-surveys/popest/technical-documentation/research/evaluation-estimates/2020-evaluation-estimates/2010s-county-detail.html){target="_blank" rel="noopener noreferrer"} is included in the GeoTox package data as `geo_tox_data$age`.

```{r}
#| label: age
# Data for North Carolina
url <- paste0(
  "https://www2.census.gov/programs-surveys/popest/datasets/",
  "2020-2024/counties/asrh/cc-est2024-alldata-37.csv"
)
age <- read_csv(url)

geo_tox_data$age <- age |>
  # 7/1/2020 population estimate
  filter(YEAR == 2) |>
  # Create FIPS
  mutate(FIPS = str_c(STATE, COUNTY)) |>
  # Keep selected columns
  select(FIPS, AGEGRP, TOT_POP)
```

## Obesity status

Search [CDC data](https://data.cdc.gov/){target="_blank" rel="noopener noreferrer"} for "places county data". Go to the desired dataset webpage, e.g. [2020 county data](https://data.cdc.gov/500-Cities-Places/PLACES-County-Data-GIS-Friendly-Format-2020-releas/mssc-ksj7/about_data){target="_blank" rel="noopener noreferrer"}, and download the data by selecting Actions &rarr; API &rarr; Download file. A subset of data for North Carolina is included in the GeoTox package data as `geo_tox_data$obesity`.

```{r}
#| label: obesity-status
places <- read_csv(
  "PLACES__County_Data_(GIS_Friendly_Format),_2020_release.csv"
)

# Convert confidence interval to standard deviation
extract_SD <- function(x) {
  range <- as.numeric(str_split_1(str_sub(x, 2, -2), ","))
  diff(range) / 3.92
}

geo_tox_data$obesity <- places |>
  # North Carolina Counties
  filter(StateAbbr == "NC") |>
  # Select obesity data
  select(FIPS = CountyFIPS, OBESITY_CrudePrev, OBESITY_Crude95CI) |>
  # Change confidence interval to standard deviation
  rowwise() |>
  mutate(OBESITY_SD = extract_SD(OBESITY_Crude95CI)) |>
  ungroup() |>
  select(-OBESITY_Crude95CI)
```

## Steady-state plasma concentration (C_ss)

Use the `httk` package to generate C_ss values for combinations of age group and weight status for each chemical. The generation of these values is a time-intensive step, so one approach is to generate populations of C_ss values initially and then sample them later. The simulated C_ss values are included in the GeoTox package data as `geo_tox_data$simulated_css`.

```{r}
#| label: C_ss
set.seed(2345)
n_samples <- 500

# Get CASN for which httk simulation is possible. Try using load_dawson2021,
# load_sipes2017, or load_pradeep2020 to increase availability.
load_sipes2017()
casn <- intersect(
  unique(geo_tox_data$dose_response$casn),
  get_cheminfo(suppress.messages = TRUE)
)

# Define population demographics for httk simulation
pop_demo <- cross_join(
  tibble(age_group = list(
    c(0, 2), c(3, 5), c(6, 10), c(11, 15), c(16, 20), c(21, 30), c(31, 40),
    c(41, 50), c(51, 60), c(61, 70), c(71, 100)
  )),
  tibble(weight = c("Normal", "Obese"))
)

# Create wrapper function around httk steps
simulate_css <- function(
  chem.cas,
  agelim_years,
  weight_category,
  samples,
  verbose = TRUE
) {
  if (verbose) {
    cat(
      chem.cas,
      paste0("(", paste(agelim_years, collapse = ", "), ")"),
      weight_category,
      "\n"
    )
  }

  httkpop <- list(
    method = "vi",
    gendernum = NULL,
    agelim_years = agelim_years,
    agelim_months = NULL,
    weight_category = weight_category,
    reths = c(
      "Mexican American",
      "Other Hispanic",
      "Non-Hispanic White",
      "Non-Hispanic Black",
      "Other"
    )
  )

  css <- try(
    suppressWarnings({
      mcs <- create_mc_samples(
        chem.cas = chem.cas,
        samples = samples,
        httkpop.generate.arg.list = httkpop,
        suppress.messages = TRUE
      )

      calc_analytic_css(
        chem.cas = chem.cas,
        parameters = mcs,
        model = "3compartmentss",
        suppress.messages = TRUE
      )
    }),
    silent = TRUE
  )

  # Return
  if (is(css, "try-error")) {
    warning("simulate_css() failed to generate data for CASN ", chem.cas)
    list(NA)
  } else {
    list(css)
  }
}

# Simulate C_ss values
simulated_css <- map(casn, function(chem.cas) {
  pop_demo |>
    rowwise() |>
    mutate(
      css = simulate_css(.env$chem.cas, age_group, weight, .env$n_samples)
    ) |>
    ungroup()
})
simulated_css <- setNames(simulated_css, casn)

# Remove CASN that failed simulate_css
casn_keep <- map_lgl(simulated_css, function(df) {
  !(length(df$css[[1]]) == 1 && is.na(df$css[[1]]))
})
simulated_css <- simulated_css[casn_keep]

geo_tox_data$simulated_css <- simulated_css |>
  map(\(x) {
    x |>
      mutate(
        age_lb = map_int(age_group, first),
        age_ub = map_int(age_group, last)
      ) |>
      select(age_lb, age_ub, weight, css) |>
      unnest(css)
  }) |>
  enframe(name = "casn") |>
  unnest(value)
```

# Retain overlap

Retain only those chemicals found in exposure, dose-response and C_ss datasets.

```{r}
#| label: retain
casn_keep <- unique(geo_tox_data$simulated_css$casn)

geo_tox_data$exposure <- geo_tox_data$exposure |>
  filter(casn %in% casn_keep)

geo_tox_data$dose_response <- geo_tox_data$dose_response |>
  filter(casn %in% casn_keep)
```

# County/State boundaries

Download cartographic boundary files for counties and states from the [U.S. Census Bureau](https://www.census.gov/geographies/mapping-files/time-series/geo/cartographic-boundary.html){target="_blank" rel="noopener noreferrer"}. The geometry data for North Carolina counties and the state are included in the GeoTox package data as `geo_tox_data$boundaries`.

```{r}
#| label: boundaries
county <- st_read("cb_2020_us_county_5m/cb_2020_us_county_5m.shp")
state <- st_read("cb_2020_us_state_5m/cb_2020_us_state_5m.shp")

geo_tox_data$boundaries <- list(
  county = county |>
    filter(STATEFP == 37) |>
    select(FIPS = GEOID, geometry),
  state = state |>
    filter(STATEFP == 37) |>
    select(geometry)
)
```
