--- title: "Demonstration of Supported Models" output: rmarkdown::html_vignette vignette: > %\VignetteIndexEntry{Supported Models with the Zenodo Street-View Density Data} %\VignetteEngine{knitr::rmarkdown} %\VignetteEncoding{UTF-8} --- ```{r setup, include = FALSE} knitr::opts_chunk$set( collapse = TRUE, comment = "#>", eval = FALSE, warning = FALSE, message = FALSE ) ``` This vignette demonstrates all currently supported `gopher` engines with the Zenodo dataset at `https://zenodo.org/records/10120281/files/gridded_OSM_GSV.RDS`. The file contains polygon geometries, uses every `dens_*` column as a covariate, and fits separate models for the two air-pollution responses `NO` and `NO2`. Dataset is from: > Chambliss, S. et al. (2024). Local exposure misclassification in national models: relationships with urban infrastructure and demographics. _Journal of Exposure Science and Environmental Epidemiology_ 34, 761-769. https://doi.org/10.1038/s41370-023-00624-z For this dataset, the most reliable workflow is to convert each polygon to one representative point, project to UTM zone 10N, and work in kilometres. That avoids row-mismatch failures in point-based GP engines and keeps range-related parameters on a friendlier numerical scale. The chunks are shown but not executed during vignette build because they depend on an external download and several optional engine packages. The same workflow is applied to all six engines: - `gstat` - `fields` - `GPvecchia` - `spNNGP` - `PrestoGP` - `sdmTMB` ## Load the data ```{r} library(gopher) library(parsnip) library(sf) data_path <- system.file("extdata", "gridded_gsv.rds", package = "gopher") gsv_sf <- readRDS(data_path) stopifnot(inherits(gsv_sf, "sf")) stopifnot(all(c("NO", "NO2") %in% names(gsv_sf))) covariate_vars <- grep("^dens_", names(gsv_sf), value = TRUE) response_vars <- c("NO", "NO2") length(covariate_vars) covariate_vars ``` ## Convert polygons to point locations Several GP engines expect one coordinate pair per row. Because the Zenodo object is polygon-based, we derive one representative point per feature and add explicit `x`/`y` columns in projected kilometres. ```{r} gsv_pt <- gsv_sf |> sf::st_transform(32610) |> sf::st_point_on_surface() xy <- sf::st_coordinates(gsv_pt) gsv_df <- sf::st_drop_geometry(gsv_pt) gsv_df$x <- xy[, "X"] / 1000 gsv_df$y <- xy[, "Y"] / 1000 ``` ## Prepare a reusable modelling split The helper below keeps only complete cases for a chosen response and the full set of `dens_*` covariates, removes duplicated point locations, downsamples to a vignette-sized dataset, and then creates a simple training/test split. ```{r} set.seed(2026) prepare_split <- function(data, response, covariates, prop = 0.8, max_n = 300) { keep <- stats::complete.cases( data[, c(response, covariates, "x", "y"), drop = FALSE] ) data_complete <- data[keep, c(response, covariates, "x", "y")] data_complete <- data_complete[!duplicated(data_complete[, c("x", "y")]), ] if (nrow(data_complete) > max_n) { data_complete <- data_complete[ sample.int(nrow(data_complete), size = max_n), , drop = FALSE ] } n <- nrow(data_complete) train_id <- sample.int(n, size = floor(prop * n)) list( data = data_complete, train = data_complete[train_id, , drop = FALSE], test = data_complete[-train_id, , drop = FALSE], formula = stats::reformulate(covariates, response = response) ) } no_split <- prepare_split(gsv_df, "NO", covariate_vars) no2_split <- prepare_split(gsv_df, "NO2", covariate_vars) ``` ## Define model specifications for every supported engine The engines share the same top-level `gaussian_process_spatial()` specification, but each backend gets engine-specific tuning arguments that are conservative enough to serve as a stable starting point for this dataset. ```{r} make_engine_specs <- function(train_data) { n_train <- nrow(train_data) list( gstat = gaussian_process_spatial(covariance_function = "matern") |> set_engine( "gstat", range = 500, psill = 120 ), fields = gaussian_process_spatial(covariance_function = "matern") |> set_engine("fields"), GPvecchia = gaussian_process_spatial( covariance_function = NULL ) |> set_engine( "GPvecchia", m = max(50L, min(30L, n_train - 1L)) ), spNNGP = gaussian_process_spatial( covariance_function = NULL, range = 2000 ) |> set_engine( "spNNGP", covariance_function = "matern", n_neighbors = max(50L, min(15L, n_train - 1L)), n_samples = 1000L, n_burnin = 500L ), PrestoGP = gaussian_process_spatial(covariance_function = "matern") |> set_engine( "PrestoGP", n_neighbors = max(50L, min(20L, n_train - 1L)), model_type = "vecchia", quiet = TRUE ), sdmTMB = gaussian_process_spatial(covariance_function = "matern") |> set_engine( "sdmTMB", spatial = "on", n_knots = 80 ) ) } ``` ## Fit every engine for `NO` ```{r} engine_specs_no <- make_engine_specs(no_split$train) fits_no <- lapply(engine_specs_no, function(spec) { fit(spec, no_split$formula, data = no_split$train) }) preds_no <- lapply(fits_no, function(mod) { predict(mod, new_data = no_split$test) }) results_no <- data.frame( engine = names(preds_no), rmse = vapply( preds_no, function(x) { sqrt(mean((no_split$test$NO - x$.pred)^2)) }, numeric(1) ) ) results_no[order(results_no$rmse), ] ``` If you also want interval predictions: ```{r} pred_int_no_gstat <- predict( fits_no$sdmTMB, new_data = no_split$test, type = "pred_int" ) head(pred_int_no_gstat) ``` ## Fit every engine for `NO2` ```{r} engine_specs_no2 <- make_engine_specs(no2_split$train) fits_no2 <- lapply(engine_specs_no2, function(spec) { fit(spec, no2_split$formula, data = no2_split$train) }) preds_no2 <- lapply(fits_no2, function(mod) { predict(mod, new_data = no2_split$test) }) results_no2 <- data.frame( engine = names(preds_no2), rmse = vapply( preds_no2, function(x) { sqrt(mean((no2_split$test$NO2 - x$.pred)^2)) }, numeric(1) ) ) results_no2[order(results_no2$rmse), ] ``` ## Run both responses in one loop If you want the full benchmark in a single object, wrap the split, fit, and evaluation steps into one helper. ```{r} fit_all_supported_models <- function(data, response, covariates) { split <- prepare_split(data, response, covariates) specs <- make_engine_specs(split$train) fits <- lapply(specs, function(spec) { fit(spec, split$formula, data = split$train) }) preds <- lapply(fits, function(mod) { predict(mod, new_data = split$test) }) data.frame( response = response, engine = names(preds), rmse = vapply( preds, function(x) { sqrt(mean((split$test[[response]] - x$.pred)^2)) }, numeric(1) ), mae = vapply( preds, function(x) { mean(abs(split$test[[response]] - x$.pred)) }, numeric(1) ), row.names = NULL ) } benchmark_table <- do.call( rbind, lapply(response_vars, function(resp) { fit_all_supported_models(gsv_df, resp, covariate_vars) }) ) benchmark_table[order(benchmark_table$response, benchmark_table$rmse), ] ``` ```{r} # plot benchmark results library(ggplot2) ggplot(benchmark_table, aes(x = engine, y = rmse, fill = engine)) + geom_point() + geom_pointrange(aes(ymin = 0, ymax = rmse), size = 0.5) + facet_wrap(~ response, scales = "free_y") + theme_minimal() + theme(legend.position = "none") + labs( title = "RMSE of Supported GP Engines on the Zenodo Street-View Density Data", x = "GP Engine", y = "Root Mean Squared Error (RMSE)" ) ``` ## Notes - The code intentionally discovers covariates with `grep("^dens_", ...)`, so it stays in sync if the Zenodo file is updated with more density features. - `NO` and `NO2` are fitted separately because `gaussian_process_spatial()` is a univariate regression interface. - The raw Zenodo file is polygon-based, so the vignette converts it to representative point locations before fitting point-referenced GP models. - The coordinates are projected to UTM zone 10N and rescaled to kilometres so range-related parameters are on a friendlier numerical scale. - For larger subsets, `GPvecchia`, `spNNGP`, `PrestoGP`, and `sdmTMB` will generally be the more scalable backends to try first.