---
title: "Demonstration of Supported Models"
output: rmarkdown::html_vignette
vignette: >
  %\VignetteIndexEntry{Supported Models with the Zenodo Street-View Density Data}
  %\VignetteEngine{knitr::rmarkdown}
  %\VignetteEncoding{UTF-8}
---

```{r setup, include = FALSE}
knitr::opts_chunk$set(
  collapse = TRUE,
  comment = "#>",
  eval = FALSE,
  warning = FALSE,
  message = FALSE
)
```

This vignette demonstrates all currently supported `gopher` engines with the
Zenodo dataset at
`https://zenodo.org/records/10120281/files/gridded_OSM_GSV.RDS`.
The file contains polygon geometries, uses every `dens_*` column as a
covariate, and fits separate models for the two air-pollution responses `NO`
and `NO2`.

Dataset is from:

> Chambliss, S. et al. (2024). Local exposure misclassification in national models: relationships with urban infrastructure and demographics. _Journal of Exposure Science and Environmental Epidemiology_ 34, 761-769. https://doi.org/10.1038/s41370-023-00624-z


For this dataset, the most reliable workflow is to convert each polygon to one
representative point, project to UTM zone 10N, and work in kilometres. That
avoids row-mismatch failures in point-based GP engines and keeps range-related
parameters on a friendlier numerical scale.

The chunks are shown but not executed during vignette build because they depend
on an external download and several optional engine packages.

The same workflow is applied to all six engines:

- `gstat`
- `fields`
- `GPvecchia`
- `spNNGP`
- `PrestoGP`
- `sdmTMB`

## Load the data

```{r}
library(gopher)
library(parsnip)
library(sf)

data_path <- system.file("extdata", "gridded_gsv.rds", package = "gopher")
gsv_sf <- readRDS(data_path)

stopifnot(inherits(gsv_sf, "sf"))
stopifnot(all(c("NO", "NO2") %in% names(gsv_sf)))

covariate_vars <- grep("^dens_", names(gsv_sf), value = TRUE)
response_vars <- c("NO", "NO2")

length(covariate_vars)
covariate_vars
```

## Convert polygons to point locations

Several GP engines expect one coordinate pair per row. Because the Zenodo
object is polygon-based, we derive one representative point per feature and add
explicit `x`/`y` columns in projected kilometres.

```{r}
gsv_pt <- gsv_sf |>
  sf::st_transform(32610) |>
  sf::st_point_on_surface()

xy <- sf::st_coordinates(gsv_pt)

gsv_df <- sf::st_drop_geometry(gsv_pt)
gsv_df$x <- xy[, "X"] / 1000
gsv_df$y <- xy[, "Y"] / 1000
```

## Prepare a reusable modelling split

The helper below keeps only complete cases for a chosen response and the full
set of `dens_*` covariates, removes duplicated point locations, downsamples to
a vignette-sized dataset, and then creates a simple training/test split.

```{r}
set.seed(2026)

prepare_split <- function(data, response, covariates, prop = 0.8, max_n = 300) {
  keep <- stats::complete.cases(
    data[, c(response, covariates, "x", "y"), drop = FALSE]
  )
  data_complete <- data[keep, c(response, covariates, "x", "y")]
  data_complete <- data_complete[!duplicated(data_complete[, c("x", "y")]), ]

  if (nrow(data_complete) > max_n) {
    data_complete <- data_complete[
      sample.int(nrow(data_complete), size = max_n),
      ,
      drop = FALSE
    ]
  }

  n <- nrow(data_complete)
  train_id <- sample.int(n, size = floor(prop * n))

  list(
    data = data_complete,
    train = data_complete[train_id, , drop = FALSE],
    test = data_complete[-train_id, , drop = FALSE],
    formula = stats::reformulate(covariates, response = response)
  )
}

no_split <- prepare_split(gsv_df, "NO", covariate_vars)
no2_split <- prepare_split(gsv_df, "NO2", covariate_vars)
```

## Define model specifications for every supported engine

The engines share the same top-level `gaussian_process_spatial()` specification,
but each backend gets engine-specific tuning arguments that are conservative
enough to serve as a stable starting point for this dataset.

```{r}
make_engine_specs <- function(train_data) {
  n_train <- nrow(train_data)

  list(
    gstat = gaussian_process_spatial(covariance_function = "matern") |>
      set_engine(
        "gstat",
        range = 500,
        psill = 120
      ),

    fields = gaussian_process_spatial(covariance_function = "matern") |>
      set_engine("fields"),

    GPvecchia = gaussian_process_spatial(
      covariance_function = NULL
    ) |>
      set_engine(
        "GPvecchia",
        m = max(50L, min(30L, n_train - 1L))
      ),

    spNNGP = gaussian_process_spatial(
      covariance_function = NULL,
      range = 2000
    ) |>
      set_engine(
        "spNNGP",
        covariance_function = "matern",
        n_neighbors = max(50L, min(15L, n_train - 1L)),
        n_samples = 1000L,
        n_burnin = 500L
      ),

    PrestoGP = gaussian_process_spatial(covariance_function = "matern") |>
      set_engine(
        "PrestoGP",
        n_neighbors = max(50L, min(20L, n_train - 1L)),
        model_type = "vecchia",
        quiet = TRUE
      ),

    sdmTMB = gaussian_process_spatial(covariance_function = "matern") |>
      set_engine(
        "sdmTMB",
        spatial = "on",
        n_knots = 80
      )
  )
}
```

## Fit every engine for `NO`

```{r}
engine_specs_no <- make_engine_specs(no_split$train)

fits_no <- lapply(engine_specs_no, function(spec) {
  fit(spec, no_split$formula, data = no_split$train)
})

preds_no <- lapply(fits_no, function(mod) {
  predict(mod, new_data = no_split$test)
})

results_no <- data.frame(
  engine = names(preds_no),
  rmse = vapply(
    preds_no,
    function(x) {
      sqrt(mean((no_split$test$NO - x$.pred)^2))
    },
    numeric(1)
  )
)

results_no[order(results_no$rmse), ]
```

If you also want interval predictions:

```{r}
pred_int_no_gstat <- predict(
  fits_no$sdmTMB,
  new_data = no_split$test,
  type = "pred_int"
)

head(pred_int_no_gstat)
```

## Fit every engine for `NO2`

```{r}
engine_specs_no2 <- make_engine_specs(no2_split$train)

fits_no2 <- lapply(engine_specs_no2, function(spec) {
  fit(spec, no2_split$formula, data = no2_split$train)
})

preds_no2 <- lapply(fits_no2, function(mod) {
  predict(mod, new_data = no2_split$test)
})

results_no2 <- data.frame(
  engine = names(preds_no2),
  rmse = vapply(
    preds_no2,
    function(x) {
      sqrt(mean((no2_split$test$NO2 - x$.pred)^2))
    },
    numeric(1)
  )
)

results_no2[order(results_no2$rmse), ]
```

## Run both responses in one loop

If you want the full benchmark in a single object, wrap the split, fit, and
evaluation steps into one helper.

```{r}
fit_all_supported_models <- function(data, response, covariates) {
  split <- prepare_split(data, response, covariates)
  specs <- make_engine_specs(split$train)

  fits <- lapply(specs, function(spec) {
    fit(spec, split$formula, data = split$train)
  })

  preds <- lapply(fits, function(mod) {
    predict(mod, new_data = split$test)
  })

  data.frame(
    response = response,
    engine = names(preds),
    rmse = vapply(
      preds,
      function(x) {
        sqrt(mean((split$test[[response]] - x$.pred)^2))
      },
      numeric(1)
    ),
    mae = vapply(
      preds,
      function(x) {
        mean(abs(split$test[[response]] - x$.pred))
      },
      numeric(1)
    ),
    row.names = NULL
  )
}

benchmark_table <- do.call(
  rbind,
  lapply(response_vars, function(resp) {
    fit_all_supported_models(gsv_df, resp, covariate_vars)
  })
)


benchmark_table[order(benchmark_table$response, benchmark_table$rmse), ]
```

```{r}
# plot benchmark results
library(ggplot2)
ggplot(benchmark_table, aes(x = engine, y = rmse, fill = engine)) +
  geom_point() +
  geom_pointrange(aes(ymin = 0, ymax = rmse), size = 0.5) +
  facet_wrap(~ response, scales = "free_y") +
  theme_minimal() +
  theme(legend.position = "none") +
  labs(
    title = "RMSE of Supported GP Engines on the Zenodo Street-View Density Data",
    x = "GP Engine",
    y = "Root Mean Squared Error (RMSE)"
  )
```


## Notes

- The code intentionally discovers covariates with `grep("^dens_", ...)`, so it
  stays in sync if the Zenodo file is updated with more density features.
- `NO` and `NO2` are fitted separately because `gaussian_process_spatial()` is a
  univariate regression interface.
- The raw Zenodo file is polygon-based, so the vignette converts it to
  representative point locations before fitting point-referenced GP models.
- The coordinates are projected to UTM zone 10N and rescaled to kilometres so
  range-related parameters are on a friendlier numerical scale.
- For larger subsets, `GPvecchia`, `spNNGP`, `PrestoGP`, and `sdmTMB` will
  generally be the more scalable backends to try first.