From 0acfe4d3bfb7154f0bd9298bd9f42e10c58ba3fd Mon Sep 17 00:00:00 2001 From: Maciej Nasinski Date: Wed, 25 Feb 2026 23:33:12 +0100 Subject: [PATCH] update vignettes --- README.md | 58 +++++++- vignettes/miceFast-intro.Rmd | 94 +++++------- vignettes/missing-data-and-imputation.Rmd | 170 +++++++++++++++++----- 3 files changed, 224 insertions(+), 98 deletions(-) diff --git a/README.md b/README.md index 22816c7..fc563d8 100644 --- a/README.md +++ b/README.md @@ -26,9 +26,9 @@ For performance details, see `performance_validity.R` in the `extdata` folder. - [Introduction and Advanced Usage](https://polkas.github.io/miceFast/articles/miceFast-intro.html) - [Missing Data Mechanisms and Multiple Imputation](https://polkas.github.io/miceFast/articles/missing-data-and-imputation.html) -## Multiple Imputation and mice +## Multiple Imputation Workflow -[mice](https://cran.r-project.org/package=mice) implements the full MI pipeline (impute, analyze, pool). **miceFast** focuses on the computationally expensive part: fitting the imputation models. Two usage modes: +[mice](https://cran.r-project.org/package=mice) implements the full MI pipeline (impute, analyze, pool). **miceFast** focuses on the computationally expensive part — fitting the imputation models — and is typically **~10× faster** than mice for the imputation step alone (see [benchmarks](#performance-highlights)). Two usage modes: 1. **MI with Rubin's rules** — call `fill_NA()` with a stochastic model (`lm_bayes`, `lm_noise`, or `lda` with a random `ridge`) in a loop to create *m* completed datasets, then `pool()` the fitted models. 
@@ -50,6 +50,8 @@ devtools::install_github("polkas/miceFast") ## Quick Example +### dplyr + ```r library(miceFast) library(dplyr) @@ -60,10 +62,7 @@ data(air_miss) # Visualize the NA structure upset_NA(air_miss, 6) -# Naive imputation (quick, but biased — see ?naive_fill_NA) -naive_fill_NA(air_miss) - -# Model-based single imputation with fill_NA +# Model-based single imputation air_miss %>% mutate(Ozone_imp = fill_NA( x = ., model = "lm_bayes", @@ -80,9 +79,46 @@ completed <- lapply(1:5, function(i) { }) fits <- lapply(completed, function(d) lm(Ozone_imp ~ Wind + Temp, data = d)) pool(fits) +#> Pooled results from 5 imputed datasets +#> Rubin's rules with Barnard-Rubin df adjustment +#> +#> term estimate std.error statistic df p.value +#> (Intercept) -62.771 23.9022 -2.626 46.95 1.162e-02 +#> Wind -3.087 0.6857 -4.502 37.24 6.420e-05 +#> Temp 1.736 0.2498 6.951 58.54 3.400e-09 +``` + +### data.table + +```r +library(miceFast) +library(data.table) + +set.seed(1234) +data(air_miss) +setDT(air_miss) + +# Single imputation +air_miss[, Ozone_imp := fill_NA( + x = .SD, model = "lm_bayes", + posit_y = "Ozone", posit_x = c("Solar.R", "Wind", "Temp") +)] + +# Grouped imputation — fits a separate model per group +air_miss[, Solar_R_imp := fill_NA( + x = .SD, model = "lm_bayes", + posit_y = "Solar.R", posit_x = c("Wind", "Temp", "Intercept") +), by = .(groups)] +``` + +### Naive imputation (baseline only) + +```r +# Quick baseline — biased, does not account for relationships between variables +naive_fill_NA(air_miss) ``` -See the [Introduction vignette](https://polkas.github.io/miceFast/articles/miceFast-intro.html) for grouped imputation, data.table syntax, the OOP interface, and more. +See the [Introduction vignette](https://polkas.github.io/miceFast/articles/miceFast-intro.html) for weights, the OOP interface, log-transformations, and more. 
--- @@ -113,6 +149,14 @@ See the [Introduction vignette](https://polkas.github.io/miceFast/articles/miceF --- +## Practical Advice + +- **Little missing data + MCAR?** Consider using `complete.cases()` — listwise deletion is unbiased under MCAR and may be sufficient when the fraction of incomplete rows is small. +- **For publication**, always run a **sensitivity analysis**: compare MI results against base methods (`complete.cases()`, mean imputation) and across different imputation models (`lm_bayes`, `lm_noise`, `pmm`). Vary the number of imputations. If conclusions change, investigate why. Report the imputation model, *m*, and any assumptions about the missing-data mechanism. +- See the [MI vignette](https://polkas.github.io/miceFast/articles/missing-data-and-imputation.html) for details on MCAR/MAR/MNAR mechanisms and a practical checklist. + +--- + ## Performance Highlights Median timings on 100k rows, 10 variables, 100 groups (R 4.4.3, macOS M3 Pro, [optimized BLAS/LAPACK](https://cran.r-project.org/bin/macosx/RMacOSX-FAQ.html#Which-BLAS-is-used-and-how-can-it-be-changed_003f)): diff --git a/vignettes/miceFast-intro.Rmd b/vignettes/miceFast-intro.Rmd index 39f317e..2c7e7e5 100644 --- a/vignettes/miceFast-intro.Rmd +++ b/vignettes/miceFast-intro.Rmd @@ -72,7 +72,9 @@ upset_NA(air_miss, 6) ## Checking for Collinearity -Before imputing, check Variance Inflation Factors. Values above ~10 suggest problematic collinearity. +Before imputing, check Variance Inflation Factors. Values above ~10 suggest +problematic collinearity that can destabilize imputation models — consider +dropping or combining redundant predictors before imputing. ```{r vif} VIF( @@ -140,7 +142,8 @@ head(result_grouped[, c("Solar.R", "Solar_R_imp", "groups")]) ## Log-transformation -For right-skewed variables (like Ozone), use `logreg = TRUE` to impute on the log scale: +For right-skewed variables (like Ozone), use `logreg = TRUE` to impute on the log scale. 
+The model fits on $\log(y)$ and back-transforms the predictions:
 
 ```{r fill-na-logreg}
 data(air_miss)
@@ -153,11 +156,15 @@ result_log <- air_miss %>%
     posit_x = c("Solar.R", "Wind", "Temp", "Intercept"),
     logreg = TRUE
   ))
+
+# Compare distributions: log imputation avoids negative values
+summary(result_log[c("Ozone", "Ozone_imp")])
 ```
 
 ## Using column position indices
 
-You can refer to columns by position instead of name:
+You can refer to columns by position instead of name.
+Check `names(air_miss)` to find the right positions:
 
 ```{r fill-na-position}
 data(air_miss)
@@ -170,7 +177,9 @@ result_pos <- air_miss %>%
     posit_x = c(4, 6),
     logreg = TRUE
   ))
-```
+
+head(result_pos[, c("Ozone", "Ozone_imp")])
+```
 
 ## Basic usage (data.table)
 
@@ -309,37 +318,6 @@ pool_res
 summary(pool_res)
 ```
 
-## MI with continuous and categorical variables
-
-For a complete MI workflow imputing both continuous and categorical variables:
-
-```{r mi-mixed}
-data(air_miss)
-
-impute_data <- function(data) {
-  data %>%
-    mutate(
-      Solar_R_imp = fill_NA(
-        x = ., model = "lm_bayes",
-        posit_y = "Solar.R",
-        posit_x = c("Wind", "Temp", "Intercept"),
-        w = weights
-      ),
-      Ozone_chac_imp = fill_NA(
-        x = ., model = "lda",
-        posit_y = "Ozone_chac",
-        posit_x = c("Wind", "Temp"),
-        ridge = runif(1, 0, 50) # random ridge makes LDA stochastic
-      )
-    )
-}
-
-set.seed(42)
-res <- replicate(n = 5, expr = impute_data(air_miss), simplify = FALSE)
-fits <- lapply(res, function(d) lm(Solar_R_imp ~ Wind + Temp, data = d))
-pool(fits)
-```
-
 ---
 
 # Full Imputation: Filling All Variables and MI with Rubin's Rules
@@ -469,21 +447,19 @@ use `get_index()` to recover the original row order.
## Simple example -```{r oop-simple, eval=requireNamespace("mice", quietly=TRUE)} -data <- cbind(as.matrix(mice::nhanes), intercept = 1, index = 1:nrow(mice::nhanes)) +```{r oop-simple} +data <- cbind(as.matrix(airquality[, 1:4]), intercept = 1, index = 1:nrow(airquality)) model <- new(miceFast) model$set_data(data) -# Single imputation with linear model -model$update_var(2, model$impute("lm_pred", 2, 5)$imputations) - -# LDA for a categorical variable -model$update_var(3, model$impute("lda", 3, c(1, 2))$imputations) +# Single imputation with linear model (col 1 = Ozone) +model$update_var(1, model$impute("lm_pred", 1, 5)$imputations) -# Averaged multiple imputation (Bayesian, k=10 draws) -model$update_var(4, model$impute_N("lm_bayes", 4, c(1, 2, 3), k = 10)$imputations) +# Averaged multiple imputation for Solar.R (col 2, Bayesian, k=10 draws) +model$update_var(2, model$impute_N("lm_bayes", 2, c(1, 3, 4, 5), k = 10)$imputations) model$which_updated() +head(model$get_data(), 4) ``` ## With weights and groups @@ -565,17 +540,28 @@ summary(pool(fits)) # Generating Correlated Data with `corrData` -The `corrData` module generates correlated data for simulations: - -```r -# Constructors: -new(corrData, nr_cat, n_obs, means, cor_matrix) -new(corrData, n_obs, means, cor_matrix) +The `corrData` module generates correlated data for simulations. +This is useful for creating test datasets with known properties. 
+ +```{r corrdata-example} +# 3 continuous variables, 100 observations +means <- c(10, 20, 30) +cor_matrix <- matrix(c( + 1.0, 0.7, 0.3, + 0.7, 1.0, 0.5, + 0.3, 0.5, 1.0 +), nrow = 3) + +cd <- new(corrData, 100, means, cor_matrix) +sim_data <- cd$fill("contin") +round(cor(sim_data), 2) +``` -# Methods: -cd_obj$fill("contin") # continuous data -cd_obj$fill("binom") # binary data -cd_obj$fill("discrete") # multi-category discrete data +```{r corrdata-discrete} +# With 2 categorical variables: first argument is nr_cat +cd2 <- new(corrData, 2, 200, means, cor_matrix) +sim_discrete <- cd2$fill("discrete") +head(sim_discrete) ``` --- diff --git a/vignettes/missing-data-and-imputation.Rmd b/vignettes/missing-data-and-imputation.Rmd index 2b032af..6b1db27 100644 --- a/vignettes/missing-data-and-imputation.Rmd +++ b/vignettes/missing-data-and-imputation.Rmd @@ -35,6 +35,26 @@ library(ggplot2) set.seed(2025) ``` +## Quick-start: MI in 10 lines + +If you just want the recipe, here it is. The rest of this vignette explains +**why** each step matters. + +```{r quickstart} +data(air_miss) + +# 1. Impute m = 10 completed datasets +completed <- lapply(1:10, function(i) { + air_miss %>% + mutate(Ozone_imp = fill_NA(x = ., model = "lm_bayes", + posit_y = "Ozone", posit_x = c("Solar.R", "Wind", "Temp"))) +}) +# 2. Fit the analysis model on each +fits <- lapply(completed, function(d) lm(Ozone_imp ~ Wind + Temp, data = d)) +# 3. Pool using Rubin's rules +summary(pool(fits)) +``` + --- # Missing-Data Mechanisms @@ -265,6 +285,10 @@ identical imputations across datasets. ## Basic MI workflow +The three-step MI workflow uses `fill_NA()` in a loop, any model with `coef()`/`vcov()`, +and `pool()`. For more imputation examples (grouping, weights, data.table, OOP), +see the [Introduction vignette](miceFast-intro.html). 
+ ```{r mi-basic} data(air_miss) @@ -465,6 +489,7 @@ compare_imp(air_sensitivity, origin = "Ozone", Check whether results stabilize as *m* increases: ```{r sensitivity-m} +set.seed(2025) results <- data.frame() for (m_val in c(3, 5, 10, 20, 50)) { @@ -492,28 +517,36 @@ results %>% theme_minimal() ``` -## Comparing with naive imputation +## Comparing with base methods -Demonstrate how naive (mean) imputation underestimates uncertainty compared to -proper MI: +Always compare MI results against simple baselines. If conclusions differ, +investigate why — it may reveal problems with your imputation model or +indicate that the missing-data mechanism matters. -```{r sensitivity-naive} +```{r sensitivity-baselines} data(air_miss) +set.seed(2025) -# Naive: fill_NA with lm_pred + Intercept only = mean imputation -air_naive <- air_miss %>% - mutate( - Ozone_mean = fill_NA( - x = ., model = "lm_pred", - posit_y = "Ozone", - posit_x = "Intercept" - ) - ) - -fit_naive <- lm(Ozone_mean ~ Wind + Temp, data = air_naive) -ci_naive <- confint(fit_naive) - -# Proper MI +# 1. Complete cases (listwise deletion) — unbiased under MCAR +fit_cc <- lm(Ozone ~ Wind + Temp, data = air_miss[complete.cases(air_miss[, c("Ozone", "Wind", "Temp")]), ]) +ci_cc <- confint(fit_cc) + +# 2. Mean imputation — biased, underestimates variance +air_mean <- air_miss +air_mean$Ozone[is.na(air_mean$Ozone)] <- mean(air_mean$Ozone, na.rm = TRUE) +fit_mean <- lm(Ozone ~ Wind + Temp, data = air_mean) +ci_mean <- confint(fit_mean) + +# 3. Deterministic regression imputation (lm_pred) — no residual noise +air_pred <- air_miss %>% + mutate(Ozone_imp = fill_NA( + x = ., model = "lm_pred", + posit_y = "Ozone", posit_x = c("Solar.R", "Wind", "Temp") + )) +fit_pred <- lm(Ozone_imp ~ Wind + Temp, data = air_pred) +ci_pred <- confint(fit_pred) + +# 4. 
Proper MI with Rubin's rules (lm_bayes, m = 20) completed <- lapply(1:20, function(i) { air_miss %>% mutate(Ozone_imp = fill_NA( @@ -522,17 +555,29 @@ completed <- lapply(1:20, function(i) { )) }) fits <- lapply(completed, function(d) lm(Ozone_imp ~ Wind + Temp, data = d)) -pool_summary <- summary(pool(fits)) - -cat("Naive imputation CI for Wind:\n") -print(ci_naive["Wind", ]) - -cat("\nMI pooled CI for Wind:\n") -cat(sprintf(" 2.5%%: %.4f 97.5%%: %.4f\n", - pool_summary$conf.low[pool_summary$term == "Wind"], - pool_summary$conf.high[pool_summary$term == "Wind"])) +pool_s <- summary(pool(fits)) + +# Compare Wind coefficient across all methods +comparison <- data.frame( + method = c("Complete cases", "Mean imputation", "Regression (lm_pred)", "MI (lm_bayes, m=20)"), + estimate = c(coef(fit_cc)["Wind"], coef(fit_mean)["Wind"], + coef(fit_pred)["Wind"], pool_s$estimate[pool_s$term == "Wind"]), + ci_low = c(ci_cc["Wind", 1], ci_mean["Wind", 1], + ci_pred["Wind", 1], pool_s$conf.low[pool_s$term == "Wind"]), + ci_high = c(ci_cc["Wind", 2], ci_mean["Wind", 2], + ci_pred["Wind", 2], pool_s$conf.high[pool_s$term == "Wind"]), + n_used = c(nrow(air_miss[complete.cases(air_miss[, c("Ozone", "Wind", "Temp")]), ]), + nrow(air_miss), nrow(air_miss), nrow(air_miss)) +) +comparison$ci_width <- comparison$ci_high - comparison$ci_low +comparison ``` +Notice that mean imputation and deterministic regression produce artificially +narrow confidence intervals (they ignore imputation uncertainty), while +complete-case analysis uses fewer observations. MI properly reflects both +sources of uncertainty. + --- # Choosing the Number of Imputations @@ -554,17 +599,59 @@ stabilize (see the sensitivity analysis above). # Practical Checklist -Before imputing, examine the missingness pattern (`upset_NA()`) and check -collinearity (`VIF()` — values above 10 are problematic). Include auxiliary -variables that predict missingness even if they are not in the analysis model. +1. 
**Examine missingness patterns:** Use `upset_NA()` to visualize which variables are + jointly missing. + ```r + upset_NA(air_miss, 6) + ``` + +2. **Check collinearity:** Run `VIF()` on your predictor set. Drop or combine predictors + with VIF > 10. + ```r + VIF(air_miss, posit_y = "Ozone", posit_x = c("Solar.R", "Wind", "Temp")) + ``` + +3. **Choose imputation models:** `lm_bayes` for continuous, `lda` with random `ridge` + for categorical, `logreg = TRUE` for right-skewed variables. + +4. **Include auxiliary variables:** Add predictors of missingness to the imputation model + even if they are not in the analysis model — this makes MAR more plausible. + +5. **Set *m* ≥ 20.** Since miceFast is fast, there is little cost. Increase until + estimates and standard errors stabilize (see sensitivity analysis above). + +6. **Pool and report:** Use `pool()` for Rubin's rules. Report the imputation model, + *m*, pooled estimates, and confidence intervals. + +7. **Run sensitivity analyses:** Vary the model (`lm_bayes` vs `lm_noise` vs `pmm`), + vary *m*, and compare results. Check base methods too. -For continuous variables use `lm_bayes` or `lm_noise`; for categorical -variables use `lda` with a random `ridge`; for right-skewed data consider -`logreg = TRUE`. Use at least $m = 20$ imputations, pool with `pool()`, -and run sensitivity analyses varying the model and $m$. +## Von Hippel's two-stage rule for *m* -Report the imputation model, number of imputations, and any assumptions -about the missing-data mechanism. +Von Hippel (2020) proposed a two-stage procedure: run a pilot with small *m*, +estimate FMI from the pooled output, then use the FMI to calculate how many +imputations are needed for stable SE estimates. The R package +**howManyImputations** (available on CRAN) implements this calculation directly +from a `mice` mids object. 
For a miceFast workflow you can use the FMI from +`pool()` as input: + +```{r vonhippel} +# Pilot with m = 5 +set.seed(2025) +pilot <- lapply(1:5, function(i) { + air_miss %>% + mutate(Ozone_imp = fill_NA(x = ., model = "lm_bayes", + posit_y = "Ozone", posit_x = c("Solar.R", "Wind", "Temp"))) +}) +pilot_pool <- pool(lapply(pilot, function(d) lm(Ozone_imp ~ Wind + Temp, data = d))) + +# Inspect FMI — the key input for deciding m +data.frame(term = pilot_pool$term, fmi = round(pilot_pool$fmi, 3)) + +# For the exact formula and its derivation see: +# von Hippel (2020) "How many imputations do you need?", Sociological Methods & Research +# R implementation: install.packages("howManyImputations") +``` --- @@ -574,7 +661,7 @@ about the missing-data mechanism. |---------|------|----------| | MI framework | Complete (FCS/MICE algorithm) | Building blocks for MI | | Imputation models | 25+ built-in methods | `lm_pred`, `lm_bayes`, `lm_noise`, `lda`, `pmm` | -| Chained equations | Yes (iterative multivariate) | Manual chaining possible | +| Chained equations | Yes (iterative multivariate) | Single-pass sequential; not iterative FCS | | Speed | R-based | C++/Armadillo — significantly faster | | Grouping | Via `blocks` | Built-in, auto-sorted | | Weights | Limited | Full support | @@ -588,6 +675,15 @@ tooling. or when you want imputation inside `dplyr`/`data.table` pipelines. The two can be combined: use **miceFast** for imputation and **mice** for diagnostics, or vice versa. +**Important caveat:** miceFast fills variables in a single pass — each variable's +imputation conditions on the previously imputed columns but there is no iterative +cycling back (unlike mice's FCS algorithm, which iterates until convergence). +For datasets where the missing-data pattern is monotone or nearly so, a single pass +is often sufficient. 
With complex interlocking patterns, the single-pass approach +may introduce some bias because later variables are imputed conditional on +already-imputed (not yet converged) earlier variables. Consider using mice for +datasets with heavy non-monotone missingness. + --- # References