From 0acfe4d3bfb7154f0bd9298bd9f42e10c58ba3fd Mon Sep 17 00:00:00 2001 From: Maciej Nasinski Date: Wed, 25 Feb 2026 23:33:12 +0100 Subject: [PATCH] update vignettes --- README.md | 58 +++++++- vignettes/miceFast-intro.Rmd | 94 +++++------- vignettes/missing-data-and-imputation.Rmd | 170 +++++++++++++++++----- 3 files changed, 224 insertions(+), 98 deletions(-) diff --git a/README.md b/README.md index 22816c7..fc563d8 100644 --- a/README.md +++ b/README.md @@ -26,9 +26,9 @@ For performance details, see `performance_validity.R` in the `extdata` folder. - [Introduction and Advanced Usage](https://polkas.github.io/miceFast/articles/miceFast-intro.html) - [Missing Data Mechanisms and Multiple Imputation](https://polkas.github.io/miceFast/articles/missing-data-and-imputation.html) -## Multiple Imputation and mice +## Multiple Imputation Workflow -[mice](https://cran.r-project.org/package=mice) implements the full MI pipeline (impute, analyze, pool). **miceFast** focuses on the computationally expensive part: fitting the imputation models. Two usage modes: +[mice](https://cran.r-project.org/package=mice) implements the full MI pipeline (impute, analyze, pool). **miceFast** focuses on the computationally expensive part — fitting the imputation models — and is typically **~10× faster** than mice for the imputation step alone (see [benchmarks](#performance-highlights)). Two usage modes: 1. **MI with Rubin's rules** — call `fill_NA()` with a stochastic model (`lm_bayes`, `lm_noise`, or `lda` with a random `ridge`) in a loop to create *m* completed datasets, then `pool()` the fitted models. 
@@ -50,6 +50,8 @@ devtools::install_github("polkas/miceFast") ## Quick Example +### dplyr + ```r library(miceFast) library(dplyr) @@ -60,10 +62,7 @@ data(air_miss) # Visualize the NA structure upset_NA(air_miss, 6) -# Naive imputation (quick, but biased — see ?naive_fill_NA) -naive_fill_NA(air_miss) - -# Model-based single imputation with fill_NA +# Model-based single imputation air_miss %>% mutate(Ozone_imp = fill_NA( x = ., model = "lm_bayes", @@ -80,9 +79,46 @@ completed <- lapply(1:5, function(i) { }) fits <- lapply(completed, function(d) lm(Ozone_imp ~ Wind + Temp, data = d)) pool(fits) +#> Pooled results from 5 imputed datasets +#> Rubin's rules with Barnard-Rubin df adjustment +#> +#> term estimate std.error statistic df p.value +#> (Intercept) -62.771 23.9022 -2.626 46.95 1.162e-02 +#> Wind -3.087 0.6857 -4.502 37.24 6.420e-05 +#> Temp 1.736 0.2498 6.951 58.54 3.400e-09 +``` + +### data.table + +```r +library(miceFast) +library(data.table) + +set.seed(1234) +data(air_miss) +setDT(air_miss) + +# Single imputation +air_miss[, Ozone_imp := fill_NA( + x = .SD, model = "lm_bayes", + posit_y = "Ozone", posit_x = c("Solar.R", "Wind", "Temp") +)] + +# Grouped imputation — fits a separate model per group +air_miss[, Solar_R_imp := fill_NA( + x = .SD, model = "lm_bayes", + posit_y = "Solar.R", posit_x = c("Wind", "Temp", "Intercept") +), by = .(groups)] +``` + +### Naive imputation (baseline only) + +```r +# Quick baseline — biased, does not account for relationships between variables +naive_fill_NA(air_miss) ``` -See the [Introduction vignette](https://polkas.github.io/miceFast/articles/miceFast-intro.html) for grouped imputation, data.table syntax, the OOP interface, and more. +See the [Introduction vignette](https://polkas.github.io/miceFast/articles/miceFast-intro.html) for weights, the OOP interface, log-transformations, and more. 
--- @@ -113,6 +149,14 @@ See the [Introduction vignette](https://polkas.github.io/miceFast/articles/miceF --- +## Practical Advice + +- **Little missing data + MCAR?** Consider using `complete.cases()` — listwise deletion is unbiased under MCAR and may be sufficient when the fraction of incomplete rows is small. +- **For publication**, always run a **sensitivity analysis**: compare MI results against base methods (`complete.cases()`, mean imputation) and across different imputation models (`lm_bayes`, `lm_noise`, `pmm`). Vary the number of imputations. If conclusions change, investigate why. Report the imputation model, *m*, and any assumptions about the missing-data mechanism. +- See the [MI vignette](https://polkas.github.io/miceFast/articles/missing-data-and-imputation.html) for details on MCAR/MAR/MNAR mechanisms and a practical checklist. + +--- + ## Performance Highlights Median timings on 100k rows, 10 variables, 100 groups (R 4.4.3, macOS M3 Pro, [optimized BLAS/LAPACK](https://cran.r-project.org/bin/macosx/RMacOSX-FAQ.html#Which-BLAS-is-used-and-how-can-it-be-changed_003f)): diff --git a/vignettes/miceFast-intro.Rmd b/vignettes/miceFast-intro.Rmd index 39f317e..2c7e7e5 100644 --- a/vignettes/miceFast-intro.Rmd +++ b/vignettes/miceFast-intro.Rmd @@ -72,7 +72,9 @@ upset_NA(air_miss, 6) ## Checking for Collinearity -Before imputing, check Variance Inflation Factors. Values above ~10 suggest problematic collinearity. +Before imputing, check Variance Inflation Factors. Values above ~10 suggest +problematic collinearity that can destabilize imputation models — consider +dropping or combining redundant predictors before imputing. ```{r vif} VIF( @@ -140,7 +142,8 @@ head(result_grouped[, c("Solar.R", "Solar_R_imp", "groups")]) ## Log-transformation -For right-skewed variables (like Ozone), use `logreg = TRUE` to impute on the log scale: +For right-skewed variables (like Ozone), use `logreg = TRUE` to impute on the log scale. 
+The model fits on $\log(y)$ and back-transforms the predictions:
 
 ```{r fill-na-logreg}
 data(air_miss)
@@ -153,11 +156,15 @@ result_log <- air_miss %>%
     posit_x = c("Solar.R", "Wind", "Temp", "Intercept"),
     logreg = TRUE
   ))
+
+# Compare distributions: log imputation avoids negative values
+summary(result_log[c("Ozone", "Ozone_imp")])
 ```
 
 ## Using column position indices
 
-You can refer to columns by position instead of name:
+You can refer to columns by position instead of name.
+Check `names(air_miss)` to find the right positions:
 
 ```{r fill-na-position}
 data(air_miss)
@@ -170,7 +177,9 @@ result_pos <- air_miss %>%
     posit_x = c(4, 6),
     logreg = TRUE
   ))
-```
+
+head(result_pos[, c("Ozone", "Ozone_imp")])
+```
 
 ## Basic usage (data.table)
 
@@ -309,37 +318,6 @@ pool_res
 summary(pool_res)
 ```
 
-## MI with continuous and categorical variables
-
-For a complete MI workflow imputing both continuous and categorical variables:
-
-```{r mi-mixed}
-data(air_miss)
-
-impute_data <- function(data) {
-  data %>%
-    mutate(
-      Solar_R_imp = fill_NA(
-        x = ., model = "lm_bayes",
-        posit_y = "Solar.R",
-        posit_x = c("Wind", "Temp", "Intercept"),
-        w = weights
-      ),
-      Ozone_chac_imp = fill_NA(
-        x = ., model = "lda",
-        posit_y = "Ozone_chac",
-        posit_x = c("Wind", "Temp"),
-        ridge = runif(1, 0, 50) # random ridge makes LDA stochastic
-      )
-    )
-}
-
-set.seed(42)
-res <- replicate(n = 5, expr = impute_data(air_miss), simplify = FALSE)
-fits <- lapply(res, function(d) lm(Solar_R_imp ~ Wind + Temp, data = d))
-pool(fits)
-```
-
 ---
 
 # Full Imputation: Filling All Variables and MI with Rubin's Rules
@@ -469,21 +447,19 @@ use `get_index()` to recover the original row order.
## Simple example -```{r oop-simple, eval=requireNamespace("mice", quietly=TRUE)} -data <- cbind(as.matrix(mice::nhanes), intercept = 1, index = 1:nrow(mice::nhanes)) +```{r oop-simple} +data <- cbind(as.matrix(airquality[, 1:4]), intercept = 1, index = 1:nrow(airquality)) model <- new(miceFast) model$set_data(data) -# Single imputation with linear model -model$update_var(2, model$impute("lm_pred", 2, 5)$imputations) - -# LDA for a categorical variable -model$update_var(3, model$impute("lda", 3, c(1, 2))$imputations) +# Single imputation with linear model (col 1 = Ozone) +model$update_var(1, model$impute("lm_pred", 1, 5)$imputations) -# Averaged multiple imputation (Bayesian, k=10 draws) -model$update_var(4, model$impute_N("lm_bayes", 4, c(1, 2, 3), k = 10)$imputations) +# Averaged multiple imputation for Solar.R (col 2, Bayesian, k=10 draws) +model$update_var(2, model$impute_N("lm_bayes", 2, c(1, 3, 4, 5), k = 10)$imputations) model$which_updated() +head(model$get_data(), 4) ``` ## With weights and groups @@ -565,17 +540,28 @@ summary(pool(fits)) # Generating Correlated Data with `corrData` -The `corrData` module generates correlated data for simulations: - -```r -# Constructors: -new(corrData, nr_cat, n_obs, means, cor_matrix) -new(corrData, n_obs, means, cor_matrix) +The `corrData` module generates correlated data for simulations. +This is useful for creating test datasets with known properties. 
+ +```{r corrdata-example} +# 3 continuous variables, 100 observations +means <- c(10, 20, 30) +cor_matrix <- matrix(c( + 1.0, 0.7, 0.3, + 0.7, 1.0, 0.5, + 0.3, 0.5, 1.0 +), nrow = 3) + +cd <- new(corrData, 100, means, cor_matrix) +sim_data <- cd$fill("contin") +round(cor(sim_data), 2) +``` -# Methods: -cd_obj$fill("contin") # continuous data -cd_obj$fill("binom") # binary data -cd_obj$fill("discrete") # multi-category discrete data +```{r corrdata-discrete} +# With 2 categorical variables: first argument is nr_cat +cd2 <- new(corrData, 2, 200, means, cor_matrix) +sim_discrete <- cd2$fill("discrete") +head(sim_discrete) ``` --- diff --git a/vignettes/missing-data-and-imputation.Rmd b/vignettes/missing-data-and-imputation.Rmd index 2b032af..6b1db27 100644 --- a/vignettes/missing-data-and-imputation.Rmd +++ b/vignettes/missing-data-and-imputation.Rmd @@ -35,6 +35,26 @@ library(ggplot2) set.seed(2025) ``` +## Quick-start: MI in 10 lines + +If you just want the recipe, here it is. The rest of this vignette explains +**why** each step matters. + +```{r quickstart} +data(air_miss) + +# 1. Impute m = 10 completed datasets +completed <- lapply(1:10, function(i) { + air_miss %>% + mutate(Ozone_imp = fill_NA(x = ., model = "lm_bayes", + posit_y = "Ozone", posit_x = c("Solar.R", "Wind", "Temp"))) +}) +# 2. Fit the analysis model on each +fits <- lapply(completed, function(d) lm(Ozone_imp ~ Wind + Temp, data = d)) +# 3. Pool using Rubin's rules +summary(pool(fits)) +``` + --- # Missing-Data Mechanisms @@ -265,6 +285,10 @@ identical imputations across datasets. ## Basic MI workflow +The three-step MI workflow uses `fill_NA()` in a loop, any model with `coef()`/`vcov()`, +and `pool()`. For more imputation examples (grouping, weights, data.table, OOP), +see the [Introduction vignette](miceFast-intro.html). 
+ ```{r mi-basic} data(air_miss) @@ -465,6 +489,7 @@ compare_imp(air_sensitivity, origin = "Ozone", Check whether results stabilize as *m* increases: ```{r sensitivity-m} +set.seed(2025) results <- data.frame() for (m_val in c(3, 5, 10, 20, 50)) { @@ -492,28 +517,36 @@ results %>% theme_minimal() ``` -## Comparing with naive imputation +## Comparing with base methods -Demonstrate how naive (mean) imputation underestimates uncertainty compared to -proper MI: +Always compare MI results against simple baselines. If conclusions differ, +investigate why — it may reveal problems with your imputation model or +indicate that the missing-data mechanism matters. -```{r sensitivity-naive} +```{r sensitivity-baselines} data(air_miss) +set.seed(2025) -# Naive: fill_NA with lm_pred + Intercept only = mean imputation -air_naive <- air_miss %>% - mutate( - Ozone_mean = fill_NA( - x = ., model = "lm_pred", - posit_y = "Ozone", - posit_x = "Intercept" - ) - ) - -fit_naive <- lm(Ozone_mean ~ Wind + Temp, data = air_naive) -ci_naive <- confint(fit_naive) - -# Proper MI +# 1. Complete cases (listwise deletion) — unbiased under MCAR +fit_cc <- lm(Ozone ~ Wind + Temp, data = air_miss[complete.cases(air_miss[, c("Ozone", "Wind", "Temp")]), ]) +ci_cc <- confint(fit_cc) + +# 2. Mean imputation — biased, underestimates variance +air_mean <- air_miss +air_mean$Ozone[is.na(air_mean$Ozone)] <- mean(air_mean$Ozone, na.rm = TRUE) +fit_mean <- lm(Ozone ~ Wind + Temp, data = air_mean) +ci_mean <- confint(fit_mean) + +# 3. Deterministic regression imputation (lm_pred) — no residual noise +air_pred <- air_miss %>% + mutate(Ozone_imp = fill_NA( + x = ., model = "lm_pred", + posit_y = "Ozone", posit_x = c("Solar.R", "Wind", "Temp") + )) +fit_pred <- lm(Ozone_imp ~ Wind + Temp, data = air_pred) +ci_pred <- confint(fit_pred) + +# 4. 
Proper MI with Rubin's rules (lm_bayes, m = 20) completed <- lapply(1:20, function(i) { air_miss %>% mutate(Ozone_imp = fill_NA( @@ -522,17 +555,29 @@ completed <- lapply(1:20, function(i) { )) }) fits <- lapply(completed, function(d) lm(Ozone_imp ~ Wind + Temp, data = d)) -pool_summary <- summary(pool(fits)) - -cat("Naive imputation CI for Wind:\n") -print(ci_naive["Wind", ]) - -cat("\nMI pooled CI for Wind:\n") -cat(sprintf(" 2.5%%: %.4f 97.5%%: %.4f\n", - pool_summary$conf.low[pool_summary$term == "Wind"], - pool_summary$conf.high[pool_summary$term == "Wind"])) +pool_s <- summary(pool(fits)) + +# Compare Wind coefficient across all methods +comparison <- data.frame( + method = c("Complete cases", "Mean imputation", "Regression (lm_pred)", "MI (lm_bayes, m=20)"), + estimate = c(coef(fit_cc)["Wind"], coef(fit_mean)["Wind"], + coef(fit_pred)["Wind"], pool_s$estimate[pool_s$term == "Wind"]), + ci_low = c(ci_cc["Wind", 1], ci_mean["Wind", 1], + ci_pred["Wind", 1], pool_s$conf.low[pool_s$term == "Wind"]), + ci_high = c(ci_cc["Wind", 2], ci_mean["Wind", 2], + ci_pred["Wind", 2], pool_s$conf.high[pool_s$term == "Wind"]), + n_used = c(nrow(air_miss[complete.cases(air_miss[, c("Ozone", "Wind", "Temp")]), ]), + nrow(air_miss), nrow(air_miss), nrow(air_miss)) +) +comparison$ci_width <- comparison$ci_high - comparison$ci_low +comparison ``` +Notice that mean imputation and deterministic regression produce artificially +narrow confidence intervals (they ignore imputation uncertainty), while +complete-case analysis uses fewer observations. MI properly reflects both +sources of uncertainty. + --- # Choosing the Number of Imputations @@ -554,17 +599,59 @@ stabilize (see the sensitivity analysis above). # Practical Checklist -Before imputing, examine the missingness pattern (`upset_NA()`) and check -collinearity (`VIF()` — values above 10 are problematic). Include auxiliary -variables that predict missingness even if they are not in the analysis model. +1. 
**Examine missingness patterns:** Use `upset_NA()` to visualize which variables are + jointly missing. + ```r + upset_NA(air_miss, 6) + ``` + +2. **Check collinearity:** Run `VIF()` on your predictor set. Drop or combine predictors + with VIF > 10. + ```r + VIF(air_miss, posit_y = "Ozone", posit_x = c("Solar.R", "Wind", "Temp")) + ``` + +3. **Choose imputation models:** `lm_bayes` for continuous, `lda` with random `ridge` + for categorical, `logreg = TRUE` for right-skewed variables. + +4. **Include auxiliary variables:** Add predictors of missingness to the imputation model + even if they are not in the analysis model — this makes MAR more plausible. + +5. **Set *m* ≥ 20.** Since miceFast is fast, there is little cost. Increase until + estimates and standard errors stabilize (see sensitivity analysis above). + +6. **Pool and report:** Use `pool()` for Rubin's rules. Report the imputation model, + *m*, pooled estimates, and confidence intervals. + +7. **Run sensitivity analyses:** Vary the model (`lm_bayes` vs `lm_noise` vs `pmm`), + vary *m*, and compare results. Check base methods too. -For continuous variables use `lm_bayes` or `lm_noise`; for categorical -variables use `lda` with a random `ridge`; for right-skewed data consider -`logreg = TRUE`. Use at least $m = 20$ imputations, pool with `pool()`, -and run sensitivity analyses varying the model and $m$. +## Von Hippel's two-stage rule for *m* -Report the imputation model, number of imputations, and any assumptions -about the missing-data mechanism. +Von Hippel (2020) proposed a two-stage procedure: run a pilot with small *m*, +estimate FMI from the pooled output, then use the FMI to calculate how many +imputations are needed for stable SE estimates. The R package +**howManyImputations** (available on CRAN) implements this calculation directly +from a `mice` mids object. 
For a miceFast workflow you can use the FMI from +`pool()` as input: + +```{r vonhippel} +# Pilot with m = 5 +set.seed(2025) +pilot <- lapply(1:5, function(i) { + air_miss %>% + mutate(Ozone_imp = fill_NA(x = ., model = "lm_bayes", + posit_y = "Ozone", posit_x = c("Solar.R", "Wind", "Temp"))) +}) +pilot_pool <- pool(lapply(pilot, function(d) lm(Ozone_imp ~ Wind + Temp, data = d))) + +# Inspect FMI — the key input for deciding m +data.frame(term = pilot_pool$term, fmi = round(pilot_pool$fmi, 3)) + +# For the exact formula and its derivation see: +# von Hippel (2020) "How many imputations do you need?", Sociological Methods & Research +# R implementation: install.packages("howManyImputations") +``` --- @@ -574,7 +661,7 @@ about the missing-data mechanism. |---------|------|----------| | MI framework | Complete (FCS/MICE algorithm) | Building blocks for MI | | Imputation models | 25+ built-in methods | `lm_pred`, `lm_bayes`, `lm_noise`, `lda`, `pmm` | -| Chained equations | Yes (iterative multivariate) | Manual chaining possible | +| Chained equations | Yes (iterative multivariate) | Single-pass sequential; not iterative FCS | | Speed | R-based | C++/Armadillo — significantly faster | | Grouping | Via `blocks` | Built-in, auto-sorted | | Weights | Limited | Full support | @@ -588,6 +675,15 @@ tooling. or when you want imputation inside `dplyr`/`data.table` pipelines. The two can be combined: use **miceFast** for imputation and **mice** for diagnostics, or vice versa. +**Important caveat:** miceFast fills variables in a single pass — each variable's +imputation conditions on the previously imputed columns but there is no iterative +cycling back (unlike mice's FCS algorithm, which iterates until convergence). +For datasets where the missing-data pattern is monotone or nearly so, a single pass +is often sufficient. 
With complex interlocking patterns, the single-pass approach +may introduce some bias because later variables are imputed conditional on +already-imputed (not yet converged) earlier variables. Consider using mice for +datasets with heavy non-monotone missingness. + --- # References