diff --git a/.Rbuildignore b/.Rbuildignore
index a8fb882..8721688 100644
--- a/.Rbuildignore
+++ b/.Rbuildignore
@@ -5,3 +5,6 @@
^docs$
^pkgdown$
^\.github$
+^README\.Rmd$
+^doc$
+^Meta$
diff --git a/.gitignore b/.gitignore
index 457525e..54f27cb 100644
--- a/.gitignore
+++ b/.gitignore
@@ -5,3 +5,6 @@
.DS_Store
.quarto
docs
+inst/doc
+/doc/
+/Meta/
diff --git a/DESCRIPTION b/DESCRIPTION
index 45a37b1..d99ccdd 100644
--- a/DESCRIPTION
+++ b/DESCRIPTION
@@ -1,6 +1,6 @@
Package: lighthouse.codebook
Title: Summarize Datasets for Lighthouse Institute Projects
-Version: 0.2.3
+Version: 0.3.0
Authors@R: c(
person("Casey", "Sarapas",
email = "ccsarapas@chestnut.org",
@@ -35,3 +35,7 @@ Roxygen: list(markdown = TRUE)
RoxygenNote: 7.3.3
URL: https://github.com/ccsarapas/lighthouse.codebook, https://ccsarapas.github.io/lighthouse.codebook/
BugReports: https://github.com/ccsarapas/lighthouse.codebook/issues
+Suggests:
+ knitr,
+ rmarkdown
+VignetteBuilder: knitr
diff --git a/NAMESPACE b/NAMESPACE
index 7a19b66..af707b1 100644
--- a/NAMESPACE
+++ b/NAMESPACE
@@ -4,7 +4,9 @@ S3method(nan_to_na,data.frame)
S3method(nan_to_na,default)
S3method(nan_to_na,list)
export(cb_create)
+export(cb_create_options)
export(cb_create_redcap)
+export(cb_create_redcap_options)
export(cb_create_spss)
export(cb_get_data)
export(cb_summarize_categorical)
diff --git a/NEWS.md b/NEWS.md
index d928098..5e0ed4d 100644
--- a/NEWS.md
+++ b/NEWS.md
@@ -1,3 +1,63 @@
+# lighthouse.codebook 0.3.0
+
+## Added
+
+* Added options to `cb_write()` to show grouping variables for categorical summaries
+ in rows (which was previously only possible for numeric summaries). `cb_write()`
+ now includes three arguments for showing some or all grouping variables in rows:
+ `group_rows` controls both numeric and categorical summaries, while `group_rows_numeric`
+ and `group_rows_categorical` control numeric and categorical summaries, respectively.
+
+* Added an introductory vignette (see `vignette("lighthouse-codebook")`).
+
+* Expanded the README.
+
+## Changed
+
+* `cb_create()`, `cb_create_spss()`, and `cb_create_redcap()` now use a single `.options`
+ argument for less commonly used settings. Arguments for those settings have been
+ moved into an options object created with `cb_create_options()` or `cb_create_redcap_options()`.
+ ```r
+ # previously
+ cb <- cb_create(
+ dat, metadata = metadata,
+ .rmv_html = FALSE, .include_r_classes = TRUE
+ )
+
+ # now
+ cb <- cb_create(
+ dat, metadata = metadata,
+ .options = cb_create_options(rmv_html = FALSE, include_r_classes = TRUE)
+ )
+
+ ### `cb_create_spss()` also uses `cb_create_options()`
+ # previously
+ cb_spss <- cb_create_spss(dat_spss, .rmv_line_breaks = FALSE)
+
+ # now
+ cb_spss <- cb_create_spss(
+ dat_spss,
+ .options = cb_create_options(rmv_line_breaks = FALSE)
+ )
+
+ ### Note that `cb_create_redcap()` has its own options constructor
+ # previously
+ cb <- cb_create_redcap(
+ dat_rc, metadata = metadata_rc,
+ .form = NULL, .rmv_html = FALSE
+ )
+
+ # now
+ cb <- cb_create_redcap(
+ dat_rc, metadata = metadata_rc,
+ .options = cb_create_redcap_options(form = NULL, rmv_html = FALSE)
+ )
+ ```
+
+## Removed
+
+* The `format = "values"` option in `cb_get_data()` has been removed (see #26).
+
# lighthouse.codebook 0.2.3
* `cb_create_spss()` now accepts `.rmv_html` and `.rmv_line_breaks` arguments, consistent
diff --git a/R/cb_create.r b/R/cb_create.r
index db26640..7e0fdab 100644
--- a/R/cb_create.r
+++ b/R/cb_create.r
@@ -8,8 +8,8 @@
#' [`cb_summarize_text()`]).
#'
#' @param data A data frame.
-#' @param metadata A data frame containing metadata, such as variable labels and value
-#' labels.
+#' @param metadata A data frame containing metadata, such as variable labels and
+#' value labels.
#' @param ... Additional columns from `metadata` to preserve in the final codebook.
#' New names can be assigned by passing named arguments. Columns for variable
#' name, form, variable label, and value labels are included by default.
@@ -23,47 +23,33 @@
#' right-hand side. If left-hand side is omitted, defaults to `tidyselect::everything()`.
#' See "Specifying user missing values" below for examples.
#' @param .split_var_labels A [`tidyselect`][dplyr_tidy_select] expression or list of tidyselect
-#' expressions, indicating (sets of) variable labels with a common stem that should
+#' expressions, indicating (sets of) variable labels with a common stem that should
#' be extracted into a separate column.
-#' @param .include_r_classes Include a column listing class(es) of each variable?
-#' (e.g., `"factor"`, `"POSIXct, POSIXt"`.)
-#' @param .include_types Include a column listing simplified type for each variable?
-#' (e.g,. `"categorical"`, `"date-time"`.)
#' @param .val_labs_sep1,.val_labs_sep2 Regex patterns separating value labels
-#' in `metadata`. `.val_labs_sep1` separates values from labels, and `.val_labs_sep2`
-#' separates value/label pairs. e.g., if value labels are in format `"1, First label|2, Second label"`,
-#' set `.val_labs_sep1` to `","` and `.val_labs_sep2` to `"\\|"`.
-#' @param .rmv_html Should HTML tags be removed from metadata (e.g., from variable
-#' and value labels)?
-#' @param .rmv_line_breaks Should line breaks be removed from metadata (e.g., from
-#' variable and value labels)? If `TRUE`, line breaks will be replaced with `" / "`.
-#' @param .user_missing_col Include value labels for user missing values in a separate
-#' column? The default, `"if_any"`, adds the column only if user missings are
-#' specified for at least one variable.
-#' @param .user_missing_conflict If different labels for a value are provided in
-#' metadata and user missings, which should be used?
-#' @param .user_missing_incompatible How to handle variables specified in `.user_missing`
-#' that aren't compatible with user missing values (e.g., logical, Date, or POSIXt)?
-#'
+#' in `metadata`. `.val_labs_sep1` separates values from labels, and `.val_labs_sep2`
+#' separates value/label pairs from one another. e.g., if value labels are in
+#' the format `"1, First label|2, Second label"`, set `.val_labs_sep1` to `","`
+#' and `.val_labs_sep2` to `"\\|"`.
+#' @param .options Additional options to use for codebook creation. Must be the result
+#' from a call to `cb_create_options()`. See that function's help page for available
+#' options.
+#'
#' @return
-#' An `"li_codebook"` object, consisting of (1) a tibble summarizing the passed
-#' dataset and (2) attributes containing the passed dataset (in several formats)
-#' and additional metadata. Specifically:
-#' - A tibble with columns:
-#' - `name`: variable name
-#' - `type`: optional column containing simplified variable type
-#' - `class`: optional column containing class(es) of each variable
-#' - `label_stem`: optional column containing variable label stems, if any variables
-#' are specified in `.split_var_labels`
-#' - `label`: variable label
-#' - `values`: values, with labels if applicable
-#' - `user_missing`: optional column, depending on value of `.user_missing_col`,
-#' showing user missing values, with labels if applicable
-#' - `missing`: proportion missing
-#' - additional columns if specified in `...`
-#' - Attributes:
-#' - Transformed versions of the passed dataset. See [`cb_get_data()`]
-#' - Lookup tables and other metadata used internally.
+#' An `"li_codebook"` object, consisting of a tibble summarizing the passed
+#' dataset and attributes containing additional metadata. The tibble includes columns:
+#' - `name`: variable name
+#' - `type`: column containing simplified variable type
+#' - `class`: optional column containing class(es) of each variable
+#' - `label_stem`: optional column containing variable label stems, if any variables
+#' are specified in `.split_var_labels`
+#' - `label`: variable label
+#' - `values`: values, with labels if applicable
+#' - `user_missing`: optional column showing user missing values, with labels
+#' if applicable. By default, this column is included only if user missings
+#' are specified for at least one variable. This behavior can be changed using
+#' the `user_missing_col` argument to `cb_create_options()`.
+#' - `missing`: proportion missing
+#' - additional columns if specified in `...`
#'
#' @section Specifying user missing values:
#' User missing values are defined by passing a formula or list of formulas to the
@@ -72,7 +58,7 @@
#' \preformatted{
#' cb <- cb_create(data, metadata, .user_missing = var1 ~ 99)
#' }
-#' The same user missings can be applied to multiple variables using [tidyselect][dplyr_tidy_select]
+#' The same user missings can be applied to multiple variables using [tidyselect][dplyr_tidy_select]
#' expressions.
#' \preformatted{
#' # for variables `var1` through `var5`
@@ -80,7 +66,7 @@
#'
#' # for all numeric variables, plus `var6` and `var7`
#' .user_missing = c(where(is.numeric), var6, var7) ~ c(-9, -8, -7)
-#'
+#'
#' # omitted left-hand side defaults to `tidyselect::everything()`
#' .user_missing = ~ -99
#' }
@@ -96,14 +82,16 @@
#' \preformatted{
#' .user_missing = ~ c(Declined = -98, "Not applicable" = -99)
#' }
-#' If labels set in `.user_missing` conflict with those in `metadata`, `.user_missing_conflict`
-#' controls which labels are used.
-#'
-#' User missing values are not compatible with logical, date, or datetime (POSIXt)
-#' variables. By default, these variables will be ignored if specified in `.user_missing`.
-#' (i.e., user missing values will be applied only to compatible variables.) This behavior
-#' can be changed using the `.user_missing_incompatible` argument.
-#'
+#' If labels set in `.user_missing` conflict with those in `metadata`, the `user_missing_conflict`
+#' argument to `cb_create_options()` controls which labels are used.
+#'
+#' User missings may be set for numeric, character, factor/ordered factor, and haven_labelled/haven_labelled_spss
+#' vectors. For factors, user missings are set based on factor labels (not the underlying
+#' integer codes). For `"haven_labelled"` vectors, user missings are set based on
+#' values (not value labels). By default, variables with incompatible classes (e.g.,
+#' logical, Date, POSIXt) will be ignored if specified in `.user_missing`. This
+#' behavior can be changed using the `user_missing_incompatible` argument to `cb_create_options()`.
+#'
#' @examples
#' diamonds2 <- ggplot2::diamonds |>
#' transform(
@@ -115,18 +103,18 @@
#' right = FALSE
#' ))
#' )
-#'
+#'
#' # basic codebook
#' cb_create(diamonds2)
-#'
+#'
#' # convert variables to factor to treat as categorical
#' diamonds2 |>
#' transform(
#' carat_group = factor(carat_group),
#' price_group = factor(price_group)
-#' ) |>
+#' ) |>
#' cb_create()
-#'
+#'
#' # provide metadata for variable and value labels
#' diamonds_meta <- data.frame(
#' name = names(diamonds2),
@@ -151,7 +139,7 @@
#' "1 = <$500; 2 = $500-$999; 3 = $1,000-$1,999; 4 = $2,000-$4,999; 5 = $5,000-$9,999; 6 = $10,000+"
#' )
#' )
-#'
+#'
#' cb_create(
#' diamonds2, diamonds_meta,
#' .val_labs_sep1 = " = ", .val_labs_sep2 = "; "
@@ -165,40 +153,80 @@ cb_create <- function(data,
.val_labels = val_labels,
.user_missing = NULL,
.split_var_labels = NULL,
- .include_types = !.include_r_classes,
- .include_r_classes = FALSE,
.val_labs_sep1 = NULL,
.val_labs_sep2 = NULL,
- .rmv_html = TRUE,
- .rmv_line_breaks = TRUE,
- .user_missing_col = c("if_any", "yes", "no"),
- .user_missing_conflict = c("metadata", "missing_label"),
- .user_missing_incompatible = c("ignore", "warn", "error")
- ) {
+ .options = cb_create_options()) {
+ check_options(.options)
data |>
cb_init(
metadata,
meta_var_name = {{ .name }}, meta_var_label = {{ .var_label }},
meta_val_labels = {{ .val_labels }}, ...
) |>
- cb_clean_fields(rmv_html = .rmv_html, rmv_line_breaks = .rmv_line_breaks) |>
+ cb_clean_fields(
+ rmv_html = .options$rmv_html,
+ rmv_line_breaks = .options$rmv_line_breaks
+ ) |>
cb_user_missings(
user_missing = .user_missing,
- incompatible = .user_missing_incompatible
+ incompatible = .options$user_missing_incompatible
) |>
cb_add_lookups(sep1 = .val_labs_sep1, sep2 = .val_labs_sep2) |>
- cb_label_data(conflict = .user_missing_conflict) |>
+ cb_label_data(conflict = .options$user_missing_conflict) |>
cb_zap_data() |>
cb_add_dims() |>
- cb_add_val_labels_col(user_missing_col = .user_missing_col) |>
+ cb_add_val_labels_col(user_missing_col = .options$user_missing_col) |>
cb_add_type_col(
- include_r_classes = .include_r_classes,
- include_types = .include_types
+ include_r_classes = .options$include_r_classes,
+ include_types = .options$include_types
) |>
cb_add_missing_col() |>
cb_split_labels_col(split_var_labels = rlang::enexpr(.split_var_labels))
}
+#' Additional options for codebook creation
+#'
+#' @description
+#' Additional options for use by `cb_create()`, `cb_create_spss()`, and `cb_create_redcap()`.
+#'
+#' @inheritParams rlang::args_dots_empty
+#' @param include_types Include a column listing simplified type for each variable?
+#' (e.g., `"categorical"`, `"date-time"`.)
+#' @param include_r_classes Include a column listing class(es) of each variable?
+#' (e.g., `"factor"`, `"POSIXct, POSIXt"`.)
+#' @param rmv_html Should HTML tags be removed from metadata (e.g., from variable
+#' and value labels)?
+#' @param rmv_line_breaks Should line breaks be removed from metadata (e.g., from
+#' variable and value labels)? If `TRUE`, line breaks will be replaced with `" / "`.
+#' @param user_missing_col Include value labels for user missing values in a separate
+#' column? The default, `"if_any"`, adds the column only if user missings are
+#' specified for at least one variable.
+#' @param user_missing_conflict If labels passed to `.user_missing` conflict with
+#' value labels in metadata, which should be used?
+#' @param user_missing_incompatible How to handle variables specified in `.user_missing`
+#' that aren't compatible with user missing values (e.g., logical, Date, or POSIXt)?
+#'
+#' @export
+cb_create_options <- function(
+ ...,
+ include_types = TRUE,
+ include_r_classes = FALSE,
+ rmv_html = TRUE,
+ rmv_line_breaks = TRUE,
+ user_missing_col = c("if_any", "yes", "no"),
+ user_missing_conflict = c("val_label", "missing_label"),
+ user_missing_incompatible = c("ignore", "warn", "error")) {
+ rlang::check_dots_empty()
+ out <- list(
+ include_types = include_types, include_r_classes = include_r_classes,
+ rmv_html = rmv_html, rmv_line_breaks = rmv_line_breaks,
+ user_missing_col = user_missing_col,
+ user_missing_conflict = user_missing_conflict,
+ user_missing_incompatible = user_missing_incompatible
+ )
+ structure(out, class = "cb_create_options")
+}
+
#' Extract data from a codebook object
#'
#' Codebook objects created by [`cb_create()`] and friends contain several transformed
@@ -206,27 +234,37 @@ cb_create <- function(data,
#'
#' @param cb An object of class `"li_codebook"` as produced by [`cb_create()`] or
#' a variant.
-#' @param format Format of the returned data; see below for details.
+#' @param format Format of the returned data, either `"factors"` or `"haven"`;
+#' see below for details.
#'
#' @return
#' A tibble with variables formatted based on the `format` argument.
-#' - For `format = "values"`, all variables retain the same values as the original
-#' dataset, including values for user missings. The data may reflect transformations
-#' made by variants of [`cb_create()`] -- e.g., for [`cb_create_redcap()`], integer coercion
-#' and propagation of user missings across checkbox variables.
-#' - For `"haven"`, value labels and user missings are encoded using class
-#' [`"haven_labelled"`][haven::labelled]`
#' - For `"factors"`, all variables with value labels are converted to factors,
#' and all user missings are converted to `NA`.
+#' - For `"haven"`, variable labels, value labels, and user missings are encoded
+#' using class [`"haven_labelled_spss"`][haven::labelled].
+#'
+#' Both formats may also reflect transformations made by variants of [`cb_create()`].
+#' In particular, data from codebooks created using [`cb_create_redcap()`] may reflect
+#' integer coercion and propagation of user missings across checkbox variables.
#'
#' @export
-cb_get_data <- function(cb, format = c("factors", "haven", "values")) {
+cb_get_data <- function(cb, format = c("factors", "haven")) {
check_codebook(cb)
- switch(match.arg(format),
- factors = attr(cb, "data_zapped"),
- haven = attr(cb, "data_labelled"),
- values = attr(cb, "data")
+ tryCatch(
+ format <- match.arg(format),
+ error = \(e) {
+ if (format == "values") {
+ cli::cli_abort(
+ '`format = "values"` is no longer supported.',
+ call = parent.frame(4)
+ )
+ }
+ stop(e)
+ }
)
+ if (format == "factors") attr(cb, "data_zapped")
+ else attr(cb, "data_labelled")
}
cb_init <- function(data,
@@ -254,8 +292,8 @@ cb_init <- function(data,
out <- out |>
dplyr::mutate(values = NA_character_)
}
+ out <- structure(out, class = c("li_codebook", class(out)))
out <- set_attrs(out, data = data)
- class(out) <- c("li_codebook", class(out))
out
}
@@ -293,7 +331,7 @@ cb_user_missings_by_var <- function(cb,
)
if (length(bad_vars) > 4) bad_vars <- c(head(bad_vars, 3), "...")
bad_vars <- paste(bad_vars, collapse = ", ")
- msg <- "{n_bad} variable{?s} specified in {.arg .user_missing} are not compatible with user missing values"
+ msg <- "{n_bad} variable{?s} specified in {.arg .user_missing} {?is/are} not compatible with user missing values"
if (incompatible == "error") cli::cli_abort(c("!" = msg, "*" = bad_vars))
cli::cli_warn(c("!" = paste0(msg, " and will be ignored"), "*" = bad_vars))
}
@@ -426,7 +464,7 @@ cb_add_lookups <- function(cb, sep1, sep2) {
reconcile_missing_labels <- function(val_labs,
missings,
- conflict = c("metadata", "missing_label")) {
+ conflict = c("val_label", "missing_label")) {
conflict <- match.arg(conflict)
labs_in_missing <- val_labs[match(missings, val_labs)]
@@ -447,7 +485,7 @@ reconcile_missing_labels <- function(val_labs,
names(missings)[label_miss] <- lab_name[label_miss]
### if na is labelled and in vals and labels don't match
# relabel based on `conflict`
- if (conflict == "metadata") {
+ if (conflict == "val_label") {
names(missings)[mismatch] <- lab_name[mismatch]
} else if (conflict == "missing_label") {
names(val_labs)[match(lab_val[mismatch], val_labs)] <- miss_name[mismatch]
@@ -455,8 +493,7 @@ reconcile_missing_labels <- function(val_labs,
list(val_labs = val_labs, missings = missings)
}
-cb_label_data <- function(cb, conflict = c("metadata", "missing_label")) {
- conflict <- match.arg(conflict)
+cb_label_data <- function(cb, conflict = c("val_label", "missing_label")) {
data <- attr(cb, "data")
vals_by_label <- attr(cb, "vals_by_label")
factors <- attr(cb, "factors")
diff --git a/R/cb_create_redcap.r b/R/cb_create_redcap.r
index a2d7c1c..ccb8c58 100644
--- a/R/cb_create_redcap.r
+++ b/R/cb_create_redcap.r
@@ -1,5 +1,5 @@
#' Generate a codebook object from REDCap data
-#'
+#'
#' @description
#' `cb_create_redcap()` builds an object of class `"li_codebook"` from a dataset and
#' corresponding codebook exported from REDCap. The resulting object can be used
@@ -14,156 +14,199 @@
#' - Unpacking, labelling, and optional missing propagation for checkbox data
#' - Optional coercion for character variables marked as "integer" in `metedata$text_validation_type_or_show_slider_number`
#'
+#' All of these behaviors can be controlled using the `.options` argument.
+#'
#' @inheritParams cb_create
#' @param data A data frame exported or retrieved from REDCap.
#' @param metadata A data frame containing the REDCap codebook associated with `data`.
#' @param ... Additional columns from `metadata` to preserve in the final codebook.
#' New names can be assigned by passing named arguments. Columns for variable
#' name, form, variable label, and value labels are included by default.
-#' @param .name,.var_label,.val_labels Columns in `metadata` containing variable
-#' name, variable label, and value labels, respectively.
-#' @param .form Column in `metadata` containing form names. (Set to `NULL` to omit.)
#' @param .user_missing A formula or list of formulas specifying user missing values.
#' Formulas should specify variables on the left-hand side (as variable names
#' or [tidyselect][dplyr_tidy_select] expressions), and missing values on the
#' right-hand side. If left-hand side is omitted, defaults to `tidyselect::everything()`.
#' See "Specifying user missing values" in [`cb_create()`] documentation for examples.
-#' @param .coerce_integers Should variables listed as "integer" in `metedata$text_validation_type_or_show_slider_number`
-#' be coerced to integer?
-#' @param .checkbox_resp_values Should checkbox values use labels in `metadata`
-#' (`TRUE`) or "Yes" / "No" (`FALSE`)? See "Checkbox data handling" below.
-#' @param .propagate_checkbox_missings Should user missing values in a checkbox
-#' group be propagated across all variables in the group? See "Checkbox data handling"
-#' below.
+#' @param .options Additional options to use for codebook creation. Must be the result
+#' from a call to `cb_create_redcap_options()` or `cb_create_options()`. See `?cb_create_redcap_options`
+#' for available options.
#'
#' @return
-#' An `"li_codebook"` object, consisting of (1) a tibble summarizing the passed
-#' dataset and (2) attributes containing the passed dataset (in several formats)
-#' and additional metadata. Specifically:
-#' - A tibble with columns:
-#' - `name`: variable name
-#' - `form`: form name
-#' - `type`: optional column containing simplified variable type
-#' - `class`: optional column containing class(es) of each variable
-#' - `label_stem`: optional column containing variable label stems, if any variables
-#' are specified in `.split_var_labels`
-#' - `label`: variable label
-#' - `values`: values, with labels if applicable
-#' - `user_missing`: optional column, depending on value of `.user_missing_col`,
-#' showing user missing values, with labels if applicable
-#' - `missing`: proportion missing
-#' - additional columns if specified in `...`
-#' - Attributes:
-#' - Transformed versions of the passed dataset. See [`cb_get_data()`].
-#' - Lookup tables and other metadata used internally.
+#' An `"li_codebook"` object, consisting of a tibble summarizing the passed
+#' dataset and attributes containing additional metadata. The tibble includes columns:
+#' - `name`: variable name
+#' - `form`: form name
+#' - `type`: column containing simplified variable type
+#' - `class`: optional column containing class(es) of each variable
+#' - `label_stem`: optional column containing variable label stems, if any variables
+#' are specified in `.split_var_labels`
+#' - `label`: variable label
+#' - `values`: values, with labels if applicable
+#' - `user_missing`: optional column showing user missing values, with labels
+#' if applicable. By default, this column is included only if user missings
+#' are specified for at least one variable. This behavior can be changed using
+#' the `user_missing_col` argument to `cb_create_options()`.
+#' - `missing`: proportion missing
+#' - additional columns if specified in `...`
#'
#' @section Checkbox data handling:
#' ## Value labels
#' Data from REDCap checkboxes yields one variable in the dataset for each response
-#' option. These will be labelled generically with `"Yes"` or `"No"`, unless `.checkbox_resp_values`
-#' is `TRUE`, in which case response-specific labels from `metadata` will be used.
-#' For example, if a checkbox group has options "In the past year," "More than a
+#' option. By default, these will be labelled generically with `"Yes"` or `"No"`.
+#' For example, consider a checkbox group with options "In the past year," "More than a
#' year ago," and "Never," corresponding to variables `chk_var1___0`, `chk_var1___1`,
-#' and `chk_var1___2`: if `.checkbox_resp_values` is `FALSE`, all of these will
-#' have values:
+#' and `chk_var1___2`. By default, all of these will be given the same value labels:
#' - `chk_var1___0`, `chk_var1___1`, `chk_var1___2`: 0 = "No"; 1 = "Yes".
-#'
-#' If `.checkbox_resp_values` is `TRUE`, each variable will have unique labels:
+#' This behavior can be changed by setting `checkbox_resp_values = TRUE` in `cb_create_redcap_options()`.
+#' In this case, response-specific labels from `metadata` will be used, so that
+#' each variable will have unique labels:
#' - `chk_var1___0`: 0 = "Not selected," 1 = "In the past year"
#' - `chk_var1___1`: 0 = "Not selected," 1 = "More than a year ago"
#' - `chk_var1___2`: 0 = "Not selected," 0 = "Never"
#'
#' ## Missing value propagation
-#' If `.propagate_checkbox_missings` is `TRUE`, missing values in a checkbox group
-#' variable will be propagated to all variables in the group. For example, given
-#' a checkbox group with options "Pregnant," "Not pregnant," and "Not applicable,"
-#' corresponding to variables `chk_preg_0___0`, `chk_preg_0___1`, and `chk_preg_0____9`,
-#' and assuming that `-9` is specified as a user missing value. If `.propagate_checkbox_missings`
-#' is `TRUE`, `chk_preg_0___0` and `chk_preg_0___1` will be set to `-9` if `chk_preg_0____9`
-#' is `1`. Otherwise, these columns will remain as `0` where `chk_preg_0____9` is `1`.
+#' By default, missing values in a checkbox group will be propagated to all variables
+#' in the group. For example, consider a checkbox group with options "Pregnant,"
+#' "Not pregnant," and "Not applicable," corresponding to variables `chk_preg_0___0`,
+#' `chk_preg_0___1`, and `chk_preg_0____9`, and assume that `-9` is specified
+#' as a user missing value. By default, `chk_preg_0___0` and `chk_preg_0___1` will
+#' be set to `-9` if `chk_preg_0____9` is `1`. This behavior can be overridden by
+#' setting `propagate_checkbox_missings = FALSE` in `cb_create_redcap_options()`, in which
+#' case no values will be changed.
#'
#' @export
cb_create_redcap <- function(data,
metadata,
...,
- .name = field_name,
- .var_label = field_label,
- .val_labels = select_choices_or_calculations,
- .form = form_name,
.user_missing = NULL,
.split_var_labels = NULL,
- .include_types = !.include_r_classes,
- .include_r_classes = FALSE,
- .val_labs_sep1 = ", ",
- .val_labs_sep2 = "\\|",
- .rmv_html = TRUE,
- .rmv_line_breaks = TRUE,
- .coerce_integers = TRUE,
- .checkbox_resp_values = FALSE,
- .propagate_checkbox_missings = TRUE,
- .user_missing_col = c("if_any", "yes", "no"),
- .user_missing_conflict = c("metadata", "missing_label"),
- .user_missing_incompatible = c("ignore", "warn", "error")
- ) {
- .user_missing_col <- match.arg(.user_missing_col)
- .user_missing_conflict <- match.arg(.user_missing_conflict)
- meta <- meta_expand_checkboxes_rc(metadata, data)
+ .options = cb_create_redcap_options()) {
+ check_options(.options, redcap = TRUE)
+ meta <- meta_expand_checkboxes_rc(
+ metadata, data,
+ name = !!.options$name, type = !!.options$type
+ )
cb <- data |>
cb_init(
meta,
- meta_var_name = {{ .name }}, meta_var_label = {{ .var_label }},
- meta_val_labels = {{ .val_labels }}, form = {{ .form }}, ...,
- ..rc_type = field_type,
+ meta_var_name = !!.options$name, meta_var_label = !!.options$var_label,
+ meta_val_labels = !!.options$val_labels, form = !!.options$form, ...,
+ ..rc_type = !!.options$type,
..rc_validate_type = text_validation_type_or_show_slider_number,
)
- if (.coerce_integers) cb <- cb_coerce_integers_rc(cb)
+ if (.options$coerce_integers) cb <- cb_coerce_integers_rc(cb)
cb$..rc_validate_type <- NULL
cb <- cb |>
- cb_clean_fields(rmv_html = .rmv_html, rmv_line_breaks = .rmv_line_breaks) |>
+ cb_clean_fields(
+ rmv_html = .options$rmv_html,
+ rmv_line_breaks = .options$rmv_line_breaks
+ ) |>
cb_user_missings(
user_missing = .user_missing,
- incompatible = .user_missing_incompatible
+ incompatible = .options$user_missing_incompatible
) |>
- cb_add_lookups(sep1 = .val_labs_sep1, sep2 = .val_labs_sep2) |>
- cb_relabel_checkboxes_rc(use_resp_values = .checkbox_resp_values)
+ cb_add_lookups(
+ sep1 = .options$val_labs_sep1,
+ sep2 = .options$val_labs_sep2
+ ) |>
+ cb_relabel_checkboxes_rc(use_resp_values = .options$checkbox_resp_values)
if ("form" %in% names(cb)) cb <- cb_complete_label_rc(cb)
- if (.propagate_checkbox_missings) {
+ if (.options$propagate_checkbox_missings) {
cb <- cb_propagate_user_missing_checkboxes_rc(cb)
}
cb |>
- cb_label_data(conflict = .user_missing_conflict) |>
+ cb_label_data(conflict = .options$user_missing_conflict) |>
cb_zap_data() |>
cb_add_dims() |>
- cb_add_val_labels_col(user_missing_col = .user_missing_col) |>
+ cb_add_val_labels_col(user_missing_col = .options$user_missing_col) |>
cb_add_type_col(
- include_r_classes = .include_r_classes,
- include_types = .include_types
+ include_r_classes = .options$include_r_classes,
+ include_types = .options$include_types
) |>
cb_add_missing_col() |>
cb_split_labels_col(split_var_labels = rlang::enexpr(.split_var_labels)) |>
dplyr::relocate(any_of(c("form", "type", "class")), .after = name)
}
-## `field_name` and `field_type` are hard-coded -- do they always have these names?
-meta_expand_checkboxes_rc <- function(meta, data) {
- if (!("checkbox" %in% meta$field_type)) return(meta)
+#' @rdname cb_create_options
+#'
+#' @param name,var_label,val_labels,type For REDCap data, columns in `metadata` containing variable
+#' name, variable label, value labels, and variable type, respectively.
+#' @param form For REDCap data, column in `metadata` containing form names. (Set to `NULL` to omit.)
+#' @param val_labs_sep1,val_labs_sep2 For REDCap data, regex patterns separating value labels
+#' in `metadata`. `val_labs_sep1` separates values from labels, and `val_labs_sep2`
+#' separates value/label pairs from one another. e.g., if value labels are in
+#' the format `"1, First label|2, Second label"`, set `val_labs_sep1` to `","`
+#' and `val_labs_sep2` to `"\\|"`.
+#' @param coerce_integers For REDCap data, should variables listed as "integer" in `metadata$text_validation_type_or_show_slider_number`
+#' be coerced to integer?
+#' @param checkbox_resp_values For REDCap data, should checkbox values use labels in `metadata` (`TRUE`)
+#' or "Yes" / "No" (`FALSE`)? See "Checkbox data handling" on the `cb_create_redcap()`
+#' help page.
+#' @param propagate_checkbox_missings For REDCap data, should user missing values in a checkbox group
+#' be propagated across all variables in the group? See "Checkbox data handling"
+#' on the `cb_create_redcap()` help page.
+#'
+#' @export
+cb_create_redcap_options <- function(
+ ...,
+ include_types = TRUE,
+ include_r_classes = FALSE,
+ rmv_html = TRUE,
+ rmv_line_breaks = TRUE,
+ user_missing_col = c("if_any", "yes", "no"),
+ user_missing_conflict = c("val_label", "missing_label"),
+ user_missing_incompatible = c("ignore", "warn", "error"),
+ name = field_name,
+ var_label = field_label,
+ val_labels = select_choices_or_calculations,
+ type = field_type,
+ form = form_name,
+ val_labs_sep1 = ", ",
+ val_labs_sep2 = "\\|",
+ coerce_integers = TRUE,
+ checkbox_resp_values = FALSE,
+ propagate_checkbox_missings = TRUE) {
+ rlang::check_dots_empty()
+ out <- list(
+ include_types = include_types, include_r_classes = include_r_classes,
+ rmv_html = rmv_html, rmv_line_breaks = rmv_line_breaks,
+ user_missing_col = user_missing_col,
+ user_missing_conflict = user_missing_conflict,
+ user_missing_incompatible = user_missing_incompatible,
+ name = rlang::enquo(name), var_label = rlang::enquo(var_label),
+ val_labels = rlang::enquo(val_labels), type = rlang::enquo(type),
+ form = rlang::enquo(form), val_labs_sep1 = val_labs_sep1,
+ val_labs_sep2 = val_labs_sep2, coerce_integers = coerce_integers,
+ checkbox_resp_values = checkbox_resp_values,
+ propagate_checkbox_missings = propagate_checkbox_missings
+ )
+ structure(out, class = "cb_create_redcap_options")
+}
+
+meta_expand_checkboxes_rc <- function(meta, data, name, type) {
+ name_chr <- as.character(rlang::ensym(name))
+ type_chr <- as.character(rlang::ensym(type))
+ if (!("checkbox" %in% meta[[type_chr]])) return(meta)
datanames <- names(data)
+ meta <- dplyr::rename(meta, ..name = {{ name }}, ..type = {{ type }})
checkbox_names <- meta |>
- dplyr::filter(field_type == "checkbox") |>
- dplyr::select(field_name) |>
+ dplyr::filter(..type == "checkbox") |>
+ dplyr::select(..name) |>
dplyr::reframe(
.chk_name = datanames[
- stringr::str_starts(datanames, stringr::str_c(field_name, "___"))
+ stringr::str_starts(datanames, stringr::str_c(..name, "___"))
],
- .by = field_name
+ .by = ..name
)
meta |>
- dplyr::left_join(checkbox_names, dplyr::join_by(field_name)) |>
+ dplyr::left_join(checkbox_names, dplyr::join_by(..name)) |>
dplyr::mutate(
# .chk_name_stem = ifelse(!is.na(.chk_name), field_name, NA),
- field_name = dplyr::coalesce(.chk_name, field_name),
+ ..name = dplyr::coalesce(.chk_name, ..name),
.keep = "unused"
- )
+ ) |>
+ dplyr::rename("{name_chr}" := ..name, "{type_chr}" := ..type)
}
cb_coerce_integers_rc <- function(cb) {
diff --git a/R/cb_create_spss.r b/R/cb_create_spss.r
index b366a46..a4827cd 100644
--- a/R/cb_create_spss.r
+++ b/R/cb_create_spss.r
@@ -19,61 +19,47 @@
#' or [tidyselect][dplyr_tidy_select] expressions), and missing values on the
#' right-hand side. If left-hand side is omitted, defaults to `tidyselect::everything()`.
#' See "Specifying user missing values" in [`cb_create()`] documentation for examples.
-#' @param .rmv_html Should HTML tags be removed from variable and value labels?
-#' @param .rmv_line_breaks Should line breaks be removed from variable and value
-#' labels? If `TRUE`, line breaks will be replaced with `" / "`.
-#' @param .user_missing_conflict If labels passed to `.user_missing` conflicts with
-#' a value label in `data`, which should be used?
#'
#' @return
-#' An `"li_codebook"` object, consisting of (1) a tibble summarizing the passed
-#' dataset and (2) attributes containing the passed dataset (in several formats)
-#' and additional metadata. Specifically:
-#' - A tibble with columns:
-#' - `name`: variable name
-#' - `type`: optional column containing simplified variable type
-#' - `class`: optional column containing class(es) of each variable
-#' - `label_stem`: optional column containing variable label stems, if any variables
-#' are specified in `.split_var_labels`
-#' - `label`: variable label
-#' - `values`: values, with labels if applicable
-#' - `user_missing`: optional column, depending on value of `.user_missing_col`,
-#' showing user missing values, with labels if applicable
-#' - `missing`: proportion missing
-#' - Attributes:
-#' - Transformed versions of the passed dataset. See [`cb_get_data()`].
-#' - Lookup tables and other metadata used internally.
+#' An `"li_codebook"` object, consisting of a tibble summarizing the passed
+#' dataset and attributes containing additional metadata. The tibble includes columns:
+#' - `name`: variable name
+#' - `type`: column containing simplified variable type
+#' - `class`: optional column containing class(es) of each variable
+#' - `label_stem`: optional column containing variable label stems, if any variables
+#' are specified in `.split_var_labels`
+#' - `label`: variable label
+#' - `values`: values, with labels if applicable
+#' - `user_missing`: optional column showing user missing values, with labels
+#' if applicable. By default, this column is included only if user missings
+#' are specified for at least one variable. This behavior can be changed using
+#' the `user_missing_col` argument to `cb_create_options()`.
+#' - `missing`: proportion missing
#'
#' @export
cb_create_spss <- function(data,
.user_missing = NULL,
.split_var_labels = NULL,
- .include_types = !.include_r_classes,
- .include_r_classes = FALSE,
- .rmv_html = TRUE,
- .rmv_line_breaks = TRUE,
- .user_missing_col = c("if_any", "yes", "no"),
- .user_missing_conflict = c("val_label", "missing_label"),
- .user_missing_incompatible = c("ignore", "warn", "error")
- ) {
+ .options = cb_create_options()) {
+ check_options(.options)
data |>
cb_init() |>
cb_clean_fields_spss(
- rmv_html = .rmv_html,
- rmv_line_breaks = .rmv_line_breaks
+ rmv_html = .options$rmv_html,
+ rmv_line_breaks = .options$rmv_line_breaks
) |>
cb_add_label_col_spss() |>
cb_update_labels_spss(
user_missing = .user_missing,
- user_missing_conflict = .user_missing_conflict,
- user_missing_incompatible = .user_missing_incompatible
+ user_missing_conflict = .options$user_missing_conflict,
+ user_missing_incompatible = .options$user_missing_incompatible
) |>
cb_zap_data_spss() |>
cb_add_dims() |>
- cb_add_val_labels_col(user_missing_col = .user_missing_col) |>
+ cb_add_val_labels_col(user_missing_col = .options$user_missing_col) |>
cb_add_type_col(
- include_r_classes = .include_r_classes,
- include_types = .include_types
+ include_r_classes = .options$include_r_classes,
+ include_types = .options$include_types
) |>
cb_add_missing_col() |>
cb_split_labels_col(split_var_labels = rlang::enexpr(.split_var_labels))
@@ -114,7 +100,6 @@ cb_update_labels_spss <- function(cb,
cb_add_lookups() |>
set_attrs(data_labelled = data)
} else {
- conflict <- sub("val_label", "metadata", match.arg(user_missing_conflict))
user_missing <- check_user_missing_arg(user_missing)
user_missing_vars <- user_missing |>
lapply(\(um) {
@@ -140,7 +125,7 @@ cb_update_labels_spss <- function(cb,
user_missing = attr_user_missing[names(attr_user_missing) %in% user_missing_vars],
vals_by_label = attr_vals_by_label[names(attr_vals_by_label) %in% user_missing_vars]
) |>
- cb_label_data(conflict = conflict) |>
+ cb_label_data(conflict = user_missing_conflict) |>
# then restore full missing and val attributes
set_attrs(
user_missing = attr_user_missing,
diff --git a/R/cb_summarize.r b/R/cb_summarize.r
index fb22b3d..d2c4650 100644
--- a/R/cb_summarize.r
+++ b/R/cb_summarize.r
@@ -152,7 +152,8 @@ cb_summarize_categorical_impl <- function(cb,
prefixed = TRUE,
detail_missing = missing(group_by),
detail_na_label = "NA",
- warn_if_none = FALSE) {
+ warn_if_none = FALSE,
+ group_rows = NULL) {
force(detail_missing)
data <- attr(cb, "data_labelled")
data_dt <- data.table::as.data.table(data)
@@ -307,14 +308,18 @@ cb_summarize_categorical_impl <- function(cb,
)
freqs <- freqs[, cols_out, with = FALSE]
+ group_cols <- setdiff(group_by, group_rows)
+ if (!length(group_cols)) group_cols <- NULL
+
freqs |>
tibble::as_tibble() |>
set_attrs(
detail_missing = detail_missing,
id_cols = c("name", label_cols, "value"),
group_by = group_by,
- group_cols = group_by,
- group_counts = group_counts(cb, group_by)
+ group_rows = group_rows,
+ group_cols = group_cols,
+ group_counts = group_counts(cb, group_cols)
)
}
diff --git a/R/cb_write.r b/R/cb_write.r
index 748e140..a99c2f5 100644
--- a/R/cb_write.r
+++ b/R/cb_write.r
@@ -15,24 +15,28 @@
#' a variant.
#' @param file Path to write to.
#' @param dataset_name Name of the dataset to display in workbook headers.
-#' @param incl_date,incl_dims Should the date and/or dataset dimensions be included
-#' in the Overview tab header?
-#' @param hyperlinks If `TRUE`, variable names on the Overview sheet will link
-#' to corresponding rows on summary tabs and vice versa.
#' @param group_by <[`tidy-select`][dplyr_tidy_select]> Column or columns to group
#' by. If specified, additional numeric and categorical summary tabs will be included
-#' with grouped summaries. Subgroups are shown in columns by default. For the numeric
-#' summary tab, subgroups for some or all grouping variables can instead be shown
-#' in rows if specified in `group_rows_numeric`.
-#' @param group_rows_numeric <[`tidy-select`][dplyr_tidy_select]> Column or columns
-#' to group by in rows on the grouped numeric summary tab. All columns must also
-#' be specified in `group_by`.
+#' with grouped summaries. Subgroups are shown in columns by default. Some or all
+#' grouping variables can instead be shown in rows if specified in `group_rows`,
+#' `group_rows_numeric`, or `group_rows_categorical`.
+#' @param group_rows <[`tidy-select`][dplyr_tidy_select]> Column or columns to group
+#' by in rows on grouped summary tabs. All columns must also be specified in `group_by`.
+#' Will apply to both numeric and categorical summary tabs unless otherwise specified
+#' in `group_rows_numeric` or `group_rows_categorical`.
+#' @param group_rows_numeric,group_rows_categorical <[`tidy-select`][dplyr_tidy_select]>
+#' Column or columns to group by in rows on grouped numeric or categorical summary
+#' tab.
#' @param detail_missing Include detailed missing value information on ungrouped
#' categorical and text summary tabs? (Detailed missing information for grouped
#' summary tabs is not currently supported.)
#' @param n_text_vals On the text summary tab, how many unique non-missing values
#' should be included for each variable? If there are more than `n_text_vals` + 1
#' unique values, the `n_text_vals` most common non-missing values will be included.
+#' @param incl_date,incl_dims Should the date and/or dataset dimensions be included
+#' in the Overview tab header?
+#' @param hyperlinks If `TRUE`, variable names on the Overview sheet will link
+#' to corresponding rows on summary tabs and vice versa.
#' @param overwrite Overwrite existing file?
#'
#' @return
@@ -54,13 +58,15 @@
cb_write <- function(cb,
file,
dataset_name = NULL,
- incl_date = TRUE,
- incl_dims = TRUE,
- hyperlinks = TRUE,
group_by = NULL,
- group_rows_numeric = NULL,
+ group_rows = NULL,
+ group_rows_numeric = group_rows,
+ group_rows_categorical = group_rows,
detail_missing = c("if_any_user_missing", "yes", "no"),
n_text_vals = 5,
+ incl_date = TRUE,
+ incl_dims = TRUE,
+ hyperlinks = TRUE,
overwrite = TRUE) {
check_codebook(cb)
detail_missing <- match.arg(detail_missing)
@@ -71,32 +77,38 @@ cb_write <- function(cb,
num = cb_summarize_numeric_impl(cb),
cat = cb_summarize_categorical_impl(cb, detail_missing = detail_missing),
txt = cb_summarize_text_impl(
- cb,
- n_text_vals = n_text_vals,
+ cb,
+ n_text_vals = n_text_vals,
detail_missing = detail_missing
)
)
group_by <- cb_untidyselect(cb, {{ group_by }})
- group_rows_numeric <- cb_untidyselect(cb, {{ group_rows_numeric }})
- if (!is.null(group_rows_numeric)) {
- if (is.null(group_by)) {
- cli::cli_abort(
- "If `group_rows_numeric` is specified, `group_by` must also be specified."
- )
- }
- if (length(setdiff(group_rows_numeric, group_by))) {
- cli::cli_abort(
- "All columns specified in `group_rows_numeric` must also be included in `group_by`."
- )
- }
+ group_rows <- cb_untidyselect(cb, {{ group_rows }})
+ if (missing(group_rows_numeric)) {
+ group_rows_numeric <- group_rows
+ } else {
+ group_rows_numeric <- cb_untidyselect(cb, {{ group_rows_numeric }})
+ }
+ if (missing(group_rows_categorical)) {
+ group_rows_categorical <- group_rows
+ } else {
+ group_rows_categorical <- cb_untidyselect(cb, {{ group_rows_categorical }})
}
+ check_group_rows_arg(group_rows, group_by)
+ check_group_rows_arg(group_rows_numeric, group_by)
+ check_group_rows_arg(group_rows_categorical, group_by)
+
if (!is.null(group_by)) {
summaries$num_grp <- cb_summarize_numeric_impl(
cb,
group_by = group_by,
group_rows = group_rows_numeric
)
- summaries$cat_grp <- cb_summarize_categorical_impl(cb, group_by = group_by)
+ summaries$cat_grp <- cb_summarize_categorical_impl(
+ cb,
+ group_by = group_by,
+ group_rows = group_rows_categorical
+ )
}
cb_write_codebook(
cb, summaries,
@@ -768,7 +780,13 @@ cb_write_codebook <- function(cb,
summaries$cat_grp <- summaries$cat_grp |>
cb_format_names(skip = group_by, attrs = "id_cols")
cols_pct <- untidyselect(summaries$cat_grp, tidyselect::starts_with("%"))
- clear_repeats <- setdiff(attr(summaries$cat_grp, "id_cols"), "Value")
+ group_rows <- attr(summaries$cat_grp, "group_rows")
+ id_cols <- attr(summaries$cat_grp, "id_cols")
+ clear_repeats <- c(setdiff(id_cols, "Value"), group_rows)
+ if (!is.null(group_rows)) {
+ attr(summaries$cat_grp, "group_rows") <- c(group_rows, "Value")
+ attr(summaries$cat_grp, "id_cols") <- setdiff(id_cols, "Value")
+ }
sheet_nms$cat_grp <- paste0("Grouped ", sheet_nms$cat)
headers$cat_grp <- c(headers$cat, paste("By ", toString(group_by)))
params$cat_grp <- summaries$cat_grp |>
diff --git a/R/utils.r b/R/utils.r
index 22226cc..6f49877 100644
--- a/R/utils.r
+++ b/R/utils.r
@@ -1,20 +1,56 @@
-is_codebook <- function(x) "li_codebook" %in% class(x)
+is_codebook <- function(x) inherits(x, "li_codebook")
check_codebook <- function(x) {
arg <- as.character(rlang::ensym(x))
if (!is_codebook(x)) {
cli::cli_abort('{.arg {arg}} must be an object of class `"li_codebook"`.')
}
}
+check_options <- function(x, redcap = FALSE) {
+ if (redcap) {
+ opts_class <- "cb_create_redcap_options"
+ opts_class_wrong <- "cb_create_options"
+ } else {
+ opts_class <- "cb_create_options"
+ opts_class_wrong <- "cb_create_redcap_options"
+ }
+ if (inherits(x, opts_class_wrong)) {
+ msg <- c(
+ "!" = "`.options` must be created from `{opts_class}()`, not `{opts_class_wrong}()`."
+ )
+ if (!redcap) {
+ msg <- c(msg, "i" = "Did you mean to call `cb_create_redcap()`?")
+ }
+ cli::cli_abort(msg)
+ }
+ if (!inherits(x, opts_class)) {
+ cli::cli_abort("`.options` must be created from `{opts_class}()`")
+ }
+}
check_user_missing_arg <- function(x) {
arg <- as.character(rlang::ensym(x))
if (!(
- rlang::is_formula(x) || (is.list(x) && all(sapply(x, rlang::is_formula)))
- )) {
+ rlang::is_formula(x) || (is.list(x) && all(sapply(x, rlang::is_formula)))
+ )) {
cli::cli_abort("{.arg {arg}} must be a formula or list of formulas.")
}
if (rlang::is_formula(x)) x <- list(x)
x
}
+check_group_rows_arg <- function(group_rows, group_by) {
+ arg <- as.character(rlang::ensym(group_rows))
+ if (!is.null(group_rows)) {
+ if (is.null(group_by)) {
+ cli::cli_abort(
+ "If {.arg {arg}} is specified, {.arg group_by} must also be specified."
+ )
+ }
+ if (length(setdiff(group_rows, group_by))) {
+ cli::cli_abort(
+ "All columns specified in {.arg {arg}} must also be included in {.arg group_by}."
+ )
+ }
+ }
+}
set_attrs <- function(x, ...) {
dots <- rlang::list2(...)
for (nm in names(dots)) attr(x, nm) <- dots[[nm]]
diff --git a/README.Rmd b/README.Rmd
new file mode 100644
index 0000000..3173f0c
--- /dev/null
+++ b/README.Rmd
@@ -0,0 +1,130 @@
+---
+output: github_document
+---
+
+
+
+```{r, include = FALSE}
+knitr::opts_chunk$set(
+ collapse = TRUE,
+ comment = "#",
+ out.width = "100%",
+ fig.align = "center",
+ fig.path = "man/figures/",
+ eval = FALSE
+)
+```
+
+# lighthouse.codebook
+
+The lighthouse.codebook package includes tools to summarize a dataset into a formatted
+Excel workbook, including a data dictionary and variable summaries. It incorporates external
+metadata (such as variable labels, value labels, and user missing / non-response codes),
+with functions for using metadata from SPSS and REDCap datasets. Codebooks can be
+customized in a number of ways, including options for grouped summaries.
+
+## Installation
+
+You can install lighthouse.codebook by running:
+
+```r
+# install.packages("remotes")
+remotes::install_github("ccsarapas/lighthouse.codebook")
+```
+
+## Creating codebooks
+Creating a codebook involves two general steps:
+
+1. Create a "codebook" object in R from a data frame (and,
+ optionally, metadata) using `cb_create()` or a specialized variant
+ (such as `cb_create_spss()` or `cb_create_redcap()`).
+
+2. Write the codebook to disk using `cb_write()`.
+
+```r
+library(lighthouse.codebook)
+
+# create and write a codebook without metadata
+dat |>
+ cb_create() |>
+ cb_write("cb.xlsx")
+
+# with metadata
+dat |>
+ cb_create(metadata = dat1_metadata) |>
+ cb_write("cb.xlsx")
+
+# from SPSS data
+dat_spss <- haven::read_sav("dat_spss.sav", user_na = TRUE)
+
+dat_spss |>
+ cb_create_spss() |>
+ cb_write("cb_spss.xlsx")
+
+# from REDCap data
+dat_rc <- REDCapR::redcap_read(redcap_uri = rc_uri, token = rc_token)
+meta_rc <- REDCapR::redcap_metadata_read(redcap_uri = rc_uri, token = rc_token)
+
+dat_rc$data |>
+ cb_create_redcap(metadata = meta_rc$data) |>
+ cb_write("cb_rc.xlsx")
+```
+
+## Customizing codebooks
+
+There are many options for controlling how data is interpreted, summarized, and
+presented. See `vignette("lighthouse-codebook")` for some of the most useful options,
+including grouped data summaries and specifying user missing codes. Further options
+are detailed in the help pages for `cb_create()` and `cb_write()`.
+
+
+
+## Codebook contents
+
+The codebook written to disk will include an _overview_ tab listing all variables
+in the dataset; _summary_ tabs for numeric, categorical, and text variables; and,
+if grouping variables are specified, _grouped summary_ tabs for numeric and categorical
+variables.
+
+The _overview_ tab includes one row for each variable in the dataset, with information
+on variable types, labels, values, and missingness. By default, each variable is
+hyperlinked to its location on the relevant summary tab.
+
+```{r, overview, echo = FALSE, eval = TRUE}
+knitr::include_graphics("man/figures/README-overview.png")
+```
+
+The _numeric summary_ tab includes descriptive statistics for all numeric variables
+in the dataset:
+
+```{r, numeric, echo = FALSE, eval = TRUE}
+knitr::include_graphics("man/figures/README-numeric.png")
+```
+
+The _categorical summary_ tab includes frequencies for all categorical variables,
+optionally with separate rows for user missing values:
+
+```{r, categorical, echo = FALSE, eval = TRUE}
+knitr::include_graphics("man/figures/README-categorical.png")
+```
+
+Finally, the _text summary_ tab includes frequencies for the most common values for all
+text variables in the dataset. (The number of values shown can be adjusted using
+the `n_text_vals` argument to `cb_write()`.)
+
+```{r, text, echo = FALSE, eval = TRUE}
+knitr::include_graphics("man/figures/README-text.png")
+```
+
+If `group_by` is specified in `cb_write()`, additional numeric and categorical summary
+tabs grouped by the specified variables will be included.
+
+## SPSS extension
+
+Functionality from this package is also available as an SPSS extension command [here](https://github.com/ccsarapas/lighthouse.codebook.spss).
diff --git a/README.md b/README.md
index 3e38f4e..c846ee9 100644
--- a/README.md
+++ b/README.md
@@ -1,16 +1,115 @@
+
+
# lighthouse.codebook
-The lighthouse.codebook package includes tools for summarizing datasets used by staff at the [Lighthouse Institute](https://www.chestnut.org/lighthouse-institute/), the research division of Chestnut Health Systems.
+The lighthouse.codebook package includes tools to summarize a dataset
+into a formatted Excel workbook, including a data dictionary and
+variable summaries. It incorporates external metadata (such as variable
+labels, value labels, and user missing / non-response codes), with
+functions for using metadata from SPSS and REDCap datasets. Codebooks
+can be customized in a number of ways, including options for grouped
+summaries.
## Installation
-Install lighthouse.codebook by running:
+You can install lighthouse.codebook by running:
``` r
# install.packages("remotes")
remotes::install_github("ccsarapas/lighthouse.codebook")
```
+
+## Creating codebooks
+
+Creating a codebook involves two general steps:
+
+1. Create a “codebook” object in R from a data frame (and, optionally,
+ metadata) using `cb_create()` or a specialized variant (such as
+ `cb_create_spss()` or `cb_create_redcap()`).
+
+2. Write the codebook to disk using `cb_write()`.
+
+``` r
+library(lighthouse.codebook)
+
+# create and write a codebook without metadata
+dat |>
+ cb_create() |>
+ cb_write("cb.xlsx")
+
+# with metadata
+dat |>
+ cb_create(metadata = dat1_metadata) |>
+ cb_write("cb.xlsx")
+
+# from SPSS data
+dat_spss <- haven::read_sav("dat_spss.sav", user_na = TRUE)
+
+dat_spss |>
+ cb_create_spss() |>
+ cb_write("cb_spss.xlsx")
+
+# from REDCap data
+dat_rc <- REDCapR::redcap_read(redcap_uri = rc_uri, token = rc_token)
+meta_rc <- REDCapR::redcap_metadata_read(redcap_uri = rc_uri, token = rc_token)
+
+dat_rc$data |>
+ cb_create_redcap(metadata = meta_rc$data) |>
+ cb_write("cb_rc.xlsx")
+```
+
+## Customizing codebooks
+
+There are many options for controlling how data is interpreted,
+summarized, and presented. See `vignette("lighthouse-codebook")` for
+some of the most useful options, including grouped data summaries and
+specifying user missing codes. Further options are detailed in the help
+pages for `cb_create()` and `cb_write()`.
+
+
+
+## Codebook contents
+
+The codebook written to disk will include an *overview* tab listing all
+variables in the dataset; *summary* tabs for numeric, categorical, and
+text variables; and, if grouping variables are specified, *grouped
+summary* tabs for numeric and categorical variables.
+
+The *overview* tab includes one row for each variable in the dataset,
+with information on variable types, labels, values, and missingness. By
+default, each variable is hyperlinked to its location on the relevant
+summary tab.
+
+
+
+The *numeric summary* tab includes descriptive statistics for all
+numeric variables in the dataset:
+
+
+
+The *categorical summary* tab includes frequencies for all categorical
+variables, optionally with separate rows for user missing values:
+
+
+
+Finally, the *text summary* tab includes frequencies for the most common
+values for all text variables in the dataset. (The number of values
+shown can be adjusted using the `n_text_vals` argument to `cb_write()`.)
+
+
+
+If `group_by` is specified in `cb_write()`, additional numeric and
+categorical summary tabs grouped by the specified variables will be
+included.
+
## SPSS extension
-Functionality from this package is also available as an SPSS extension command [here](https://github.com/ccsarapas/lighthouse.codebook.spss).
\ No newline at end of file
+Functionality from this package is also available as an SPSS extension
+command [here](https://github.com/ccsarapas/lighthouse.codebook.spss).
diff --git a/_pkgdown.yml b/_pkgdown.yml
index d98e4d4..961f921 100644
--- a/_pkgdown.yml
+++ b/_pkgdown.yml
@@ -1,7 +1,10 @@
url: https://ccsarapas.github.io/lighthouse.codebook/
template:
bootstrap: 5
- bootswatch: sandstone
+ bslib:
+ base_font: {google: "Roboto"}
+ code_font: {google: "Roboto Mono"}
+ font_scale: 1.2
footer:
structure:
left: package
@@ -10,4 +13,4 @@ authors:
Casey Sarapas:
href: "https://chestnut.org/li/scientists-and-project-directors/category/research-scientists/profile/casey-sarapas-phd"
Chestnut Health Systems:
- href: "https://chestnut.org/"
\ No newline at end of file
+ href: "https://chestnut.org/"
diff --git a/man/cb_create.Rd b/man/cb_create.Rd
index c8872c6..0112709 100644
--- a/man/cb_create.Rd
+++ b/man/cb_create.Rd
@@ -13,22 +13,16 @@ cb_create(
.val_labels = val_labels,
.user_missing = NULL,
.split_var_labels = NULL,
- .include_types = !.include_r_classes,
- .include_r_classes = FALSE,
.val_labs_sep1 = NULL,
.val_labs_sep2 = NULL,
- .rmv_html = TRUE,
- .rmv_line_breaks = TRUE,
- .user_missing_col = c("if_any", "yes", "no"),
- .user_missing_conflict = c("metadata", "missing_label"),
- .user_missing_incompatible = c("ignore", "warn", "error")
+ .options = cb_create_options()
)
}
\arguments{
\item{data}{A data frame.}
-\item{metadata}{A data frame containing metadata, such as variable labels and value
-labels.}
+\item{metadata}{A data frame containing metadata, such as variable labels and
+value labels.}
\item{...}{Additional columns from \code{metadata} to preserve in the final codebook.
New names can be assigned by passing named arguments. Columns for variable
@@ -49,58 +43,34 @@ See "Specifying user missing values" below for examples.}
expressions, indicating (sets of) variable labels with a common stem that should
be extracted into a separate column.}
-\item{.include_types}{Include a column listing simplified type for each variable?
-(e.g,. \code{"categorical"}, \code{"date-time"}.)}
-
-\item{.include_r_classes}{Include a column listing class(es) of each variable?
-(e.g., \code{"factor"}, \code{"POSIXct, POSIXt"}.)}
-
\item{.val_labs_sep1, .val_labs_sep2}{Regex patterns separating value labels
in \code{metadata}. \code{.val_labs_sep1} separates values from labels, and \code{.val_labs_sep2}
-separates value/label pairs. e.g., if value labels are in format \code{"1, First label|2, Second label"},
-set \code{.val_labs_sep1} to \code{","} and \code{.val_labs_sep2} to \code{"\\\\|"}.}
-
-\item{.rmv_html}{Should HTML tags be removed from metadata (e.g., from variable
-and value labels)?}
-
-\item{.rmv_line_breaks}{Should line breaks be removed from metadata (e.g., from
-variable and value labels)? If \code{TRUE}, line breaks will be replaced with \code{" / "}.}
-
-\item{.user_missing_col}{Include value labels for user missing values in a separate
-column? The default, \code{"if_any"}, adds the column only if user missings are
-specified for at least one variable.}
-
-\item{.user_missing_conflict}{If different labels for a value are provided in
-metadata and user missings, which should be used?}
+separates value/label pairs from one another. e.g., if value labels are in
+the format \code{"1, First label|2, Second label"}, set \code{.val_labs_sep1} to \code{","}
+and \code{.val_labs_sep2} to \code{"\\\\|"}.}
-\item{.user_missing_incompatible}{How to handle variables specified in \code{.user_missing}
-that aren't compatible with user missing values (e.g., logical, Date, or POSIXt)?}
+\item{.options}{Additional options to use for codebook creation. Must be the result
+from a call to \code{cb_create_options()}. See that function's help page for available
+options.}
}
\value{
-An \code{"li_codebook"} object, consisting of (1) a tibble summarizing the passed
-dataset and (2) attributes containing the passed dataset (in several formats)
-and additional metadata. Specifically:
-\itemize{
-\item A tibble with columns:
+An \code{"li_codebook"} object, consisting of a tibble summarizing the passed
+dataset and attributes containing additional metadata. The tibble includes columns:
\itemize{
\item \code{name}: variable name
-\item \code{type}: optional column containing simplified variable type
+\item \code{type}: column containing simplified variable type
\item \code{class}: optional column containing class(es) of each variable
\item \code{label_stem}: optional column containing variable label stems, if any variables
are specified in \code{.split_var_labels}
\item \code{label}: variable label
\item \code{values}: values, with labels if applicable
-\item \code{user_missing}: optional column, depending on value of \code{.user_missing_col},
-showing user missing values, with labels if applicable
+\item \code{user_missing}: optional column showing user missing values, with labels
+if applicable. By default, this column is included only if user missings
+are specified for at least one variable. This behavior can be changed using
+the \code{user_missing_col} argument to \code{cb_create_options()}.
\item \code{missing}: proportion missing
\item additional columns if specified in \code{...}
}
-\item Attributes:
-\itemize{
-\item Transformed versions of the passed dataset. See \code{\link[=cb_get_data]{cb_get_data()}}
-\item Lookup tables and other metadata used internally.
-}
-}
}
\description{
\code{cb_create()} builds an object of class \code{"li_codebook"} from a dataset and optional
@@ -141,13 +111,15 @@ User missing values may optionally be named to set value labels:
\preformatted{
.user_missing = ~ c(Declined = -98, "Not applicable" = -99)
}
-If labels set in \code{.user_missing} conflict with those in \code{metadata}, \code{.user_missing_conflict}
-controls which labels are used.
-
-User missing values are not compatible with logical, date, or datetime (POSIXt)
-variables. By default, these variables will be ignored if specified in \code{.user_missing}.
-(i.e., user missing values will be applied only to compatible variables.) This behavior
-can be changed using the \code{.user_missing_incompatible} argument.
+If labels set in \code{.user_missing} conflict with those in \code{metadata}, the \code{user_missing_conflict}
+argument to \code{cb_create_options()} controls which labels are used.
+
+User missings may be set for numeric, character, factor/ordered factor, and haven_labelled/haven_labelled_spss
+vectors. For factors, user missings are set based on factor labels (not the underlying
+integer codes). For \code{"haven_labelled"} vectors, user missings are set based on
+values (not value labels). By default, variables with incompatible classes (e.g.,
+logical, Date, POSIXt) will be ignored if specified in \code{.user_missing}. This
+behavior can be changed using the \code{user_missing_incompatible} argument to \code{cb_create_options()}.
}
\examples{
@@ -170,7 +142,7 @@ diamonds2 |>
transform(
carat_group = factor(carat_group),
price_group = factor(price_group)
- ) |>
+ ) |>
cb_create()
# provide metadata for variable and value labels
diff --git a/man/cb_create_options.Rd b/man/cb_create_options.Rd
new file mode 100644
index 0000000..33b114d
--- /dev/null
+++ b/man/cb_create_options.Rd
@@ -0,0 +1,89 @@
+% Generated by roxygen2: do not edit by hand
+% Please edit documentation in R/cb_create.r, R/cb_create_redcap.r
+\name{cb_create_options}
+\alias{cb_create_options}
+\alias{cb_create_redcap_options}
+\title{Additional options for codebook creation}
+\usage{
+cb_create_options(
+ ...,
+ include_types = TRUE,
+ include_r_classes = FALSE,
+ rmv_html = TRUE,
+ rmv_line_breaks = TRUE,
+ user_missing_col = c("if_any", "yes", "no"),
+ user_missing_conflict = c("val_label", "missing_label"),
+ user_missing_incompatible = c("ignore", "warn", "error")
+)
+
+cb_create_redcap_options(
+ ...,
+ include_types = TRUE,
+ include_r_classes = FALSE,
+ rmv_html = TRUE,
+ rmv_line_breaks = TRUE,
+ user_missing_col = c("if_any", "yes", "no"),
+ user_missing_conflict = c("val_label", "missing_label"),
+ user_missing_incompatible = c("ignore", "warn", "error"),
+ name = field_name,
+ var_label = field_label,
+ val_labels = select_choices_or_calculations,
+ type = field_type,
+ form = form_name,
+ val_labs_sep1 = ", ",
+ val_labs_sep2 = "\\\\|",
+ coerce_integers = TRUE,
+ checkbox_resp_values = FALSE,
+ propagate_checkbox_missings = TRUE
+)
+}
+\arguments{
+\item{...}{These dots are for future extensions and must be empty.}
+
+\item{include_types}{Include a column listing simplified type for each variable?
+(e.g., \code{"categorical"}, \code{"date-time"}.)}
+
+\item{include_r_classes}{Include a column listing class(es) of each variable?
+(e.g., \code{"factor"}, \code{"POSIXct, POSIXt"}.)}
+
+\item{rmv_html}{Should HTML tags be removed from metadata (e.g., from variable
+and value labels)?}
+
+\item{rmv_line_breaks}{Should line breaks be removed from metadata (e.g., from
+variable and value labels)? If \code{TRUE}, line breaks will be replaced with \code{" / "}.}
+
+\item{user_missing_col}{Include value labels for user missing values in a separate
+column? The default, \code{"if_any"}, adds the column only if user missings are
+specified for at least one variable.}
+
+\item{user_missing_conflict}{If labels passed to \code{.user_missing} conflict with
+value labels in metadata, which should be used?}
+
+\item{user_missing_incompatible}{How to handle variables specified in \code{.user_missing}
+that aren't compatible with user missing values (e.g., logical, Date, or POSIXt)?}
+
+\item{name, var_label, val_labels, type}{For REDCap data, columns in \code{metadata} containing variable
+name, variable label, value labels, and variable type, respectively.}
+
+\item{form}{For REDCap data, column in \code{metadata} containing form names. (Set to \code{NULL} to omit.)}
+
+\item{val_labs_sep1, val_labs_sep2}{For REDCap data, regex patterns separating value labels
+in \code{metadata}. \code{val_labs_sep1} separates values from labels, and \code{val_labs_sep2}
+separates value/label pairs from one another. e.g., if value labels are in
+the format \code{"1, First label|2, Second label"}, set \code{val_labs_sep1} to \code{","}
+and \code{val_labs_sep2} to \code{"\\\\|"}.}
+
+\item{coerce_integers}{For REDCap data, should variables listed as "integer" in \code{metadata$text_validation_type_or_show_slider_number}
+be coerced to integer?}
+
+\item{checkbox_resp_values}{For REDCap data, should checkbox values use labels in \code{metadata} (\code{TRUE})
+or "Yes" / "No" (\code{FALSE})? See "Checkbox data handling" on the \code{cb_create_redcap()}
+help page.}
+
+\item{propagate_checkbox_missings}{For REDCap data, should user missing values in a checkbox group
+be propagated across all variables in the group? See "Checkbox data handling"
+on the \code{cb_create_redcap()} help page.}
+}
+\description{
+Additional options for use by \code{cb_create()}.
+}
diff --git a/man/cb_create_redcap.Rd b/man/cb_create_redcap.Rd
index 9ced111..98cc49f 100644
--- a/man/cb_create_redcap.Rd
+++ b/man/cb_create_redcap.Rd
@@ -8,24 +8,9 @@ cb_create_redcap(
data,
metadata,
...,
- .name = field_name,
- .var_label = field_label,
- .val_labels = select_choices_or_calculations,
- .form = form_name,
.user_missing = NULL,
.split_var_labels = NULL,
- .include_types = !.include_r_classes,
- .include_r_classes = FALSE,
- .val_labs_sep1 = ", ",
- .val_labs_sep2 = "\\\\|",
- .rmv_html = TRUE,
- .rmv_line_breaks = TRUE,
- .coerce_integers = TRUE,
- .checkbox_resp_values = FALSE,
- .propagate_checkbox_missings = TRUE,
- .user_missing_col = c("if_any", "yes", "no"),
- .user_missing_conflict = c("metadata", "missing_label"),
- .user_missing_incompatible = c("ignore", "warn", "error")
+ .options = cb_create_redcap_options()
)
}
\arguments{
@@ -37,11 +22,6 @@ cb_create_redcap(
New names can be assigned by passing named arguments. Columns for variable
name, form, variable label, and value labels are included by default.}
-\item{.name, .var_label, .val_labels}{Columns in \code{metadata} containing variable
-name, variable label, and value labels, respectively.}
-
-\item{.form}{Column in \code{metadata} containing form names. (Set to \code{NULL} to omit.)}
-
\item{.user_missing}{A formula or list of formulas specifying user missing values.
Formulas should specify variables on the left-hand side (as variable names
or \link[dplyr:dplyr_tidy_select]{tidyselect} expressions), and missing values on the
@@ -52,69 +32,29 @@ See "Specifying user missing values" in \code{\link[=cb_create]{cb_create()}} do
expressions, indicating (sets of) variable labels with a common stem that should
be extracted into a separate column.}
-\item{.include_types}{Include a column listing simplified type for each variable?
-(e.g,. \code{"categorical"}, \code{"date-time"}.)}
-
-\item{.include_r_classes}{Include a column listing class(es) of each variable?
-(e.g., \code{"factor"}, \code{"POSIXct, POSIXt"}.)}
-
-\item{.val_labs_sep1, .val_labs_sep2}{Regex patterns separating value labels
-in \code{metadata}. \code{.val_labs_sep1} separates values from labels, and \code{.val_labs_sep2}
-separates value/label pairs. e.g., if value labels are in format \code{"1, First label|2, Second label"},
-set \code{.val_labs_sep1} to \code{","} and \code{.val_labs_sep2} to \code{"\\\\|"}.}
-
-\item{.rmv_html}{Should HTML tags be removed from metadata (e.g., from variable
-and value labels)?}
-
-\item{.rmv_line_breaks}{Should line breaks be removed from metadata (e.g., from
-variable and value labels)? If \code{TRUE}, line breaks will be replaced with \code{" / "}.}
-
-\item{.coerce_integers}{Should variables listed as "integer" in \code{metedata$text_validation_type_or_show_slider_number}
-be coerced to integer?}
-
-\item{.checkbox_resp_values}{Should checkbox values use labels in \code{metadata}
-(\code{TRUE}) or "Yes" / "No" (\code{FALSE})? See "Checkbox data handling" below.}
-
-\item{.propagate_checkbox_missings}{Should user missing values in a checkbox
-group be propagated across all variables in the group? See "Checkbox data handling"
-below.}
-
-\item{.user_missing_col}{Include value labels for user missing values in a separate
-column? The default, \code{"if_any"}, adds the column only if user missings are
-specified for at least one variable.}
-
-\item{.user_missing_conflict}{If different labels for a value are provided in
-metadata and user missings, which should be used?}
-
-\item{.user_missing_incompatible}{How to handle variables specified in \code{.user_missing}
-that aren't compatible with user missing values (e.g., logical, Date, or POSIXt)?}
+\item{.options}{Additional options to use for codebook creation. Must be the result
+from a call to \code{cb_create_redcap_options()} or \code{cb_create_options()}. See \code{?cb_create_redcap_options}
+for available options.}
}
\value{
-An \code{"li_codebook"} object, consisting of (1) a tibble summarizing the passed
-dataset and (2) attributes containing the passed dataset (in several formats)
-and additional metadata. Specifically:
-\itemize{
-\item A tibble with columns:
+An \code{"li_codebook"} object, consisting of a tibble summarizing the passed
+dataset and attributes containing additional metadata. The tibble includes columns:
\itemize{
\item \code{name}: variable name
\item \code{form}: form name
-\item \code{type}: optional column containing simplified variable type
+\item \code{type}: column containing simplified variable type
\item \code{class}: optional column containing class(es) of each variable
\item \code{label_stem}: optional column containing variable label stems, if any variables
are specified in \code{.split_var_labels}
\item \code{label}: variable label
\item \code{values}: values, with labels if applicable
-\item \code{user_missing}: optional column, depending on value of \code{.user_missing_col},
-showing user missing values, with labels if applicable
+\item \code{user_missing}: optional column showing user missing values, with labels
+if applicable. By default, this column is included only if user missings
+are specified for at least one variable. This behavior can be changed using
+the \code{user_missing_col} argument to \code{cb_create_options()}.
\item \code{missing}: proportion missing
\item additional columns if specified in \code{...}
}
-\item Attributes:
-\itemize{
-\item Transformed versions of the passed dataset. See \code{\link[=cb_get_data]{cb_get_data()}}.
-\item Lookup tables and other metadata used internally.
-}
-}
}
\description{
\code{cb_create_redcap()} builds an object of class \code{"li_codebook"} from a dataset and
@@ -131,24 +71,23 @@ and metadata, including:
\item Unpacking, labelling, and optional missing propagation for checkbox data
\item Optional coercion for character variables marked as "integer" in \code{metedata$text_validation_type_or_show_slider_number}
}
+
+All of these behaviors can be controlled using the \code{.options} argument.
}
\section{Checkbox data handling}{
\subsection{Value labels}{
Data from REDCap checkboxes yields one variable in the dataset for each response
-option. These will be labelled generically with \code{"Yes"} or \code{"No"}, unless \code{.checkbox_resp_values}
-is \code{TRUE}, in which case response-specific labels from \code{metadata} will be used.
-For example, if a checkbox group has options "In the past year," "More than a
+option. By default, these will be labelled generically with \code{"Yes"} or \code{"No"}.
+For example, consider a checkbox group with options "In the past year," "More than a
year ago," and "Never," corresponding to variables \code{chk_var1___0}, \code{chk_var1___1},
-and \code{chk_var1___2}: if \code{.checkbox_resp_values} is \code{FALSE}, all of these will
-have values:
+and \code{chk_var1___2}. By default, all of these will be given the same value labels:
\itemize{
\item \code{chk_var1___0}, \code{chk_var1___1}, \code{chk_var1___2}: 0 = "No"; 1 = "Yes".
-}
-
-If \code{.checkbox_resp_values} is \code{TRUE}, each variable will have unique labels:
-\itemize{
+This behavior can be changed by setting \code{checkbox_resp_values = TRUE} in \code{cb_create_options()}.
+In this case, response-specific labels from \code{metadata} will be used, so that
+each variable will have unique labels:
\item \code{chk_var1___0}: 0 = "Not selected," 1 = "In the past year"
\item \code{chk_var1___1}: 0 = "Not selected," 1 = "More than a year ago"
\item \code{chk_var1___2}: 0 = "Not selected," 0 = "Never"
@@ -157,13 +96,14 @@ If \code{.checkbox_resp_values} is \code{TRUE}, each variable will have unique l
\subsection{Missing value propagation}{
-If \code{.propagate_checkbox_missings} is \code{TRUE}, missing values in a checkbox group
-variable will be propagated to all variables in the group. For example, given
-a checkbox group with options "Pregnant," "Not pregnant," and "Not applicable,"
-corresponding to variables \code{chk_preg_0___0}, \code{chk_preg_0___1}, and \code{chk_preg_0____9},
-and assuming that \code{-9} is specified as a user missing value. If \code{.propagate_checkbox_missings}
-is \code{TRUE}, \code{chk_preg_0___0} and \code{chk_preg_0___1} will be set to \code{-9} if \code{chk_preg_0____9}
-is \code{1}. Otherwise, these columns will remain as \code{0} where \code{chk_preg_0____9} is \code{1}.
+By default, missing values in a checkbox group will be propagated to all variables
+in the group. For example, consider a checkbox group with options "Pregnant,"
+"Not pregnant," and "Not applicable," corresponding to variables \code{chk_preg_0___0},
+\code{chk_preg_0___1}, and \code{chk_preg_0____9}, and assume that \code{-9} is specified
+as a user missing value. By default, \code{chk_preg_0___0} and \code{chk_preg_0___1} will
+be set to \code{-9} if \code{chk_preg_0____9} is \code{1}. This behavior can be overridden by
+setting \code{propagate_checkbox_missings = FALSE} in \code{cb_create_options()}, in which
+case no values will be changed.
}
}
diff --git a/man/cb_create_spss.Rd b/man/cb_create_spss.Rd
index b67dc25..f5ab4c8 100644
--- a/man/cb_create_spss.Rd
+++ b/man/cb_create_spss.Rd
@@ -8,13 +8,7 @@ cb_create_spss(
data,
.user_missing = NULL,
.split_var_labels = NULL,
- .include_types = !.include_r_classes,
- .include_r_classes = FALSE,
- .rmv_html = TRUE,
- .rmv_line_breaks = TRUE,
- .user_missing_col = c("if_any", "yes", "no"),
- .user_missing_conflict = c("val_label", "missing_label"),
- .user_missing_incompatible = c("ignore", "warn", "error")
+ .options = cb_create_options()
)
}
\arguments{
@@ -31,51 +25,27 @@ See "Specifying user missing values" in \code{\link[=cb_create]{cb_create()}} do
expressions, indicating (sets of) variable labels with a common stem that should
be extracted into a separate column.}
-\item{.include_types}{Include a column listing simplified type for each variable?
-(e.g,. \code{"categorical"}, \code{"date-time"}.)}
-
-\item{.include_r_classes}{Include a column listing class(es) of each variable?
-(e.g., \code{"factor"}, \code{"POSIXct, POSIXt"}.)}
-
-\item{.rmv_html}{Should HTML tags be removed from variable and value labels?}
-
-\item{.rmv_line_breaks}{Should line breaks be removed from variable and value
-labels? If \code{TRUE}, line breaks will be replaced with \code{" / "}.}
-
-\item{.user_missing_col}{Include value labels for user missing values in a separate
-column? The default, \code{"if_any"}, adds the column only if user missings are
-specified for at least one variable.}
-
-\item{.user_missing_conflict}{If labels passed to \code{.user_missing} conflicts with
-a value label in \code{data}, which should be used?}
-
-\item{.user_missing_incompatible}{How to handle variables specified in \code{.user_missing}
-that aren't compatible with user missing values (e.g., logical, Date, or POSIXt)?}
+\item{.options}{Additional options to use for codebook creation. Must be the result
+from a call to \code{cb_create_options()}. See that function's help page for available
+options.}
}
\value{
-An \code{"li_codebook"} object, consisting of (1) a tibble summarizing the passed
-dataset and (2) attributes containing the passed dataset (in several formats)
-and additional metadata. Specifically:
-\itemize{
-\item A tibble with columns:
+An \code{"li_codebook"} object, consisting of a tibble summarizing the passed
+dataset and attributes containing additional metadata. The tibble includes columns:
\itemize{
\item \code{name}: variable name
-\item \code{type}: optional column containing simplified variable type
+\item \code{type}: column containing simplified variable type
\item \code{class}: optional column containing class(es) of each variable
\item \code{label_stem}: optional column containing variable label stems, if any variables
are specified in \code{.split_var_labels}
\item \code{label}: variable label
\item \code{values}: values, with labels if applicable
-\item \code{user_missing}: optional column, depending on value of \code{.user_missing_col},
-showing user missing values, with labels if applicable
+\item \code{user_missing}: optional column showing user missing values, with labels
+if applicable. By default, this column is included only if user missings
+are specified for at least one variable. This behavior can be changed using
+the \code{user_missing_col} argument to \code{cb_create_options()}.
\item \code{missing}: proportion missing
}
-\item Attributes:
-\itemize{
-\item Transformed versions of the passed dataset. See \code{\link[=cb_get_data]{cb_get_data()}}.
-\item Lookup tables and other metadata used internally.
-}
-}
}
\description{
\code{cb_create_spss()} builds an object of class \code{"li_codebook"} from an imported
diff --git a/man/cb_get_data.Rd b/man/cb_get_data.Rd
index 6cab0a9..d876368 100644
--- a/man/cb_get_data.Rd
+++ b/man/cb_get_data.Rd
@@ -4,26 +4,27 @@
\alias{cb_get_data}
\title{Extract data from a codebook object}
\usage{
-cb_get_data(cb, format = c("factors", "haven", "values"))
+cb_get_data(cb, format = c("factors", "haven"))
}
\arguments{
\item{cb}{An object of class \code{"li_codebook"} as produced by \code{\link[=cb_create]{cb_create()}} or
a variant.}
-\item{format}{Format of the returned data; see below for details.}
+\item{format}{Format of the returned data, either \code{"factors"} or \code{"haven"};
+see below for details.}
}
\value{
A tibble with variables formatted based on the \code{format} argument.
\itemize{
-\item For \code{format = "values"}, all variables retain the same values as the original
-dataset, including values for user missings. The data may reflect transformations
-made by variants of \code{\link[=cb_create]{cb_create()}} -- e.g., for \code{\link[=cb_create_redcap]{cb_create_redcap()}}, integer coercion
-and propagation of user missings across checkbox variables.
-\item For \code{"haven"}, value labels and user missings are encoded using class
-\code{\link[haven:labelled]{"haven_labelled"}}`
\item For \code{"factors"}, all variables with value labels are converted to factors,
and all user missings are converted to \code{NA}.
+\item For \code{"haven"}, variable labels, value labels, and user missings are encoded
+using class \code{\link[haven:labelled]{"haven_labelled_spss"}}.
}
+
+Both formats may also reflect transformations made by variants of \code{\link[=cb_create]{cb_create()}}.
+In particular, for codebooks created using \code{\link[=cb_create_redcap]{cb_create_redcap()}}, these include integer coercion
+and propagation of user missings across checkbox variables.
}
\description{
Codebook objects created by \code{\link[=cb_create]{cb_create()}} and friends contain several transformed
diff --git a/man/cb_write.Rd b/man/cb_write.Rd
index 0018f5b..1699465 100644
--- a/man/cb_write.Rd
+++ b/man/cb_write.Rd
@@ -8,13 +8,15 @@ cb_write(
cb,
file,
dataset_name = NULL,
- incl_date = TRUE,
- incl_dims = TRUE,
- hyperlinks = TRUE,
group_by = NULL,
- group_rows_numeric = NULL,
+ group_rows = NULL,
+ group_rows_numeric = group_rows,
+ group_rows_categorical = group_rows,
detail_missing = c("if_any_user_missing", "yes", "no"),
n_text_vals = 5,
+ incl_date = TRUE,
+ incl_dims = TRUE,
+ hyperlinks = TRUE,
overwrite = TRUE
)
}
@@ -26,21 +28,20 @@ a variant.}
\item{dataset_name}{Name of the dataset to display in workbook headers.}
-\item{incl_date, incl_dims}{Should the date and/or dataset dimensions be included
-in the Overview tab header?}
-
-\item{hyperlinks}{If \code{TRUE}, variable names on the Overview sheet will link
-to corresponding rows on summary tabs and vice versa.}
-
\item{group_by}{<\code{\link[dplyr:dplyr_tidy_select]{tidy-select}}> Column or columns to group
by. If specified, additional numeric and categorical summary tabs will be included
-with grouped summaries. Subgroups are shown in columns by default. For the numeric
-summary tab, subgroups for some or all grouping variables can instead be shown
-in rows if specified in \code{group_rows_numeric}.}
+with grouped summaries. Subgroups are shown in columns by default. Some or all
+grouping variables can instead be shown in rows if specified in \code{group_rows},
+\code{group_rows_numeric}, or \code{group_rows_categorical}.}
+
+\item{group_rows}{<\code{\link[dplyr:dplyr_tidy_select]{tidy-select}}> Column or columns to group
+by in rows on grouped summary tabs. All columns must also be specified in \code{group_by}.
+Will apply to both numeric and categorical summary tabs unless otherwise specified
+in \code{group_rows_numeric} or \code{group_rows_categorical}.}
-\item{group_rows_numeric}{<\code{\link[dplyr:dplyr_tidy_select]{tidy-select}}> Column or columns
-to group by in rows on the grouped numeric summary tab. All columns must also
-be specified in \code{group_by}.}
+\item{group_rows_numeric, group_rows_categorical}{<\code{\link[dplyr:dplyr_tidy_select]{tidy-select}}>
+Column or columns to group by in rows on the grouped numeric or categorical summary
+tab, respectively.
\item{detail_missing}{Include detailed missing value information on ungrouped
categorical and text summary tabs? (Detailed missing information for grouped
@@ -50,6 +51,12 @@ summary tabs is not currently supported.)}
should be included for each variable? If there are more than \code{n_text_vals} + 1
unique values, the \code{n_text_vals} most common non-missing values will be included.}
+\item{incl_date, incl_dims}{Should the date and/or dataset dimensions be included
+in the Overview tab header?}
+
+\item{hyperlinks}{If \code{TRUE}, variable names on the Overview sheet will link
+to corresponding rows on summary tabs and vice versa.}
+
\item{overwrite}{Overwrite existing file?}
}
\value{
diff --git a/man/figures/README-categorical.png b/man/figures/README-categorical.png
new file mode 100644
index 0000000..3766ac3
Binary files /dev/null and b/man/figures/README-categorical.png differ
diff --git a/man/figures/README-numeric.png b/man/figures/README-numeric.png
new file mode 100644
index 0000000..4599abe
Binary files /dev/null and b/man/figures/README-numeric.png differ
diff --git a/man/figures/README-overview.png b/man/figures/README-overview.png
new file mode 100644
index 0000000..dbad3da
Binary files /dev/null and b/man/figures/README-overview.png differ
diff --git a/man/figures/README-text.png b/man/figures/README-text.png
new file mode 100644
index 0000000..ccb966a
Binary files /dev/null and b/man/figures/README-text.png differ
diff --git a/vignettes/.gitignore b/vignettes/.gitignore
new file mode 100644
index 0000000..097b241
--- /dev/null
+++ b/vignettes/.gitignore
@@ -0,0 +1,2 @@
+*.html
+*.R
diff --git a/vignettes/lighthouse-codebook.Rmd b/vignettes/lighthouse-codebook.Rmd
new file mode 100644
index 0000000..678a4f8
--- /dev/null
+++ b/vignettes/lighthouse-codebook.Rmd
@@ -0,0 +1,406 @@
+---
+title: "Introduction to lighthouse.codebook"
+output: rmarkdown::html_vignette
+vignette: >
+ %\VignetteIndexEntry{Introduction to lighthouse.codebook}
+ %\VignetteEngine{knitr::rmarkdown}
+ %\VignetteEncoding{UTF-8}
+---
+
+```{r, include = FALSE}
+knitr::opts_chunk$set(
+ eval = FALSE,
+ comment = "#",
+ out.width = "100%"
+)
+```
+
+## Creating codebooks
+
+Creating a codebook involves two general steps:
+
+1. Create a “codebook” object in R from a data frame (and, optionally, metadata),
+ using `cb_create()` or a specialized variant (such as `cb_create_spss()` or `cb_create_redcap()`).
+
+2. Write the codebook to disk using `cb_write()`.
+
+```r
+library(lighthouse.codebook)
+
+# create and write a codebook without metadata
+dat |>
+ cb_create() |>
+ cb_write("cb.xlsx")
+
+# with metadata
+dat |>
+ cb_create(metadata = dat1_metadata) |>
+ cb_write("cb.xlsx")
+
+# from SPSS data
+dat_spss <- haven::read_sav("dat_spss.sav", user_na = TRUE)
+
+dat_spss |>
+ cb_create_spss() |>
+ cb_write("cb_spss.xlsx")
+
+# from REDCap data
+dat_rc <- REDCapR::redcap_read(redcap_uri = rc_uri, token = rc_token)
+meta_rc <- REDCapR::redcap_metadata_read(redcap_uri = rc_uri, token = rc_token)
+
+dat_rc$data |>
+ cb_create_redcap(metadata = meta_rc$data) |>
+ cb_write("cb_rc.xlsx")
+```
+The codebook written to disk will include an overview tab listing all variables
+in the dataset; summary tabs for numeric, categorical, and text variables; and,
+if grouping variables are specified, grouped summary tabs for numeric and categorical
+variables.
+
+## Customizing codebooks
+
+There are many options for controlling how data is interpreted, summarized, and
+presented. This section shows a few of the most useful options. Further options
+are detailed in the documentation for `cb_create()` and `cb_write()`.
+
+### Grouped summaries
+
+Numeric and categorical data summaries can be grouped by one or more variables by
+specifying them in the `group_by` argument to `cb_write()`.
+```r
+cb_create(data, metadata) |>
+ cb_write("cb.xlsx", group_by = treatment_group)
+
+cb_create(data, metadata) |>
+ cb_write("cb.xlsx", group_by = c(treatment_group, timepoint, age_group))
+```
+
+By default, values for each subgroup are shown in separate columns, with decked
+heads if more than one grouping variable is specified. However, some or all grouping
+variables can instead be shown in rows using the `group_rows` argument.
+```r
+# show `treatment_group` in columns and `timepoint` in rows
+cb_create(data, metadata) |>
+ cb_write(
+ "cb.xlsx",
+ group_by = c(treatment_group, timepoint),
+ group_rows = timepoint
+ )
+```
+Different row grouping behavior can be specified for numeric versus categorical
+summary tabs using the `group_rows_numeric` and `group_rows_categorical` arguments.
+```r
+# for numeric summary, show `treatment_group` in columns and `timepoint` in rows;
+# for categorical summary, show all grouping variables in columns
+cb_create(data, metadata) |>
+ cb_write(
+ "cb.xlsx",
+ group_by = c(treatment_group, timepoint),
+ group_rows_numeric = timepoint
+ )
+
+# for numeric summary, show all grouping variables in rows;
+# for categorical summary, show `treatment_group` in rows
+cb_create(data, metadata) |>
+ cb_write(
+ "cb.xlsx",
+ group_by = c(treatment_group, timepoint),
+ group_rows_numeric = c(treatment_group, timepoint),
+ group_rows_categorical = treatment_group
+ )
+```
+
+### User missing values
+
+User missing values (also known as nonresponse codes, reserve codes, or special
+values) can be specified using the `.user_missing` argument to `cb_create()`.
+Missing values are specified using a formula or list of formulas, with variables
+on the left-hand side (as names or [tidyselect](https://dplyr.tidyverse.org/reference/dplyr_tidy_select.html)
+expressions) and values on the right-hand side. If the left-hand side is empty,
+user missings will be set for all compatible variables in the dataset.
+
+```r
+# set a single missing value for a single variable
+cb <- cb_create(data, metadata, .user_missing = var1 ~ 99)
+
+# for variables `var1` through `var5`
+cb <- cb_create(data, metadata, .user_missing = var1:var5 ~ c(98, 99))
+
+# for all numeric variables, plus `var6` and `var7`
+cb <- cb_create(
+ data,
+ metadata,
+ .user_missing = c(where(is.numeric), var6, var7) ~ c(-9, -8, -7)
+)
+
+# for all compatible variables in dataset
+cb <- cb_create(data, metadata, .user_missing = ~ c(98, 99))
+```
+
+If the user missing values are named, the names will be treated as value labels
+in data summaries.
+
+```r
+cb <- cb_create(
+ data,
+ metadata,
+ .user_missing = var1:var5 ~ c("Declined" = 98, "Not applicable" = 99)
+)
+```
+
+To apply different user missings for different variables, pass a list of formulas.
+```r
+cb <- cb_create(
+ data,
+ metadata,
+ .user_missing = list(
+ starts_with("status") ~ c("Declined" = 98, "Not applicable" = 99),
+ var7:var10 ~ c("Don't know" = -4, "Not applicable" = -5)
+ )
+)
+```
+### Missing value handling
+
+On numeric summary tabs, missing values (including both user missing values and `NA`)
+are dropped for computation of summary statistics.
+
+On ungrouped categorical and text summary tabs, by default, user missing values
+are individually tabulated. For example, if `.user_missing = ~ c("Declined" = 98,
+"Not applicable" = 99)`, then categorical and text summary tabs will include rows
+giving counts for `"[98] Declined"` and `"[99] Not applicable"`. All user missing
+values and `NA` can instead be collapsed into a single `"(Missing)"` row using the
+`detail_missing` argument to `cb_write()`.
+```r
+dat |>
+ cb_create(.user_missing = ~ c("Declined" = 98, "Not applicable" = 99)) |>
+ cb_write("cb.xlsx", detail_missing = "no")
+```
+Finally, user missing values are always collapsed (as though `detail_missing = "no"`)
+on _grouped_ summary tabs.
+
+### Splitting long variable labels
+
+Variable labels for sets of related variables sometimes share a common prefix. Using
+the `.split_var_labels` argument to `cb_create()`, this prefix can be extracted
+into a separate column, making it easier to see at a glance what is unique about
+each variable.
+
+For example, given a set of variable labels that all begin with `"What colors do
+you like? Select all that apply: "`:
+
+| Name | Label |
+| ---- | ----- |
+| age | How old are you today? |
+| colors1 | What colors do you like? Select all that apply: Red |
+| colors2 | What colors do you like? Select all that apply: Green |
+| colors3 | What colors do you like? Select all that apply: Blue |
+| colors4 | What colors do you like? Select all that apply: Orange |
+| height | What is your height in inches? |
+
+You can split the labels for these variables, specifying them using a [tidyselect](https://dplyr.tidyverse.org/reference/dplyr_tidy_select.html)
+expression:
+
+```r
+cb_create(
+ data,
+ metadata,
+ .split_var_labels = starts_with("colors")
+ ) |>
+ cb_write("cb.xlsx")
+```
+
+| Name | Label Stem | Label |
+| ---- | ---------- | ----- |
+| age | | How old are you today? |
+| colors1 | What colors do you like? Select all that apply: | Red |
+| colors2 | What colors do you like? Select all that apply: | Green |
+| colors3 | What colors do you like? Select all that apply: | Blue |
+| colors4 | What colors do you like? Select all that apply: | Orange |
+| height | | What is your height in inches? |
+
+Multiple sets of variables with common prefixes can be specified by passing a
+list of tidyselect expressions.
+
+```r
+cb_create(
+ data,
+ metadata,
+ .split_var_labels = list(
+ starts_with("colors"),
+ fav_food2:fav_food9,
+ c(rating1, rating4:rating7, rating9)
+ )
+ ) |>
+ cb_write("cb.xlsx")
+```
+
+## Variable typing
+Data summaries are produced for "numeric," "categorical," and "text" variables. For a given variable `x`,
+* `x` is treated as categorical if (1) it is a factor, ordered factor, or logical vector, _or_ (2) it has associated value labels other than missing value codes (specified in metadata or, for SPSS data, in a `"haven_labelled"` vector).
+* `x` is treated as numeric if (1) it is numeric (i.e., `is.numeric(x)` is `TRUE`) _and_ (2) it has no associated value labels other than missing value codes.
+* `x` is treated as text if (1) it is a character vector _and_ (2) it has no associated value labels other than missing value codes.
+
+Thus, you can change how a variable is summarized by changing its class. For
+instance, to get complete frequencies for a numeric or character variable, convert
+it to a factor; to get only the top frequencies for a factor with many levels, convert
+it to character.
+
+Variables of other classes, such as dates, datetimes, and lists, are not currently
+included on summary tabs. Summaries for dates and datetimes are planned for a future
+release.
+
+## Other uses for the codebook object
+The `"li_codebook"` object created by `cb_create()` will most commonly be
+used to write an Excel codebook to disk using `cb_write()`. However, it can also
+be used to create other objects in R.
+
+```r
+# example data
+q4_subset <- gain_q4 |>
+ subset(select = c(XPID, XOBS, XRA, B17, SU4a, SU4b, SU1f99v))
+
+# create codebook
+cb <- cb_create(
+ q4_subset,
+ metadata = q4_metadata,
+ .user_missing = ~ c("Not Asked" = -3,
+ "Missing" = -4,
+ "Confidential" = -6,
+ "Refused" = -7,
+ "Don't Know" = -8,
+ "Legitimate Skip" = -9)
+)
+
+cb
+# # A tibble: 7 × 6
+# name type label values user_missings missing
+#
+# 1 XPID text Participant ID NA [-9] Legitim… 0
+# 2 XOBS categorical Observation Wave [0] I… [-9] Legitim… 0
+# 3 XRA categorical Random assignment [0] C… [-9] Legitim… 0
+# 4 B17 categorical Pregnant [0] N… [-9] Legitim… 0.221
+# 5 SU4a numeric PPS - P90 days alcohol use NA [-9] Legitim… 0.394
+# 6 SU4b numeric PPS - P90 days drunk or 5+ d… NA [-9] Legitim… 0.442
+# 7 SU1f99v text QCS - P90 Days Other AOD Tx … NA [-9] Legitim… 0.923
+```
+### Extract transformed data
+Use `cb_get_data()` to extract transformed data in one of two formats. `format
+= "factors"` yields a dataset with all variables with value labels converted to
+factors and user missings converted to `NA`.
+```r
+cb_get_data(cb, format = "factors")
+# # A tibble: 104 × 7
+# XPID XOBS XRA B17 SU4a SU4b SU1f99v
+#
+# 1 001 Intake Treatment No NA NA NA
+# 2 002 Intake Control No 10 4 NA
+# 3 003 Intake Treatment No 10 1 Peer counselor
+# 4 003 3-month Treatment No 39 15 NA
+# 5 003 6-month Treatment No NA NA NA
+# 6 004 Intake Control No 10 2 NA
+# 7 004 3-month Control No 55 63 NA
+# 8 004 6-month Control No 10 1 NA
+# 9 005 Intake Control Yes 35 0 Social worker
+# 10 005 3-month Control Yes 55 39 NA
+# # ℹ 94 more rows
+```
+In contrast, `format = "haven"` yields a dataset with SPSS-style variable labels, value
+labels, and user missings encoded using the `"haven_labelled_spss"` class.
+```r
+cb_get_data(cb, format = "haven")
+# # A tibble: 104 × 7
+# XPID XOBS XRA B17 SU4a SU4b SU1f99v
+#
+# 1 001 0 [Intake] 1 [Treatment] 0 [No] -9 (NA) -9 (NA) -9 (NA)
+# 2 002 0 [Intake] 0 [Control] 0 [No] 10 4 -9 (NA)
+# 3 003 0 [Intake] 1 [Treatment] 0 [No] 10 1 Peer couns…
+# 4 003 1 [3-month] 1 [Treatment] 0 [No] 39 15 -4 (NA)
+# 5 003 2 [6-month] 1 [Treatment] 0 [No] -4 (NA) -4 (NA) -4 (NA)
+# 6 004 0 [Intake] 0 [Control] 0 [No] 10 2 -9 (NA)
+# 7 004 1 [3-month] 0 [Control] 0 [No] 55 63 -9 (NA)
+# 8 004 2 [6-month] 0 [Control] 0 [No] 10 1 -9 (NA)
+# 9 005 0 [Intake] 0 [Control] 1 [Yes] 35 0 Social wor…
+# 10 005 1 [3-month] 0 [Control] 1 [Yes] 55 39 -9 (NA)
+# # ℹ 94 more rows
+```
+### Get data summaries
+`cb_summarize_numeric()`, `cb_summarize_categorical()`, and `cb_summarize_text()`
+return summaries for all variables of their respective types. These are the basis
+of the summary tabs generated by `cb_write()`.
+```r
+cb_summarize_numeric(cb)
+# # A tibble: 2 × 8
+# name label valid_n valid_pct mean SD median MAD
+#
+# 1 SU4a PPS - P90 days alcohol use 63 0.606 23.4 21.2 20 25.2
+# 2 SU4b PPS - P90 days drunk or 5+ d… 58 0.558 8.09 13.6 2 2.97
+# # ℹ 5 more variables: min , max , range , skew , kurt
+
+cb_summarize_categorical(cb)
+# # A tibble: 9 × 7
+# name label is_missing value n pct_of_all pct_of_valid
+#
+# 1 XOBS Observation Wave FALSE [0] Intake 42 0.404 0.404
+# 2 XOBS Observation Wave FALSE [1] 3-month 34 0.327 0.327
+# 3 XOBS Observation Wave FALSE [2] 6-month 28 0.269 0.269
+# 4 XRA Random assignment FALSE [0] Control 50 0.481 0.481
+# 5 XRA Random assignment FALSE [1] Treatment 54 0.519 0.519
+# 6 B17 Pregnant FALSE [1] Yes 6 0.0577 0.0741
+# 7 B17 Pregnant FALSE [0] No 75 0.721 0.926
+# 8 B17 Pregnant TRUE [-9] Legitim… 22 0.212 NA
+# 9 B17 Pregnant TRUE [-4] Missing 1 0.00962 NA
+# # ℹ 1 more variable: pct_of_missing
+
+cb_summarize_text(cb)
+# # A tibble: 14 × 7
+# name label is_missing unique_n value n pct_of_all
+#
+# 1 XPID Participant ID FALSE 42 003 3 0.0288
+# 2 XPID Participant ID FALSE 42 004 3 0.0288
+# 3 XPID Participant ID FALSE 42 005 3 0.0288
+# 4 XPID Participant ID FALSE 42 006 3 0.0288
+# 5 XPID Participant ID FALSE 42 010 3 0.0288
+# 6 XPID Participant ID FALSE 42 (37 … 89 0.856
+# 7 SU1f99v QCS - P90 Days Other AOD … FALSE 8 AA 1 0.00962
+# 8 SU1f99v QCS - P90 Days Other AOD … FALSE 8 Alco… 1 0.00962
+# 9 SU1f99v QCS - P90 Days Other AOD … FALSE 8 Case… 1 0.00962
+# 10 SU1f99v QCS - P90 Days Other AOD … FALSE 8 Group 1 0.00962
+# 11 SU1f99v QCS - P90 Days Other AOD … FALSE 8 NA 1 0.00962
+# 12 SU1f99v QCS - P90 Days Other AOD … FALSE 8 (3 o… 3 0.0288
+# 13 SU1f99v QCS - P90 Days Other AOD … TRUE NA [-4]… 44 0.423
+# 14 SU1f99v QCS - P90 Days Other AOD … TRUE NA [-9]… 52 0.5
+# # ℹ 2 more variables: pct_of_valid , pct_of_missing
+```
+`cb_summarize_numeric()` and `cb_summarize_categorical()` can also return grouped
+summaries:
+```r
+cb_summarize_numeric(cb, group_by = XOBS)
+# # A tibble: 6 × 8
+# XOBS name label valid_n valid_pct mean SD median
+#
+# 1 Intake SU4a PPS - P90 days alcohol use 30 0.714 28.8 20.1 26.5
+# 2 3-month SU4a PPS - P90 days alcohol use 20 0.588 21.4 21.0 12.5
+# 3 6-month SU4a PPS - P90 days alcohol use 13 0.464 13.9 21.5 6
+# 4 Intake SU4b PPS - P90 days drunk or 5… 30 0.714 7.13 10.7 4
+# 5 3-month SU4b PPS - P90 days drunk or 5… 15 0.441 16.1 20.0 3
+# 6 6-month SU4b PPS - P90 days drunk or 5… 13 0.464 1 0.707 1
+# # ℹ 6 more variables: MAD , min , max , range , skew ,
+# # kurt
+
+cb_summarize_categorical(cb, group_by = XRA)
+# # A tibble: 12 × 7
+# XRA name label value n pct_of_all pct_of_valid
+#
+# 1 Control XOBS Observation Wave [0] Intake 20 0.4 0.4
+# 2 Control XOBS Observation Wave [1] 3-month 16 0.32 0.32
+# 3 Control XOBS Observation Wave [2] 6-month 14 0.28 0.28
+# 4 Treatment XOBS Observation Wave [0] Intake 22 0.407 0.407
+# 5 Treatment XOBS Observation Wave [1] 3-month 18 0.333 0.333
+# 6 Treatment XOBS Observation Wave [2] 6-month 14 0.259 0.259
+# 7 Control B17 Pregnant [1] Yes 3 0.06 0.0811
+# 8 Control B17 Pregnant [0] No 34 0.68 0.919
+# 9 Control B17 Pregnant (Missing) 13 0.26 NA
+# 10 Treatment B17 Pregnant [1] Yes 3 0.0556 0.0682
+# 11 Treatment B17 Pregnant [0] No 41 0.759 0.932
+# 12 Treatment B17 Pregnant (Missing) 10 0.185 NA
+```