diff --git a/.Rbuildignore b/.Rbuildignore index a8fb882..8721688 100644 --- a/.Rbuildignore +++ b/.Rbuildignore @@ -5,3 +5,6 @@ ^docs$ ^pkgdown$ ^\.github$ +^README\.Rmd$ +^doc$ +^Meta$ diff --git a/.gitignore b/.gitignore index 457525e..54f27cb 100644 --- a/.gitignore +++ b/.gitignore @@ -5,3 +5,6 @@ .DS_Store .quarto docs +inst/doc +/doc/ +/Meta/ diff --git a/DESCRIPTION b/DESCRIPTION index 45a37b1..d99ccdd 100644 --- a/DESCRIPTION +++ b/DESCRIPTION @@ -1,6 +1,6 @@ Package: lighthouse.codebook Title: Summarize Datasets for Lighthouse Institute Projects -Version: 0.2.3 +Version: 0.3.0 Authors@R: c( person("Casey", "Sarapas", email = "ccsarapas@chestnut.org", @@ -35,3 +35,7 @@ Roxygen: list(markdown = TRUE) RoxygenNote: 7.3.3 URL: https://github.com/ccsarapas/lighthouse.codebook, https://ccsarapas.github.io/lighthouse.codebook/ BugReports: https://github.com/ccsarapas/lighthouse.codebook/issues +Suggests: + knitr, + rmarkdown +VignetteBuilder: knitr diff --git a/NAMESPACE b/NAMESPACE index 7a19b66..af707b1 100644 --- a/NAMESPACE +++ b/NAMESPACE @@ -4,7 +4,9 @@ S3method(nan_to_na,data.frame) S3method(nan_to_na,default) S3method(nan_to_na,list) export(cb_create) +export(cb_create_options) export(cb_create_redcap) +export(cb_create_redcap_options) export(cb_create_spss) export(cb_get_data) export(cb_summarize_categorical) diff --git a/NEWS.md b/NEWS.md index d928098..5e0ed4d 100644 --- a/NEWS.md +++ b/NEWS.md @@ -1,3 +1,63 @@ +# lighthouse.codebook 0.3.0 + +## Added + +* Added options to `cb_write()` to show grouping variables for categorical summaries + in rows (which was previously only possible for numeric summaries.) `cb_write()` + now includes three arguments for showing some or all grouping variables in rows: + `group_rows` controls both numeric and categorical summaries, while `group_rows_numeric` + and `group_rows_categorical` control numeric and categorical summaries, respectively. + +* Added an introductory vignette (see `vignette("lighthouse-codebook")`). 
+ +* Expanded the README. + +## Changed + +* `cb_create()`, `cb_create_spss()`, and `cb_create_redcap()` now use a single `.options` + argument for less commonly used settings. Arguments for those settings have been + moved into an options object created with `cb_create_options()` or `cb_create_redcap_options()`. + ```r + # previously + cb <- cb_create( + dat, metadata = metadata, + .rmv_html = FALSE, .include_r_classes = TRUE + ) + + # now + cb <- cb_create( + dat, metadata = metadata, + .options = cb_create_options(rmv_html = FALSE, include_r_classes = TRUE) + ) + + ### `cb_create_spss()` also uses `cb_create_options()` + # previously + cb_spss <- cb_create_spss(dat_spss, .rmv_line_breaks = FALSE) + + # now + cb_spss <- cb_create_spss( + dat_spss, + .options = cb_create_options(rmv_line_breaks = FALSE) + ) + + ### Note that `cb_create_redcap()` has its own options constructor + # previously + cb <- cb_create_redcap( + dat_rc, metadata = metadata_rc, + .form = NULL, .rmv_html = FALSE + ) + + # now + cb <- cb_create_redcap( + dat_rc, metadata = metadata_rc, + .options = cb_create_redcap_options(form = NULL, rmv_html = FALSE) + ) + ``` + +## Removed + +* The `format = "values"` option in `cb_get_data()` has been removed (see #26). + # lighthouse.codebook 0.2.3 * `cb_create_spss()` now accepts `.rmv_html` and `.rmv_line_breaks` arguments, consistent diff --git a/R/cb_create.r b/R/cb_create.r index db26640..7e0fdab 100644 --- a/R/cb_create.r +++ b/R/cb_create.r @@ -8,8 +8,8 @@ #' [`cb_summarize_text()`]). #' #' @param data A data frame. -#' @param metadata A data frame containing metadata, such as variable labels and value -#' labels. +#' @param metadata A data frame containing metadata, such as variable labels and +#' value labels. #' @param ... Additional columns from `metadata` to preserve in the final codebook. #' New names can be assigned by passing named arguments. Columns for variable #' name, form, variable label, and value labels are included by default. 
@@ -23,47 +23,33 @@ #' right-hand side. If left-hand side is omitted, defaults to `tidyselect::everything()`. #' See "Specifying user missing values" below for examples. #' @param .split_var_labels A [`tidyselect`][dplyr_tidy_select] expression or list of tidyselect -#' expressions, indicating (sets of) variable labels with a common stem that should +#' expressions, indicating (sets of) variable labels with a common stem that should #' be extracted into a separate column. -#' @param .include_r_classes Include a column listing class(es) of each variable? -#' (e.g., `"factor"`, `"POSIXct, POSIXt"`.) -#' @param .include_types Include a column listing simplified type for each variable? -#' (e.g,. `"categorical"`, `"date-time"`.) #' @param .val_labs_sep1,.val_labs_sep2 Regex patterns separating value labels -#' in `metadata`. `.val_labs_sep1` separates values from labels, and `.val_labs_sep2` -#' separates value/label pairs. e.g., if value labels are in format `"1, First label|2, Second label"`, -#' set `.val_labs_sep1` to `","` and `.val_labs_sep2` to `"\\|"`. -#' @param .rmv_html Should HTML tags be removed from metadata (e.g., from variable -#' and value labels)? -#' @param .rmv_line_breaks Should line breaks be removed from metadata (e.g., from -#' variable and value labels)? If `TRUE`, line breaks will be replaced with `" / "`. -#' @param .user_missing_col Include value labels for user missing values in a separate -#' column? The default, `"if_any"`, adds the column only if user missings are -#' specified for at least one variable. -#' @param .user_missing_conflict If different labels for a value are provided in -#' metadata and user missings, which should be used? -#' @param .user_missing_incompatible How to handle variables specified in `.user_missing` -#' that aren't compatible with user missing values (e.g., logical, Date, or POSIXt)? -#' +#' in `metadata`. 
`.val_labs_sep1` separates values from labels, and `.val_labs_sep2` +#' separates value/label pairs from one another. e.g., if value labels are in +#' the format `"1, First label|2, Second label"`, set `.val_labs_sep1` to `","` +#' and `.val_labs_sep2` to `"\\|"`. +#' @param .options Additional options to use for codebook creation. Must be the result +#' from a call to `cb_create_options()`. See that function's help page for available +#' options. +#' #' @return -#' An `"li_codebook"` object, consisting of (1) a tibble summarizing the passed -#' dataset and (2) attributes containing the passed dataset (in several formats) -#' and additional metadata. Specifically: -#' - A tibble with columns: -#' - `name`: variable name -#' - `type`: optional column containing simplified variable type -#' - `class`: optional column containing class(es) of each variable -#' - `label_stem`: optional column containing variable label stems, if any variables -#' are specified in `.split_var_labels` -#' - `label`: variable label -#' - `values`: values, with labels if applicable -#' - `user_missing`: optional column, depending on value of `.user_missing_col`, -#' showing user missing values, with labels if applicable -#' - `missing`: proportion missing -#' - additional columns if specified in `...` -#' - Attributes: -#' - Transformed versions of the passed dataset. See [`cb_get_data()`] -#' - Lookup tables and other metadata used internally. +#' An `"li_codebook"` object, consisting of a tibble summarizing the passed +#' dataset and attributes containing additional metadata. 
The tibble includes columns: +#' - `name`: variable name +#' - `type`: column containing simplified variable type +#' - `class`: optional column containing class(es) of each variable +#' - `label_stem`: optional column containing variable label stems, if any variables +#' are specified in `.split_var_labels` +#' - `label`: variable label +#' - `values`: values, with labels if applicable +#' - `user_missing`: optional column showing user missing values, with labels +#' if applicable. By default, this column is included only if user missings +#' are specified for at least one variable. This behavior can be changed using +#' the `user_missing_col` argument to `cb_create_options()`. +#' - `missing`: proportion missing +#' - additional columns if specified in `...` #' #' @section Specifying user missing values: #' User missing values are defined by passing a formula or list of formulas to the @@ -72,7 +58,7 @@ #' \preformatted{ #' cb <- cb_create(data, metadata, .user_missing = var1 ~ 99) #' } -#' The same user missings can be applied to multiple variables using [tidyselect][dplyr_tidy_select] +#' The same user missings can be applied to multiple variables using [tidyselect][dplyr_tidy_select] #' expressions. #' \preformatted{ #' # for variables `var1` through `var5` @@ -80,7 +66,7 @@ #' #' # for all numeric variables, plus `var6` and `var7` #' .user_missing = c(where(is.numeric), var6, var7) ~ c(-9, -8, -7) -#' +#' #' # omitted left-hand side defaults to `tidyselect::everything()` #' .user_missing = ~ -99 #' } @@ -96,14 +82,16 @@ #' \preformatted{ #' .user_missing = ~ c(Declined = -98, "Not applicable" = -99) #' } -#' If labels set in `.user_missing` conflict with those in `metadata`, `.user_missing_conflict` -#' controls which labels are used. -#' -#' User missing values are not compatible with logical, date, or datetime (POSIXt) -#' variables. By default, these variables will be ignored if specified in `.user_missing`. 
-#' (i.e., user missing values will be applied only to compatible variables.) This behavior -#' can be changed using the `.user_missing_incompatible` argument. -#' +#' If labels set in `.user_missing` conflict with those in `metadata`, the `user_missing_conflict` +#' argument to `cb_create_options()` controls which labels are used. +#' +#' User missings may be set for numeric, character, factor/ordered factor, and haven_labelled/haven_labelled_spss +#' vectors. For factors, user missings are set based on factor labels (not the underlying +#' integer codes). For `"haven_labelled"` vectors, user missings are set based on +#' values (not value labels). By default, variables with incompatible classes (e.g., +#' logical, Date, POSIXt) will be ignored if specified in `.user_missing`. This +#' behavior can be changed using the `user_missing_incompatible` argument to `cb_create_options()`. +#' #' @examples #' diamonds2 <- ggplot2::diamonds |> #' transform( @@ -115,18 +103,18 @@ #' right = FALSE #' )) #' ) -#' +#' #' # basic codebook #' cb_create(diamonds2) -#' +#' #' # convert variables to factor to treat as categorical #' diamonds2 |> #' transform( #' carat_group = factor(carat_group), #' price_group = factor(price_group) -#' ) |> +#' ) |> #' cb_create() -#' +#' #' # provide metadata for variable and value labels #' diamonds_meta <- data.frame( #' name = names(diamonds2), @@ -151,7 +139,7 @@ #' "1 = <$500; 2 = $500-$999; 3 = $1,000-$1,999; 4 = $2,000-$4,999; 5 = $5,000-$9,999; 6 = $10,000+" #' ) #' ) -#' +#' #' cb_create( #' diamonds2, diamonds_meta, #' .val_labs_sep1 = " = ", .val_labs_sep2 = "; " @@ -165,40 +153,80 @@ cb_create <- function(data, .val_labels = val_labels, .user_missing = NULL, .split_var_labels = NULL, - .include_types = !.include_r_classes, - .include_r_classes = FALSE, .val_labs_sep1 = NULL, .val_labs_sep2 = NULL, - .rmv_html = TRUE, - .rmv_line_breaks = TRUE, - .user_missing_col = c("if_any", "yes", "no"), - .user_missing_conflict = c("metadata",
"missing_label"), - .user_missing_incompatible = c("ignore", "warn", "error") - ) { + .options = cb_create_options()) { + check_options(.options) data |> cb_init( metadata, meta_var_name = {{ .name }}, meta_var_label = {{ .var_label }}, meta_val_labels = {{ .val_labels }}, ... ) |> - cb_clean_fields(rmv_html = .rmv_html, rmv_line_breaks = .rmv_line_breaks) |> + cb_clean_fields( + rmv_html = .options$rmv_html, + rmv_line_breaks = .options$rmv_line_breaks + ) |> cb_user_missings( user_missing = .user_missing, - incompatible = .user_missing_incompatible + incompatible = .options$user_missing_incompatible ) |> cb_add_lookups(sep1 = .val_labs_sep1, sep2 = .val_labs_sep2) |> - cb_label_data(conflict = .user_missing_conflict) |> + cb_label_data(conflict = .options$user_missing_conflict) |> cb_zap_data() |> cb_add_dims() |> - cb_add_val_labels_col(user_missing_col = .user_missing_col) |> + cb_add_val_labels_col(user_missing_col = .options$user_missing_col) |> cb_add_type_col( - include_r_classes = .include_r_classes, - include_types = .include_types + include_r_classes = .options$include_r_classes, + include_types = .options$include_types ) |> cb_add_missing_col() |> cb_split_labels_col(split_var_labels = rlang::enexpr(.split_var_labels)) } +#' Additional options for codebook creation +#' +#' @description +#' Additional options for use by `cb_create()`. +#' +#' @inheritParams rlang::args_dots_empty +#' @param include_types Include a column listing simplified type for each variable? +#' (e.g,. `"categorical"`, `"date-time"`.) +#' @param include_r_classes Include a column listing class(es) of each variable? +#' (e.g., `"factor"`, `"POSIXct, POSIXt"`.) +#' @param rmv_html Should HTML tags be removed from metadata (e.g., from variable +#' and value labels)? +#' @param rmv_line_breaks Should line breaks be removed from metadata (e.g., from +#' variable and value labels)? If `TRUE`, line breaks will be replaced with `" / "`. 
+#' @param user_missing_col Include value labels for user missing values in a separate +#' column? The default, `"if_any"`, adds the column only if user missings are +#' specified for at least one variable. +#' @param user_missing_conflict If labels passed to `.user_missing` conflict with +#' value labels in metadata, which should be used? +#' @param user_missing_incompatible How to handle variables specified in `.user_missing` +#' that aren't compatible with user missing values (e.g., logical, Date, or POSIXt)? +#' +#' @export +cb_create_options <- function( + ..., + include_types = TRUE, + include_r_classes = FALSE, + rmv_html = TRUE, + rmv_line_breaks = TRUE, + user_missing_col = c("if_any", "yes", "no"), + user_missing_conflict = c("val_label", "missing_label"), + user_missing_incompatible = c("ignore", "warn", "error")) { + rlang::check_dots_empty() + out <- list( + include_types = include_types, include_r_classes = include_r_classes, + rmv_html = rmv_html, rmv_line_breaks = rmv_line_breaks, + user_missing_col = user_missing_col, + user_missing_conflict = user_missing_conflict, + user_missing_incompatible = user_missing_incompatible + ) + structure(out, class = "cb_create_options") +} + #' Extract data from a codebook object #' #' Codebook objects created by [`cb_create()`] and friends contain several transformed @@ -206,27 +234,37 @@ cb_create <- function(data, #' #' @param cb An object of class `"li_codebook"` as produced by [`cb_create()`] or #' a variant. -#' @param format Format of the returned data; see below for details. +#' @param format Format of the returned data, either `"factors"` or `"haven"`; +#' see below for details. #' #' @return #' A tibble with variables formatted based on the `format` argument. -#' - For `format = "values"`, all variables retain the same values as the original -#' dataset, including values for user missings. 
The data may reflect transformations -#' made by variants of [`cb_create()`] -- e.g., for [`cb_create_redcap()`], integer coercion -#' and propagation of user missings across checkbox variables. -#' - For `"haven"`, value labels and user missings are encoded using class -#' [`"haven_labelled"`][haven::labelled]` #' - For `"factors"`, all variables with value labels are converted to factors, #' and all user missings are converted to `NA`. +#' - For `"haven"`, variable labels, value labels, and user missings are encoded +#' using class [`"haven_labelled_spss"`][haven::labelled]. +#' +#' Both formats may also reflect transformations made by variants of [`cb_create()`]. +#' In particular, codebooks created using [`cb_create_redcap()`] reflect integer coercion +#' and propagation of user missings across checkbox variables. #' #' @export -cb_get_data <- function(cb, format = c("factors", "haven", "values")) { +cb_get_data <- function(cb, format = c("factors", "haven")) { check_codebook(cb) - switch(match.arg(format), - factors = attr(cb, "data_zapped"), - haven = attr(cb, "data_labelled"), - values = attr(cb, "data") + tryCatch( + format <- match.arg(format), + error = \(e) { + if (format == "values") { + cli::cli_abort( + '`format = "values"` is no longer supported.', + call = parent.frame(4) + ) + } + stop(e) + } ) + if (format == "factors") attr(cb, "data_zapped") + else attr(cb, "data_labelled") } cb_init <- function(data, @@ -254,8 +292,8 @@ cb_init <- function(data, out <- out |> dplyr::mutate(values = NA_character_) } + out <- structure(out, class = c("li_codebook", class(out))) out <- set_attrs(out, data = data) - class(out) <- c("li_codebook", class(out)) out } @@ -293,7 +331,7 @@ cb_user_missings_by_var <- function(cb, ) if (length(bad_vars) > 4) bad_vars <- c(head(bad_vars, 3), "...") bad_vars <- paste(bad_vars, collapse = ", ") - msg <- "{n_bad} variable{?s} specified in {.arg .user_missing} are not compatible with user missing values" + msg <- "{n_bad}
variable{?s} specified in {.arg .user_missing} {?is/are} not compatible with user missing values" if (incompatible == "error") cli::cli_abort(c("!" = msg, "*" = bad_vars)) cli::cli_warn(c("!" = paste0(msg, " and will be ignored"), "*" = bad_vars)) } @@ -426,7 +464,7 @@ cb_add_lookups <- function(cb, sep1, sep2) { reconcile_missing_labels <- function(val_labs, missings, - conflict = c("metadata", "missing_label")) { + conflict = c("val_label", "missing_label")) { conflict <- match.arg(conflict) labs_in_missing <- val_labs[match(missings, val_labs)] @@ -447,7 +485,7 @@ reconcile_missing_labels <- function(val_labs, names(missings)[label_miss] <- lab_name[label_miss] ### if na is labelled and in vals and labels don't match # relabel based on `conflict` - if (conflict == "metadata") { + if (conflict == "val_label") { names(missings)[mismatch] <- lab_name[mismatch] } else if (conflict == "missing_label") { names(val_labs)[match(lab_val[mismatch], val_labs)] <- miss_name[mismatch] @@ -455,8 +493,7 @@ reconcile_missing_labels <- function(val_labs, list(val_labs = val_labs, missings = missings) } -cb_label_data <- function(cb, conflict = c("metadata", "missing_label")) { - conflict <- match.arg(conflict) +cb_label_data <- function(cb, conflict = c("val_label", "missing_label")) { data <- attr(cb, "data") vals_by_label <- attr(cb, "vals_by_label") factors <- attr(cb, "factors") diff --git a/R/cb_create_redcap.r b/R/cb_create_redcap.r index a2d7c1c..ccb8c58 100644 --- a/R/cb_create_redcap.r +++ b/R/cb_create_redcap.r @@ -1,5 +1,5 @@ #' Generate a codebook object from REDCap data -#' +#' #' @description #' `cb_create_redcap()` builds an object of class `"li_codebook"` from a dataset and #' corresponding codebook exported from REDCap. 
The resulting object can be used @@ -14,156 +14,199 @@ #' - Unpacking, labelling, and optional missing propagation for checkbox data #' - Optional coercion for character variables marked as "integer" in `metedata$text_validation_type_or_show_slider_number` #' +#' All of these behaviors can be controlled using the `.options` argument. +#' #' @inheritParams cb_create #' @param data A data frame exported or retrieved from REDCap. #' @param metadata A data frame containing the REDCap codebook associated with `data`. #' @param ... Additional columns from `metadata` to preserve in the final codebook. #' New names can be assigned by passing named arguments. Columns for variable #' name, form, variable label, and value labels are included by default. -#' @param .name,.var_label,.val_labels Columns in `metadata` containing variable -#' name, variable label, and value labels, respectively. -#' @param .form Column in `metadata` containing form names. (Set to `NULL` to omit.) #' @param .user_missing A formula or list of formulas specifying user missing values. #' Formulas should specify variables on the left-hand side (as variable names #' or [tidyselect][dplyr_tidy_select] expressions), and missing values on the #' right-hand side. If left-hand side is omitted, defaults to `tidyselect::everything()`. #' See "Specifying user missing values" in [`cb_create()`] documentation for examples. -#' @param .coerce_integers Should variables listed as "integer" in `metedata$text_validation_type_or_show_slider_number` -#' be coerced to integer? -#' @param .checkbox_resp_values Should checkbox values use labels in `metadata` -#' (`TRUE`) or "Yes" / "No" (`FALSE`)? See "Checkbox data handling" below. -#' @param .propagate_checkbox_missings Should user missing values in a checkbox -#' group be propagated across all variables in the group? See "Checkbox data handling" -#' below. +#' @param .options Additional options to use for codebook creation. 
Must be the result +#' from a call to `cb_create_redcap_options()` or `cb_create_options()`. See `?cb_create_redcap_options` +#' for available options. #' #' @return -#' An `"li_codebook"` object, consisting of (1) a tibble summarizing the passed -#' dataset and (2) attributes containing the passed dataset (in several formats) -#' and additional metadata. Specifically: -#' - A tibble with columns: -#' - `name`: variable name -#' - `form`: form name -#' - `type`: optional column containing simplified variable type -#' - `class`: optional column containing class(es) of each variable -#' - `label_stem`: optional column containing variable label stems, if any variables -#' are specified in `.split_var_labels` -#' - `label`: variable label -#' - `values`: values, with labels if applicable -#' - `user_missing`: optional column, depending on value of `.user_missing_col`, -#' showing user missing values, with labels if applicable -#' - `missing`: proportion missing -#' - additional columns if specified in `...` -#' - Attributes: -#' - Transformed versions of the passed dataset. See [`cb_get_data()`]. -#' - Lookup tables and other metadata used internally. +#' An `"li_codebook"` object, consisting of a tibble summarizing the passed +#' dataset and attributes containing additional metadata. The tibble includes columns: +#' - `name`: variable name +#' - `form`: form name +#' - `type`: column containing simplified variable type +#' - `class`: optional column containing class(es) of each variable +#' - `label_stem`: optional column containing variable label stems, if any variables +#' are specified in `.split_var_labels` +#' - `label`: variable label +#' - `values`: values, with labels if applicable +#' - `user_missing`: optional column showing user missing values, with labels +#' if applicable. By default, this column is included only if user missings +#' are specified for at least one variable. 
This behavior can be changed using +#' the `user_missing_col` argument to `cb_create_options()`. +#' - `missing`: proportion missing +#' - additional columns if specified in `...` #' #' @section Checkbox data handling: #' ## Value labels #' Data from REDCap checkboxes yields one variable in the dataset for each response -#' option. These will be labelled generically with `"Yes"` or `"No"`, unless `.checkbox_resp_values` -#' is `TRUE`, in which case response-specific labels from `metadata` will be used. -#' For example, if a checkbox group has options "In the past year," "More than a +#' option. By default, these will be labelled generically with `"Yes"` or `"No"`. +#' For example, consider a checkbox group with options "In the past year," "More than a #' year ago," and "Never," corresponding to variables `chk_var1___0`, `chk_var1___1`, -#' and `chk_var1___2`: if `.checkbox_resp_values` is `FALSE`, all of these will -#' have values: +#' and `chk_var1___2`. By default, all of these will be given the same value labels: #' - `chk_var1___0`, `chk_var1___1`, `chk_var1___2`: 0 = "No"; 1 = "Yes". -#' -#' If `.checkbox_resp_values` is `TRUE`, each variable will have unique labels: +#' This behavior can be changed by setting `checkbox_resp_values = TRUE` in `cb_create_redcap_options()`. +#' In this case, response-specific labels from `metadata` will be used, so that +#' each variable will have unique labels: #' - `chk_var1___0`: 0 = "Not selected," 1 = "In the past year" #' - `chk_var1___1`: 0 = "Not selected," 1 = "More than a year ago" #' - `chk_var1___2`: 0 = "Not selected," 0 = "Never" #' #' ## Missing value propagation -#' If `.propagate_checkbox_missings` is `TRUE`, missing values in a checkbox group -#' variable will be propagated to all variables in the group.
For example, given -#' a checkbox group with options "Pregnant," "Not pregnant," and "Not applicable," -#' corresponding to variables `chk_preg_0___0`, `chk_preg_0___1`, and `chk_preg_0____9`, -#' and assuming that `-9` is specified as a user missing value. If `.propagate_checkbox_missings` -#' is `TRUE`, `chk_preg_0___0` and `chk_preg_0___1` will be set to `-9` if `chk_preg_0____9` -#' is `1`. Otherwise, these columns will remain as `0` where `chk_preg_0____9` is `1`. +#' By default, missing values in a checkbox group will be propagated to all variables +#' in the group. For example, consider a checkbox group with options "Pregnant," +#' "Not pregnant," and "Not applicable," corresponding to variables `chk_preg_0___0`, +#' `chk_preg_0___1`, and `chk_preg_0____9`, and assume that `-9` is specified +#' as a user missing value. By default, `chk_preg_0___0` and `chk_preg_0___1` will +#' be set to `-9` if `chk_preg_0____9` is `1`. This behavior can be overridden by +#' setting `propagate_checkbox_missings = FALSE` in `cb_create_redcap_options()`, in which +#' case no values will be changed.
#' #' @export cb_create_redcap <- function(data, metadata, ..., - .name = field_name, - .var_label = field_label, - .val_labels = select_choices_or_calculations, - .form = form_name, .user_missing = NULL, .split_var_labels = NULL, - .include_types = !.include_r_classes, - .include_r_classes = FALSE, - .val_labs_sep1 = ", ", - .val_labs_sep2 = "\\|", - .rmv_html = TRUE, - .rmv_line_breaks = TRUE, - .coerce_integers = TRUE, - .checkbox_resp_values = FALSE, - .propagate_checkbox_missings = TRUE, - .user_missing_col = c("if_any", "yes", "no"), - .user_missing_conflict = c("metadata", "missing_label"), - .user_missing_incompatible = c("ignore", "warn", "error") - ) { - .user_missing_col <- match.arg(.user_missing_col) - .user_missing_conflict <- match.arg(.user_missing_conflict) - meta <- meta_expand_checkboxes_rc(metadata, data) + .options = cb_create_redcap_options()) { + check_options(.options, redcap = TRUE) + meta <- meta_expand_checkboxes_rc( + metadata, data, + name = !!.options$name, type = !!.options$type + ) cb <- data |> cb_init( meta, - meta_var_name = {{ .name }}, meta_var_label = {{ .var_label }}, - meta_val_labels = {{ .val_labels }}, form = {{ .form }}, ..., - ..rc_type = field_type, + meta_var_name = !!.options$name, meta_var_label = !!.options$var_label, + meta_val_labels = !!.options$val_labels, form = !!.options$form, ..., + ..rc_type = !!.options$type, ..rc_validate_type = text_validation_type_or_show_slider_number, ) - if (.coerce_integers) cb <- cb_coerce_integers_rc(cb) + if (.options$coerce_integers) cb <- cb_coerce_integers_rc(cb) cb$..rc_validate_type <- NULL cb <- cb |> - cb_clean_fields(rmv_html = .rmv_html, rmv_line_breaks = .rmv_line_breaks) |> + cb_clean_fields( + rmv_html = .options$rmv_html, + rmv_line_breaks = .options$rmv_line_breaks + ) |> cb_user_missings( user_missing = .user_missing, - incompatible = .user_missing_incompatible + incompatible = .options$user_missing_incompatible ) |> - cb_add_lookups(sep1 = .val_labs_sep1, sep2 = 
.val_labs_sep2) |> - cb_relabel_checkboxes_rc(use_resp_values = .checkbox_resp_values) + cb_add_lookups( + sep1 = .options$val_labs_sep1, + sep2 = .options$val_labs_sep2 + ) |> + cb_relabel_checkboxes_rc(use_resp_values = .options$checkbox_resp_values) if ("form" %in% names(cb)) cb <- cb_complete_label_rc(cb) - if (.propagate_checkbox_missings) { + if (.options$propagate_checkbox_missings) { cb <- cb_propagate_user_missing_checkboxes_rc(cb) } cb |> - cb_label_data(conflict = .user_missing_conflict) |> + cb_label_data(conflict = .options$user_missing_conflict) |> cb_zap_data() |> cb_add_dims() |> - cb_add_val_labels_col(user_missing_col = .user_missing_col) |> + cb_add_val_labels_col(user_missing_col = .options$user_missing_col) |> cb_add_type_col( - include_r_classes = .include_r_classes, - include_types = .include_types + include_r_classes = .options$include_r_classes, + include_types = .options$include_types ) |> cb_add_missing_col() |> cb_split_labels_col(split_var_labels = rlang::enexpr(.split_var_labels)) |> dplyr::relocate(any_of(c("form", "type", "class")), .after = name) } -## `field_name` and `field_type` are hard-coded -- do they always have these names? -meta_expand_checkboxes_rc <- function(meta, data) { - if (!("checkbox" %in% meta$field_type)) return(meta) +#' @rdname cb_create_options +#' +#' @param name,var_label,val_labels,type For REDCap data, columns in `metadata` containing variable +#' name, variable label, value labels, and variable type, respectively. +#' @param form For REDCap data, column in `metadata` containing form names. (Set to `NULL` to omit.) +#' @param val_labs_sep1,val_labs_sep2 For REDCap data, regex patterns separating value labels +#' in `metadata`. `val_labs_sep1` separates values from labels, and `val_labs_sep2` +#' separates value/label pairs from one another. e.g., if value labels are in +#' the format `"1, First label|2, Second label"`, set `val_labs_sep1` to `","` +#' and `val_labs_sep2` to `"\\|"`. 
+#' @param coerce_integers For REDCap data, should variables listed as "integer" in `metadata$text_validation_type_or_show_slider_number` +#' be coerced to integer? +#' @param checkbox_resp_values For REDCap data, should checkbox values use labels in `metadata` (`TRUE`) +#' or "Yes" / "No" (`FALSE`)? See "Checkbox data handling" on the `cb_create_redcap()` +#' help page. +#' @param propagate_checkbox_missings For REDCap data, should user missing values in a checkbox group +#' be propagated across all variables in the group? See "Checkbox data handling" +#' on the `cb_create_redcap()` help page. +#' +#' @export +cb_create_redcap_options <- function( + ..., + include_types = TRUE, + include_r_classes = FALSE, + rmv_html = TRUE, + rmv_line_breaks = TRUE, + user_missing_col = c("if_any", "yes", "no"), + user_missing_conflict = c("val_label", "missing_label"), + user_missing_incompatible = c("ignore", "warn", "error"), + name = field_name, + var_label = field_label, + val_labels = select_choices_or_calculations, + type = field_type, + form = form_name, + val_labs_sep1 = ", ", + val_labs_sep2 = "\\|", + coerce_integers = TRUE, + checkbox_resp_values = FALSE, + propagate_checkbox_missings = TRUE) { + rlang::check_dots_empty() + out <- list( + include_types = include_types, include_r_classes = include_r_classes, + rmv_html = rmv_html, rmv_line_breaks = rmv_line_breaks, + user_missing_col = user_missing_col, + user_missing_conflict = user_missing_conflict, + user_missing_incompatible = user_missing_incompatible, + name = rlang::enquo(name), var_label = rlang::enquo(var_label), + val_labels = rlang::enquo(val_labels), type = rlang::enquo(type), + form = rlang::enquo(form), val_labs_sep1 = val_labs_sep1, + val_labs_sep2 = val_labs_sep2, coerce_integers = coerce_integers, + checkbox_resp_values = checkbox_resp_values, + propagate_checkbox_missings = propagate_checkbox_missings + ) + structure(out, class = "cb_create_redcap_options") +} + +meta_expand_checkboxes_rc <-
function(meta, data, name, type) { + name_chr <- as.character(rlang::ensym(name)) + type_chr <- as.character(rlang::ensym(type)) + if (!("checkbox" %in% meta[[type_chr]])) return(meta) datanames <- names(data) + meta <- dplyr::rename(meta, ..name = {{ name }}, ..type = {{ type }}) checkbox_names <- meta |> - dplyr::filter(field_type == "checkbox") |> - dplyr::select(field_name) |> + dplyr::filter(..type == "checkbox") |> + dplyr::select(..name) |> dplyr::reframe( .chk_name = datanames[ - stringr::str_starts(datanames, stringr::str_c(field_name, "___")) + stringr::str_starts(datanames, stringr::str_c(..name, "___")) ], - .by = field_name + .by = ..name ) meta |> - dplyr::left_join(checkbox_names, dplyr::join_by(field_name)) |> + dplyr::left_join(checkbox_names, dplyr::join_by(..name)) |> dplyr::mutate( # .chk_name_stem = ifelse(!is.na(.chk_name), field_name, NA), - field_name = dplyr::coalesce(.chk_name, field_name), + ..name = dplyr::coalesce(.chk_name, ..name), .keep = "unused" - ) + ) |> + dplyr::rename("{name_chr}" := ..name, "{type_chr}" := ..type) } cb_coerce_integers_rc <- function(cb) { diff --git a/R/cb_create_spss.r b/R/cb_create_spss.r index b366a46..a4827cd 100644 --- a/R/cb_create_spss.r +++ b/R/cb_create_spss.r @@ -19,61 +19,47 @@ #' or [tidyselect][dplyr_tidy_select] expressions), and missing values on the #' right-hand side. If left-hand side is omitted, defaults to `tidyselect::everything()`. #' See "Specifying user missing values" in [`cb_create()`] documentation for examples. -#' @param .rmv_html Should HTML tags be removed from variable and value labels? -#' @param .rmv_line_breaks Should line breaks be removed from variable and value -#' labels? If `TRUE`, line breaks will be replaced with `" / "`. -#' @param .user_missing_conflict If labels passed to `.user_missing` conflicts with -#' a value label in `data`, which should be used? 
#' #' @return -#' An `"li_codebook"` object, consisting of (1) a tibble summarizing the passed -#' dataset and (2) attributes containing the passed dataset (in several formats) -#' and additional metadata. Specifically: -#' - A tibble with columns: -#' - `name`: variable name -#' - `type`: optional column containing simplified variable type -#' - `class`: optional column containing class(es) of each variable -#' - `label_stem`: optional column containing variable label stems, if any variables -#' are specified in `.split_var_labels` -#' - `label`: variable label -#' - `values`: values, with labels if applicable -#' - `user_missing`: optional column, depending on value of `.user_missing_col`, -#' showing user missing values, with labels if applicable -#' - `missing`: proportion missing -#' - Attributes: -#' - Transformed versions of the passed dataset. See [`cb_get_data()`]. -#' - Lookup tables and other metadata used internally. +#' An `"li_codebook"` object, consisting of a tibble summarizing the passed +#' dataset and attributes containing additional metadata. The tibble includes columns: +#' - `name`: variable name +#' - `type`: column containing simplified variable type +#' - `class`: optional column containing class(es) of each variable +#' - `label_stem`: optional column containing variable label stems, if any variables +#' are specified in `.split_var_labels` +#' - `label`: variable label +#' - `values`: values, with labels if applicable +#' - `user_missing`: optional column showing user missing values, with labels +#' if applicable. By default, this column is included only if user missings +#' are specified for at least one variable. This behavior can be changed using +#' the `user_missing_col` argument to `cb_create_options()`. 
+#' - `missing`: proportion missing #' #' @export cb_create_spss <- function(data, .user_missing = NULL, .split_var_labels = NULL, - .include_types = !.include_r_classes, - .include_r_classes = FALSE, - .rmv_html = TRUE, - .rmv_line_breaks = TRUE, - .user_missing_col = c("if_any", "yes", "no"), - .user_missing_conflict = c("val_label", "missing_label"), - .user_missing_incompatible = c("ignore", "warn", "error") - ) { + .options = cb_create_options()) { + check_options(.options) data |> cb_init() |> cb_clean_fields_spss( - rmv_html = .rmv_html, - rmv_line_breaks = .rmv_line_breaks + rmv_html = .options$rmv_html, + rmv_line_breaks = .options$rmv_line_breaks ) |> cb_add_label_col_spss() |> cb_update_labels_spss( user_missing = .user_missing, - user_missing_conflict = .user_missing_conflict, - user_missing_incompatible = .user_missing_incompatible + user_missing_conflict = .options$user_missing_conflict, + user_missing_incompatible = .options$user_missing_incompatible ) |> cb_zap_data_spss() |> cb_add_dims() |> - cb_add_val_labels_col(user_missing_col = .user_missing_col) |> + cb_add_val_labels_col(user_missing_col = .options$user_missing_col) |> cb_add_type_col( - include_r_classes = .include_r_classes, - include_types = .include_types + include_r_classes = .options$include_r_classes, + include_types = .options$include_types ) |> cb_add_missing_col() |> cb_split_labels_col(split_var_labels = rlang::enexpr(.split_var_labels)) @@ -114,7 +100,6 @@ cb_update_labels_spss <- function(cb, cb_add_lookups() |> set_attrs(data_labelled = data) } else { - conflict <- sub("val_label", "metadata", match.arg(user_missing_conflict)) user_missing <- check_user_missing_arg(user_missing) user_missing_vars <- user_missing |> lapply(\(um) { @@ -140,7 +125,7 @@ cb_update_labels_spss <- function(cb, user_missing = attr_user_missing[names(attr_user_missing) %in% user_missing_vars], vals_by_label = attr_vals_by_label[names(attr_vals_by_label) %in% user_missing_vars] ) |> - 
cb_label_data(conflict = conflict) |> + cb_label_data(conflict = user_missing_conflict) |> # then restore full missing and val attributes set_attrs( user_missing = attr_user_missing, diff --git a/R/cb_summarize.r b/R/cb_summarize.r index fb22b3d..d2c4650 100644 --- a/R/cb_summarize.r +++ b/R/cb_summarize.r @@ -152,7 +152,8 @@ cb_summarize_categorical_impl <- function(cb, prefixed = TRUE, detail_missing = missing(group_by), detail_na_label = "NA", - warn_if_none = FALSE) { + warn_if_none = FALSE, + group_rows = NULL) { force(detail_missing) data <- attr(cb, "data_labelled") data_dt <- data.table::as.data.table(data) @@ -307,14 +308,18 @@ cb_summarize_categorical_impl <- function(cb, ) freqs <- freqs[, cols_out, with = FALSE] + group_cols <- setdiff(group_by, group_rows) + if (!length(group_cols)) group_cols <- NULL + freqs |> tibble::as_tibble() |> set_attrs( detail_missing = detail_missing, id_cols = c("name", label_cols, "value"), group_by = group_by, - group_cols = group_by, - group_counts = group_counts(cb, group_by) + group_rows = group_rows, + group_cols = group_cols, + group_counts = group_counts(cb, group_cols) ) } diff --git a/R/cb_write.r b/R/cb_write.r index 748e140..a99c2f5 100644 --- a/R/cb_write.r +++ b/R/cb_write.r @@ -15,24 +15,28 @@ #' a variant. #' @param file Path to write to. #' @param dataset_name Name of the dataset to display in workbook headers. -#' @param incl_date,incl_dims Should the date and/or dataset dimensions be included -#' in the Overview tab header? -#' @param hyperlinks If `TRUE`, variable names on the Overview sheet will link -#' to corresponding rows on summary tabs and vice versa. #' @param group_by <[`tidy-select`][dplyr_tidy_select]> Column or columns to group #' by. If specified, additional numeric and categorical summary tabs will be included -#' with grouped summaries. Subgroups are shown in columns by default. 
For the numeric -#' summary tab, subgroups for some or all grouping variables can instead be shown -#' in rows if specified in `group_rows_numeric`. -#' @param group_rows_numeric <[`tidy-select`][dplyr_tidy_select]> Column or columns -#' to group by in rows on the grouped numeric summary tab. All columns must also -#' be specified in `group_by`. +#' with grouped summaries. Subgroups are shown in columns by default. Some or all +#' grouping variables can instead be shown in rows if specified in `group_rows`, +#' `group_rows_numeric`, or `group_rows_categorical`. +#' @param group_rows <[`tidy-select`][dplyr_tidy_select]> Column or columns to group +#' by in rows on grouped summary tabs. All columns must also be specified in `group_by`. +#' Will apply to both numeric and categorical summary tabs unless otherwise specified +#' in `group_rows_numeric` or `group_rows_categorical`. +#' @param group_rows_numeric,group_rows_categorical <[`tidy-select`][dplyr_tidy_select]> +#' Column or columns to group by in rows on grouped numeric or categorical summary +#' tab. #' @param detail_missing Include detailed missing value information on ungrouped #' categorical and text summary tabs? (Detailed missing information for grouped #' summary tabs is not currently supported.) #' @param n_text_vals On the text summary tab, how many unique non-missing values #' should be included for each variable? If there are more than `n_text_vals` + 1 #' unique values, the `n_text_vals` most common non-missing values will be included. +#' @param incl_date,incl_dims Should the date and/or dataset dimensions be included +#' in the Overview tab header? +#' @param hyperlinks If `TRUE`, variable names on the Overview sheet will link +#' to corresponding rows on summary tabs and vice versa. #' @param overwrite Overwrite existing file? 
#' #' @return @@ -54,13 +58,15 @@ cb_write <- function(cb, file, dataset_name = NULL, - incl_date = TRUE, - incl_dims = TRUE, - hyperlinks = TRUE, group_by = NULL, - group_rows_numeric = NULL, + group_rows = NULL, + group_rows_numeric = group_rows, + group_rows_categorical = group_rows, detail_missing = c("if_any_user_missing", "yes", "no"), n_text_vals = 5, + incl_date = TRUE, + incl_dims = TRUE, + hyperlinks = TRUE, overwrite = TRUE) { check_codebook(cb) detail_missing <- match.arg(detail_missing) @@ -71,32 +77,38 @@ cb_write <- function(cb, num = cb_summarize_numeric_impl(cb), cat = cb_summarize_categorical_impl(cb, detail_missing = detail_missing), txt = cb_summarize_text_impl( - cb, - n_text_vals = n_text_vals, + cb, + n_text_vals = n_text_vals, detail_missing = detail_missing ) ) group_by <- cb_untidyselect(cb, {{ group_by }}) - group_rows_numeric <- cb_untidyselect(cb, {{ group_rows_numeric }}) - if (!is.null(group_rows_numeric)) { - if (is.null(group_by)) { - cli::cli_abort( - "If `group_rows_numeric` is specified, `group_by` must also be specified." - ) - } - if (length(setdiff(group_rows_numeric, group_by))) { - cli::cli_abort( - "All columns specified in `group_rows_numeric` must also be included in `group_by`." 
- ) - } + group_rows <- cb_untidyselect(cb, {{ group_rows }}) + if (missing(group_rows_numeric)) { + group_rows_numeric <- group_rows + } else { + group_rows_numeric <- cb_untidyselect(cb, {{ group_rows_numeric }}) + } + if (missing(group_rows_categorical)) { + group_rows_categorical <- group_rows + } else { + group_rows_categorical <- cb_untidyselect(cb, {{ group_rows_categorical }}) } + check_group_rows_arg(group_rows, group_by) + check_group_rows_arg(group_rows_numeric, group_by) + check_group_rows_arg(group_rows_categorical, group_by) + if (!is.null(group_by)) { summaries$num_grp <- cb_summarize_numeric_impl( cb, group_by = group_by, group_rows = group_rows_numeric ) - summaries$cat_grp <- cb_summarize_categorical_impl(cb, group_by = group_by) + summaries$cat_grp <- cb_summarize_categorical_impl( + cb, + group_by = group_by, + group_rows = group_rows_categorical + ) } cb_write_codebook( cb, summaries, @@ -768,7 +780,13 @@ cb_write_codebook <- function(cb, summaries$cat_grp <- summaries$cat_grp |> cb_format_names(skip = group_by, attrs = "id_cols") cols_pct <- untidyselect(summaries$cat_grp, tidyselect::starts_with("%")) - clear_repeats <- setdiff(attr(summaries$cat_grp, "id_cols"), "Value") + group_rows <- attr(summaries$cat_grp, "group_rows") + id_cols <- attr(summaries$cat_grp, "id_cols") + clear_repeats <- c(setdiff(id_cols, "Value"), group_rows) + if (!is.null(group_rows)) { + attr(summaries$cat_grp, "group_rows") <- c(group_rows, "Value") + attr(summaries$cat_grp, "id_cols") <- setdiff(id_cols, "Value") + } sheet_nms$cat_grp <- paste0("Grouped ", sheet_nms$cat) headers$cat_grp <- c(headers$cat, paste("By ", toString(group_by))) params$cat_grp <- summaries$cat_grp |> diff --git a/R/utils.r b/R/utils.r index 22226cc..6f49877 100644 --- a/R/utils.r +++ b/R/utils.r @@ -1,20 +1,56 @@ -is_codebook <- function(x) "li_codebook" %in% class(x) +is_codebook <- function(x) inherits(x, "li_codebook") check_codebook <- function(x) { arg <- as.character(rlang::ensym(x)) 
if (!is_codebook(x)) { cli::cli_abort('{.arg {arg}} must be an object of class `"li_codebook"`.') } } +check_options <- function(x, redcap = FALSE) { + if (redcap) { + opts_class <- "cb_create_redcap_options" + opts_class_wrong <- "cb_create_options" + } else { + opts_class <- "cb_create_options" + opts_class_wrong <- "cb_create_redcap_options" + } + if (inherits(x, opts_class_wrong)) { + msg <- c( + "!" = "`.options` must be created from `{opts_class}()`, not `{opts_class_wrong}()`." + ) + if (!redcap) { + msg <- c(msg, "i" = "Did you mean to call `cb_create_redcap()`?") + } + cli::cli_abort(msg) + } + if (!inherits(x, opts_class)) { + cli::cli_abort("`.options` must be created from `{opts_class}()`") + } +} check_user_missing_arg <- function(x) { arg <- as.character(rlang::ensym(x)) if (!( - rlang::is_formula(x) || (is.list(x) && all(sapply(x, rlang::is_formula))) - )) { + rlang::is_formula(x) || (is.list(x) && all(sapply(x, rlang::is_formula))) + )) { cli::cli_abort("{.arg {arg}} must be a formula or list of formulas.") } if (rlang::is_formula(x)) x <- list(x) x } +check_group_rows_arg <- function(group_rows, group_by) { + arg <- as.character(rlang::ensym(group_rows)) + if (!is.null(group_rows)) { + if (is.null(group_by)) { + cli::cli_abort( + "If {.arg {arg}} is specified, {.arg group_by} must also be specified." + ) + } + if (length(setdiff(group_rows, group_by))) { + cli::cli_abort( + "All columns specified in {.arg {arg}} must also be included in {.arg group_by}." + ) + } + } +} set_attrs <- function(x, ...) { dots <- rlang::list2(...) 
for (nm in names(dots)) attr(x, nm) <- dots[[nm]] diff --git a/README.Rmd b/README.Rmd new file mode 100644 index 0000000..3173f0c --- /dev/null +++ b/README.Rmd @@ -0,0 +1,130 @@ +--- +output: github_document +--- + + + +```{r, include = FALSE} +knitr::opts_chunk$set( + collapse = TRUE, + comment = "#", + out.width = "100%", + fig.align = "center", + fig.path = "man/figures/", + eval = FALSE +) +``` + +# lighthouse.codebook + +The lighthouse.codebook package includes tools to summarize a dataset into a formatted +Excel workbook, including a data dictionary and variable summaries. It incorporates external +metadata (such as variable labels, value labels, and user missing / non-response codes), +with functions for using metadata from SPSS and REDCap datasets. Codebooks can be +customized in a number of ways, including options for grouped summaries. + +## Installation + +You can install lighthouse.codebook by running: + +```r +# install.packages("remotes") +remotes::install_github("ccsarapas/lighthouse.codebook") +``` + +## Creating codebooks +Creating a codebook involves two general steps: + +1. Create a "codebook" object in R from a data frame (and, + optionally, metadata) using `cb_create()` or a specialized variant + (such as `cb_create_spss()` or `cb_create_redcap()`). + +2. Write the codebook to disk using `cb_write()`. 
+ +```r +library(lighthouse.codebook) + +# create and write a codebook without metadata +dat |> + cb_create() |> + cb_write("cb.xlsx") + +# with metadata +dat |> + cb_create(metadata = dat1_metadata) |> + cb_write("cb.xlsx") + +# from SPSS data +dat_spss <- haven::read_sav("dat_spss.sav", user_na = TRUE) + +dat_spss |> + cb_create_spss() |> + cb_write("cb_spss.xlsx") + +# from REDCap data +dat_rc <- REDCapR::redcap_read(redcap_uri = rc_uri, token = rc_token) +meta_rc <- REDCapR::redcap_metadata_read(redcap_uri = rc_uri, token = rc_token) + +dat_rc$data |> + cb_create_redcap(metadata = meta_rc$data) |> + cb_write("cb_rc.xlsx") +``` + +## Customizing codebooks + +There are many options for controlling how data is interpreted, summarized, and +presented. See `vignette("lighthouse-codebook")` for some of the most useful options, +including grouped data summaries and specifying user missing codes. Further options +are detailed in the help pages for `cb_create()` and `cb_write()`. + + + +## Codebook contents + +The codebook written to disk will include an _overview_ tab listing all variables +in the dataset; _summary_ tabs for numeric, categorical, and text variables; and, +if grouping variables are specified, _grouped summary_ tabs for numeric and categorical +variables. + +The _overview_ tab includes one row for each variable in the dataset, with information +on variable types, labels, values, and missingness. By default, each variable is +hyperlinked to its location on the relevant summary tab. 
+ +```{r, overview, echo = FALSE, eval = TRUE} +knitr::include_graphics("man/figures/README-overview.png") +``` + +The _numeric summary_ tab includes descriptive statistics for all numeric variables +in the dataset: + +```{r, numeric, echo = FALSE, eval = TRUE} +knitr::include_graphics("man/figures/README-numeric.png") +``` + +The _categorical summary_ tab includes frequencies for all categorical variables, +optionally with separate rows for user missing values: + +```{r, categorical, echo = FALSE, eval = TRUE} +knitr::include_graphics("man/figures/README-categorical.png") +``` + +Finally, the _text summary_ tab includes frequencies for the most common values for all +text variables in the dataset. (The number of values shown can be adjusted using +the `n_text_vals` argument to `cb_write()`.) + +```{r, text, echo = FALSE, eval = TRUE} +knitr::include_graphics("man/figures/README-text.png") +``` + +If `group_by` is specified in `cb_write()`, additional numeric and categorical summary +tabs grouped by the specified variables will be included. + +## SPSS extension + +Functionality from this package is also available as an SPSS extension command [here](https://github.com/ccsarapas/lighthouse.codebook.spss). diff --git a/README.md b/README.md index 3e38f4e..c846ee9 100644 --- a/README.md +++ b/README.md @@ -1,16 +1,115 @@ + + # lighthouse.codebook -The lighthouse.codebook package includes tools for summarizing datasets used by staff at the [Lighthouse Institute](https://www.chestnut.org/lighthouse-institute/), the research division of Chestnut Health Systems. +The lighthouse.codebook package includes tools to summarize a dataset +into a formatted Excel workbook, including a data dictionary and +variable summaries. It incorporates external metadata (such as variable +labels, value labels, and user missing / non-response codes), with +functions for using metadata from SPSS and REDCap datasets. 
Codebooks +can be customized in a number of ways, including options for grouped +summaries. ## Installation -Install lighthouse.codebook by running: +You can install lighthouse.codebook by running: ``` r # install.packages("remotes") remotes::install_github("ccsarapas/lighthouse.codebook") ``` + +## Creating codebooks + +Creating a codebook involves two general steps: + +1. Create a “codebook” object in R from a data frame (and, optionally, + metadata) using `cb_create()` or a specialized variant (such as + `cb_create_spss()` or `cb_create_redcap()`). + +2. Write the codebook to disk using `cb_write()`. + +``` r +library(lighthouse.codebook) + +# create and write a codebook without metadata +dat |> + cb_create() |> + cb_write("cb.xlsx") + +# with metadata +dat |> + cb_create(metadata = dat1_metadata) |> + cb_write("cb.xlsx") + +# from SPSS data +dat_spss <- haven::read_sav("dat_spss.sav", user_na = TRUE) + +dat_spss |> + cb_create_spss() |> + cb_write("cb_spss.xlsx") + +# from REDCap data +dat_rc <- REDCapR::redcap_read(redcap_uri = rc_uri, token = rc_token) +meta_rc <- REDCapR::redcap_metadata_read(redcap_uri = rc_uri, token = rc_token) + +dat_rc$data |> + cb_create_redcap(metadata = meta_rc$data) |> + cb_write("cb_rc.xlsx") +``` + +## Customizing codebooks + +There are many options for controlling how data is interpreted, +summarized, and presented. See `vignette("lighthouse-codebook")` for +some of the most useful options, including grouped data summaries and +specifying user missing codes. Further options are detailed in the help +pages for `cb_create()` and `cb_write()`. + + + +## Codebook contents + +The codebook written to disk will include an *overview* tab listing all +variables in the dataset; *summary* tabs for numeric, categorical, and +text variables; and, if grouping variables are specified, *grouped +summary* tabs for numeric and categorical variables. 
+ +The *overview* tab includes one row for each variable in the dataset, +with information on variable types, labels, values, and missingness. By +default, each variable is hyperlinked to its location on the relevant +summary tab. + + + +The *numeric summary* tab includes descriptive statistics for all +numeric variables in the dataset: + + + +The *categorical summary* tab includes frequencies for all categorical +variables, optionally with separate rows for user missing values: + + + +Finally, the *text summary* tab includes frequencies for the most common +values for all text variables in the dataset. (The number of values +shown can be adjusted using the `n_text_vals` argument to `cb_write()`.) + + + +If `group_by` is specified in `cb_write()`, additional numeric and +categorical summary tabs grouped by the specified variables will be +included. + ## SPSS extension -Functionality from this package is also available as an SPSS extension command [here](https://github.com/ccsarapas/lighthouse.codebook.spss). \ No newline at end of file +Functionality from this package is also available as an SPSS extension +command [here](https://github.com/ccsarapas/lighthouse.codebook.spss). 
diff --git a/_pkgdown.yml b/_pkgdown.yml index d98e4d4..961f921 100644 --- a/_pkgdown.yml +++ b/_pkgdown.yml @@ -1,7 +1,10 @@ url: https://ccsarapas.github.io/lighthouse.codebook/ template: bootstrap: 5 - bootswatch: sandstone + bslib: + base_font: {google: "Roboto"} + code_font: {google: "Roboto Mono"} + font_scale: 1.2 footer: structure: left: package @@ -10,4 +13,4 @@ authors: Casey Sarapas: href: "https://chestnut.org/li/scientists-and-project-directors/category/research-scientists/profile/casey-sarapas-phd" Chestnut Health Systems: - href: "https://chestnut.org/" \ No newline at end of file + href: "https://chestnut.org/" diff --git a/man/cb_create.Rd b/man/cb_create.Rd index c8872c6..0112709 100644 --- a/man/cb_create.Rd +++ b/man/cb_create.Rd @@ -13,22 +13,16 @@ cb_create( .val_labels = val_labels, .user_missing = NULL, .split_var_labels = NULL, - .include_types = !.include_r_classes, - .include_r_classes = FALSE, .val_labs_sep1 = NULL, .val_labs_sep2 = NULL, - .rmv_html = TRUE, - .rmv_line_breaks = TRUE, - .user_missing_col = c("if_any", "yes", "no"), - .user_missing_conflict = c("metadata", "missing_label"), - .user_missing_incompatible = c("ignore", "warn", "error") + .options = cb_create_options() ) } \arguments{ \item{data}{A data frame.} -\item{metadata}{A data frame containing metadata, such as variable labels and value -labels.} +\item{metadata}{A data frame containing metadata, such as variable labels and +value labels.} \item{...}{Additional columns from \code{metadata} to preserve in the final codebook. New names can be assigned by passing named arguments. Columns for variable @@ -49,58 +43,34 @@ See "Specifying user missing values" below for examples.} expressions, indicating (sets of) variable labels with a common stem that should be extracted into a separate column.} -\item{.include_types}{Include a column listing simplified type for each variable? -(e.g,. 
\code{"categorical"}, \code{"date-time"}.)} - -\item{.include_r_classes}{Include a column listing class(es) of each variable? -(e.g., \code{"factor"}, \code{"POSIXct, POSIXt"}.)} - \item{.val_labs_sep1, .val_labs_sep2}{Regex patterns separating value labels in \code{metadata}. \code{.val_labs_sep1} separates values from labels, and \code{.val_labs_sep2} -separates value/label pairs. e.g., if value labels are in format \code{"1, First label|2, Second label"}, -set \code{.val_labs_sep1} to \code{","} and \code{.val_labs_sep2} to \code{"\\\\|"}.} - -\item{.rmv_html}{Should HTML tags be removed from metadata (e.g., from variable -and value labels)?} - -\item{.rmv_line_breaks}{Should line breaks be removed from metadata (e.g., from -variable and value labels)? If \code{TRUE}, line breaks will be replaced with \code{" / "}.} - -\item{.user_missing_col}{Include value labels for user missing values in a separate -column? The default, \code{"if_any"}, adds the column only if user missings are -specified for at least one variable.} - -\item{.user_missing_conflict}{If different labels for a value are provided in -metadata and user missings, which should be used?} +separates value/label pairs from one another. e.g., if value labels are in +the format \code{"1, First label|2, Second label"}, set \code{.val_labs_sep1} to \code{","} +and \code{.val_labs_sep2} to \code{"\\\\|"}.} -\item{.user_missing_incompatible}{How to handle variables specified in \code{.user_missing} -that aren't compatible with user missing values (e.g., logical, Date, or POSIXt)?} +\item{.options}{Additional options to use for codebook creation. Must be the result +from a call to \code{cb_create_options()}. See that function's help page for available +options.} } \value{ -An \code{"li_codebook"} object, consisting of (1) a tibble summarizing the passed -dataset and (2) attributes containing the passed dataset (in several formats) -and additional metadata. 
Specifically: -\itemize{ -\item A tibble with columns: +An \code{"li_codebook"} object, consisting of a tibble summarizing the passed +dataset and attributes containing additional metadata. The tibble includes columns: \itemize{ \item \code{name}: variable name -\item \code{type}: optional column containing simplified variable type +\item \code{type}: column containing simplified variable type \item \code{class}: optional column containing class(es) of each variable \item \code{label_stem}: optional column containing variable label stems, if any variables are specified in \code{.split_var_labels} \item \code{label}: variable label \item \code{values}: values, with labels if applicable -\item \code{user_missing}: optional column, depending on value of \code{.user_missing_col}, -showing user missing values, with labels if applicable +\item \code{user_missing}: optional column showing user missing values, with labels +if applicable. By default, this column is included only if user missings +are specified for at least one variable. This behavior can be changed using +the \code{user_missing_col} argument to \code{cb_create_options()}. \item \code{missing}: proportion missing \item additional columns if specified in \code{...} } -\item Attributes: -\itemize{ -\item Transformed versions of the passed dataset. See \code{\link[=cb_get_data]{cb_get_data()}} -\item Lookup tables and other metadata used internally. -} -} } \description{ \code{cb_create()} builds an object of class \code{"li_codebook"} from a dataset and optional @@ -141,13 +111,15 @@ User missing values may optionally be named to set value labels: \preformatted{ .user_missing = ~ c(Declined = -98, "Not applicable" = -99) } -If labels set in \code{.user_missing} conflict with those in \code{metadata}, \code{.user_missing_conflict} -controls which labels are used. - -User missing values are not compatible with logical, date, or datetime (POSIXt) -variables. 
By default, these variables will be ignored if specified in \code{.user_missing}. -(i.e., user missing values will be applied only to compatible variables.) This behavior -can be changed using the \code{.user_missing_incompatible} argument. +If labels set in \code{.user_missing} conflict with those in \code{metadata}, the \code{user_missing_conflict} +argument to \code{cb_create_options()} controls which labels are used. + +User missings may be set for numeric, character, factor/ordered factor, and haven_labelled/haven_labelled_spss +vectors. For factors, user missings are set based on factor labels (not the underlying +integer codes). For \code{"haven_labelled"} vectors, user missings are set based on +values (not value labels). By default, variables with incompatible classes (e.g., +logical, Date, POSIXt) will be ignored if specified in \code{.user_missing}. This +behavior can be changed using the \code{.user_missing_incompatible} argument to \code{cb_create_options()}. } \examples{ @@ -170,7 +142,7 @@ diamonds2 |> transform( carat_group = factor(carat_group), price_group = factor(price_group) - ) |> + ) |> cb_create() # provide metadata for variable and value labels diff --git a/man/cb_create_options.Rd b/man/cb_create_options.Rd new file mode 100644 index 0000000..33b114d --- /dev/null +++ b/man/cb_create_options.Rd @@ -0,0 +1,89 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/cb_create.r, R/cb_create_redcap.r +\name{cb_create_options} +\alias{cb_create_options} +\alias{cb_create_redcap_options} +\title{Additional options for codebook creation} +\usage{ +cb_create_options( + ..., + include_types = TRUE, + include_r_classes = FALSE, + rmv_html = TRUE, + rmv_line_breaks = TRUE, + user_missing_col = c("if_any", "yes", "no"), + user_missing_conflict = c("val_label", "missing_label"), + user_missing_incompatible = c("ignore", "warn", "error") +) + +cb_create_redcap_options( + ..., + include_types = TRUE, + include_r_classes = 
FALSE, + rmv_html = TRUE, + rmv_line_breaks = TRUE, + user_missing_col = c("if_any", "yes", "no"), + user_missing_conflict = c("val_label", "missing_label"), + user_missing_incompatible = c("ignore", "warn", "error"), + name = field_name, + var_label = field_label, + val_labels = select_choices_or_calculations, + type = field_type, + form = form_name, + val_labs_sep1 = ", ", + val_labs_sep2 = "\\\\|", + coerce_integers = TRUE, + checkbox_resp_values = FALSE, + propagate_checkbox_missings = TRUE +) +} +\arguments{ +\item{...}{These dots are for future extensions and must be empty.} + +\item{include_types}{Include a column listing simplified type for each variable? +(e.g., \code{"categorical"}, \code{"date-time"}.)} + +\item{include_r_classes}{Include a column listing class(es) of each variable? +(e.g., \code{"factor"}, \code{"POSIXct, POSIXt"}.)} + +\item{rmv_html}{Should HTML tags be removed from metadata (e.g., from variable +and value labels)?} + +\item{rmv_line_breaks}{Should line breaks be removed from metadata (e.g., from +variable and value labels)? If \code{TRUE}, line breaks will be replaced with \code{" / "}.} + +\item{user_missing_col}{Include value labels for user missing values in a separate +column? The default, \code{"if_any"}, adds the column only if user missings are +specified for at least one variable.} + +\item{user_missing_conflict}{If labels passed to \code{.user_missing} conflict with +value labels in metadata, which should be used?} + +\item{user_missing_incompatible}{How to handle variables specified in \code{.user_missing} +that aren't compatible with user missing values (e.g., logical, Date, or POSIXt)?} + +\item{name, var_label, val_labels, type}{For REDCap data, columns in \code{metadata} containing variable +name, variable label, value labels, and variable type, respectively.} + +\item{form}{For REDCap data, column in \code{metadata} containing form names. 
(Set to \code{NULL} to omit.)} + +\item{val_labs_sep1, val_labs_sep2}{For REDCap data, regex patterns separating value labels +in \code{metadata}. \code{val_labs_sep1} separates values from labels, and \code{val_labs_sep2} +separates value/label pairs from one another. e.g., if value labels are in +the format \code{"1, First label|2, Second label"}, set \code{val_labs_sep1} to \code{","} +and \code{val_labs_sep2} to \code{"\\\\|"}.} + +\item{coerce_integers}{For REDCap data, should variables listed as "integer" in \code{metadata$text_validation_type_or_show_slider_number} +be coerced to integer?} + +\item{checkbox_resp_values}{For REDCap data, should checkbox values use labels in \code{metadata} (\code{TRUE}) +or "Yes" / "No" (\code{FALSE})? See "Checkbox data handling" on the \code{cb_create_redcap()} +help page.} + +\item{propagate_checkbox_missings}{For REDCap data, should user missing values in a checkbox group +be propagated across all variables in the group? See "Checkbox data handling" +on the \code{cb_create_redcap()} help page.} +} +\description{ +Additional options for use by \code{cb_create()}. 
+} diff --git a/man/cb_create_redcap.Rd b/man/cb_create_redcap.Rd index 9ced111..98cc49f 100644 --- a/man/cb_create_redcap.Rd +++ b/man/cb_create_redcap.Rd @@ -8,24 +8,9 @@ cb_create_redcap( data, metadata, ..., - .name = field_name, - .var_label = field_label, - .val_labels = select_choices_or_calculations, - .form = form_name, .user_missing = NULL, .split_var_labels = NULL, - .include_types = !.include_r_classes, - .include_r_classes = FALSE, - .val_labs_sep1 = ", ", - .val_labs_sep2 = "\\\\|", - .rmv_html = TRUE, - .rmv_line_breaks = TRUE, - .coerce_integers = TRUE, - .checkbox_resp_values = FALSE, - .propagate_checkbox_missings = TRUE, - .user_missing_col = c("if_any", "yes", "no"), - .user_missing_conflict = c("metadata", "missing_label"), - .user_missing_incompatible = c("ignore", "warn", "error") + .options = cb_create_redcap_options() ) } \arguments{ @@ -37,11 +22,6 @@ cb_create_redcap( New names can be assigned by passing named arguments. Columns for variable name, form, variable label, and value labels are included by default.} -\item{.name, .var_label, .val_labels}{Columns in \code{metadata} containing variable -name, variable label, and value labels, respectively.} - -\item{.form}{Column in \code{metadata} containing form names. (Set to \code{NULL} to omit.)} - \item{.user_missing}{A formula or list of formulas specifying user missing values. Formulas should specify variables on the left-hand side (as variable names or \link[dplyr:dplyr_tidy_select]{tidyselect} expressions), and missing values on the @@ -52,69 +32,29 @@ See "Specifying user missing values" in \code{\link[=cb_create]{cb_create()}} do expressions, indicating (sets of) variable labels with a common stem that should be extracted into a separate column.} -\item{.include_types}{Include a column listing simplified type for each variable? -(e.g,. \code{"categorical"}, \code{"date-time"}.)} - -\item{.include_r_classes}{Include a column listing class(es) of each variable? 
-(e.g., \code{"factor"}, \code{"POSIXct, POSIXt"}.)} - -\item{.val_labs_sep1, .val_labs_sep2}{Regex patterns separating value labels -in \code{metadata}. \code{.val_labs_sep1} separates values from labels, and \code{.val_labs_sep2} -separates value/label pairs. e.g., if value labels are in format \code{"1, First label|2, Second label"}, -set \code{.val_labs_sep1} to \code{","} and \code{.val_labs_sep2} to \code{"\\\\|"}.} - -\item{.rmv_html}{Should HTML tags be removed from metadata (e.g., from variable -and value labels)?} - -\item{.rmv_line_breaks}{Should line breaks be removed from metadata (e.g., from -variable and value labels)? If \code{TRUE}, line breaks will be replaced with \code{" / "}.} - -\item{.coerce_integers}{Should variables listed as "integer" in \code{metedata$text_validation_type_or_show_slider_number} -be coerced to integer?} - -\item{.checkbox_resp_values}{Should checkbox values use labels in \code{metadata} -(\code{TRUE}) or "Yes" / "No" (\code{FALSE})? See "Checkbox data handling" below.} - -\item{.propagate_checkbox_missings}{Should user missing values in a checkbox -group be propagated across all variables in the group? See "Checkbox data handling" -below.} - -\item{.user_missing_col}{Include value labels for user missing values in a separate -column? The default, \code{"if_any"}, adds the column only if user missings are -specified for at least one variable.} - -\item{.user_missing_conflict}{If different labels for a value are provided in -metadata and user missings, which should be used?} - -\item{.user_missing_incompatible}{How to handle variables specified in \code{.user_missing} -that aren't compatible with user missing values (e.g., logical, Date, or POSIXt)?} +\item{.options}{Additional options to use for codebook creation. Must be the result +from a call to \code{cb_create_redcap_options()} or \code{cb_create_options()}. 
See \code{?cb_create_redcap_options} +for available options.} } \value{ -An \code{"li_codebook"} object, consisting of (1) a tibble summarizing the passed -dataset and (2) attributes containing the passed dataset (in several formats) -and additional metadata. Specifically: -\itemize{ -\item A tibble with columns: +An \code{"li_codebook"} object, consisting of a tibble summarizing the passed +dataset and attributes containing additional metadata. The tibble includes columns: \itemize{ \item \code{name}: variable name \item \code{form}: form name -\item \code{type}: optional column containing simplified variable type +\item \code{type}: column containing simplified variable type \item \code{class}: optional column containing class(es) of each variable \item \code{label_stem}: optional column containing variable label stems, if any variables are specified in \code{.split_var_labels} \item \code{label}: variable label \item \code{values}: values, with labels if applicable -\item \code{user_missing}: optional column, depending on value of \code{.user_missing_col}, -showing user missing values, with labels if applicable +\item \code{user_missing}: optional column showing user missing values, with labels +if applicable. By default, this column is included only if user missings +are specified for at least one variable. This behavior can be changed using +the \code{user_missing_col} argument to \code{cb_create_options()}. \item \code{missing}: proportion missing \item additional columns if specified in \code{...} } -\item Attributes: -\itemize{ -\item Transformed versions of the passed dataset. See \code{\link[=cb_get_data]{cb_get_data()}}. -\item Lookup tables and other metadata used internally. 
-} -} } \description{ \code{cb_create_redcap()} builds an object of class \code{"li_codebook"} from a dataset and @@ -131,24 +71,23 @@ and metadata, including: \item Unpacking, labelling, and optional missing propagation for checkbox data \item Optional coercion for character variables marked as "integer" in \code{metedata$text_validation_type_or_show_slider_number} } + +All of these behaviors can be controlled using the \code{.options} argument. } \section{Checkbox data handling}{ \subsection{Value labels}{ Data from REDCap checkboxes yields one variable in the dataset for each response -option. These will be labelled generically with \code{"Yes"} or \code{"No"}, unless \code{.checkbox_resp_values} -is \code{TRUE}, in which case response-specific labels from \code{metadata} will be used. -For example, if a checkbox group has options "In the past year," "More than a +option. By default, these will be labelled generically with \code{"Yes"} or \code{"No"}. +For example, consider a checkbox group with options "In the past year," "More than a year ago," and "Never," corresponding to variables \code{chk_var1___0}, \code{chk_var1___1}, -and \code{chk_var1___2}: if \code{.checkbox_resp_values} is \code{FALSE}, all of these will -have values: +and \code{chk_var1___2}. By default, all of these will be given the same value labels: \itemize{ \item \code{chk_var1___0}, \code{chk_var1___1}, \code{chk_var1___2}: 0 = "No"; 1 = "Yes". -} - -If \code{.checkbox_resp_values} is \code{TRUE}, each variable will have unique labels: -\itemize{ +This behavior can be changed by setting \code{checkbox_resp_values = TRUE} in \code{cb_create_options()}. 
+In this case, response-specific labels from \code{metadata} will be used, so that +each variable will have unique labels: \item \code{chk_var1___0}: 0 = "Not selected," 1 = "In the past year" \item \code{chk_var1___1}: 0 = "Not selected," 1 = "More than a year ago" \item \code{chk_var1___2}: 0 = "Not selected," 0 = "Never" @@ -157,13 +96,14 @@ If \code{.checkbox_resp_values} is \code{TRUE}, each variable will have unique l \subsection{Missing value propagation}{ -If \code{.propagate_checkbox_missings} is \code{TRUE}, missing values in a checkbox group -variable will be propagated to all variables in the group. For example, given -a checkbox group with options "Pregnant," "Not pregnant," and "Not applicable," -corresponding to variables \code{chk_preg_0___0}, \code{chk_preg_0___1}, and \code{chk_preg_0____9}, -and assuming that \code{-9} is specified as a user missing value. If \code{.propagate_checkbox_missings} -is \code{TRUE}, \code{chk_preg_0___0} and \code{chk_preg_0___1} will be set to \code{-9} if \code{chk_preg_0____9} -is \code{1}. Otherwise, these columns will remain as \code{0} where \code{chk_preg_0____9} is \code{1}. +By default, missing values in a checkbox group will be propagated to all variables +in the group. For example, consider a checkbox group with options "Pregnant," +"Not pregnant," and "Not applicable," corresponding to variables \code{chk_preg_0___0}, +\code{chk_preg_0___1}, and \code{chk_preg_0____9}, and assuming that \code{-9} is specified +as a user missing value. By default, \code{chk_preg_0___0} and \code{chk_preg_0___1} will +be set to \code{-9} if \code{chk_preg_0____9} is \code{1}. This behavior can be overridden by +setting \code{propagate_checkbox_missings = FALSE} in \code{cb_create_options()}, in which +case no values will be changed. 
} } diff --git a/man/cb_create_spss.Rd b/man/cb_create_spss.Rd index b67dc25..f5ab4c8 100644 --- a/man/cb_create_spss.Rd +++ b/man/cb_create_spss.Rd @@ -8,13 +8,7 @@ cb_create_spss( data, .user_missing = NULL, .split_var_labels = NULL, - .include_types = !.include_r_classes, - .include_r_classes = FALSE, - .rmv_html = TRUE, - .rmv_line_breaks = TRUE, - .user_missing_col = c("if_any", "yes", "no"), - .user_missing_conflict = c("val_label", "missing_label"), - .user_missing_incompatible = c("ignore", "warn", "error") + .options = cb_create_options() ) } \arguments{ @@ -31,51 +25,27 @@ See "Specifying user missing values" in \code{\link[=cb_create]{cb_create()}} do expressions, indicating (sets of) variable labels with a common stem that should be extracted into a separate column.} -\item{.include_types}{Include a column listing simplified type for each variable? -(e.g,. \code{"categorical"}, \code{"date-time"}.)} - -\item{.include_r_classes}{Include a column listing class(es) of each variable? -(e.g., \code{"factor"}, \code{"POSIXct, POSIXt"}.)} - -\item{.rmv_html}{Should HTML tags be removed from variable and value labels?} - -\item{.rmv_line_breaks}{Should line breaks be removed from variable and value -labels? If \code{TRUE}, line breaks will be replaced with \code{" / "}.} - -\item{.user_missing_col}{Include value labels for user missing values in a separate -column? The default, \code{"if_any"}, adds the column only if user missings are -specified for at least one variable.} - -\item{.user_missing_conflict}{If labels passed to \code{.user_missing} conflicts with -a value label in \code{data}, which should be used?} - -\item{.user_missing_incompatible}{How to handle variables specified in \code{.user_missing} -that aren't compatible with user missing values (e.g., logical, Date, or POSIXt)?} +\item{.options}{Additional options to use for codebook creation. Must be the result +from a call to \code{cb_create_options()}. 
See that function's help page for available +options.} } \value{ -An \code{"li_codebook"} object, consisting of (1) a tibble summarizing the passed -dataset and (2) attributes containing the passed dataset (in several formats) -and additional metadata. Specifically: -\itemize{ -\item A tibble with columns: +An \code{"li_codebook"} object, consisting of a tibble summarizing the passed +dataset and attributes containing additional metadata. The tibble includes columns: \itemize{ \item \code{name}: variable name -\item \code{type}: optional column containing simplified variable type +\item \code{type}: column containing simplified variable type \item \code{class}: optional column containing class(es) of each variable \item \code{label_stem}: optional column containing variable label stems, if any variables are specified in \code{.split_var_labels} \item \code{label}: variable label \item \code{values}: values, with labels if applicable -\item \code{user_missing}: optional column, depending on value of \code{.user_missing_col}, -showing user missing values, with labels if applicable +\item \code{user_missing}: optional column showing user missing values, with labels +if applicable. By default, this column is included only if user missings +are specified for at least one variable. This behavior can be changed using +the \code{user_missing_col} argument to \code{cb_create_options()}. \item \code{missing}: proportion missing } -\item Attributes: -\itemize{ -\item Transformed versions of the passed dataset. See \code{\link[=cb_get_data]{cb_get_data()}}. -\item Lookup tables and other metadata used internally. 
-} -} } \description{ \code{cb_create_spss()} builds an object of class \code{"li_codebook"} from an imported diff --git a/man/cb_get_data.Rd b/man/cb_get_data.Rd index 6cab0a9..d876368 100644 --- a/man/cb_get_data.Rd +++ b/man/cb_get_data.Rd @@ -4,26 +4,27 @@ \alias{cb_get_data} \title{Extract data from a codebook object} \usage{ -cb_get_data(cb, format = c("factors", "haven", "values")) +cb_get_data(cb, format = c("factors", "haven")) } \arguments{ \item{cb}{An object of class \code{"li_codebook"} as produced by \code{\link[=cb_create]{cb_create()}} or a variant.} -\item{format}{Format of the returned data; see below for details.} +\item{format}{Format of the returned data, either \code{"factors"} or \code{"haven"}; +see below for details.} } \value{ A tibble with variables formatted based on the \code{format} argument. \itemize{ -\item For \code{format = "values"}, all variables retain the same values as the original -dataset, including values for user missings. The data may reflect transformations -made by variants of \code{\link[=cb_create]{cb_create()}} -- e.g., for \code{\link[=cb_create_redcap]{cb_create_redcap()}}, integer coercion -and propagation of user missings across checkbox variables. -\item For \code{"haven"}, value labels and user missings are encoded using class -\code{\link[haven:labelled]{"haven_labelled"}}` \item For \code{"factors"}, all variables with value labels are converted to factors, and all user missings are converted to \code{NA}. +\item For \code{"haven"}, variable labels, value labels, and user missings are encoded +using class \code{\link[haven:labelled]{"haven_labelled_spss"}}. } + +Both formats may also reflect transformations made by variants of \code{\link[=cb_create]{cb_create()}}. +In particular, codebooks created using \code{\link[=cb_create_redcap]{cb_create_redcap()}} reflect integer coercion +and propagation of user missings across checkbox variables.
} \description{ Codebook objects created by \code{\link[=cb_create]{cb_create()}} and friends contain several transformed diff --git a/man/cb_write.Rd b/man/cb_write.Rd index 0018f5b..1699465 100644 --- a/man/cb_write.Rd +++ b/man/cb_write.Rd @@ -8,13 +8,15 @@ cb_write( cb, file, dataset_name = NULL, - incl_date = TRUE, - incl_dims = TRUE, - hyperlinks = TRUE, group_by = NULL, - group_rows_numeric = NULL, + group_rows = NULL, + group_rows_numeric = group_rows, + group_rows_categorical = group_rows, detail_missing = c("if_any_user_missing", "yes", "no"), n_text_vals = 5, + incl_date = TRUE, + incl_dims = TRUE, + hyperlinks = TRUE, overwrite = TRUE ) } @@ -26,21 +28,20 @@ a variant.} \item{dataset_name}{Name of the dataset to display in workbook headers.} -\item{incl_date, incl_dims}{Should the date and/or dataset dimensions be included -in the Overview tab header?} - -\item{hyperlinks}{If \code{TRUE}, variable names on the Overview sheet will link -to corresponding rows on summary tabs and vice versa.} - \item{group_by}{<\code{\link[dplyr:dplyr_tidy_select]{tidy-select}}> Column or columns to group by. If specified, additional numeric and categorical summary tabs will be included -with grouped summaries. Subgroups are shown in columns by default. For the numeric -summary tab, subgroups for some or all grouping variables can instead be shown -in rows if specified in \code{group_rows_numeric}.} +with grouped summaries. Subgroups are shown in columns by default. Some or all +grouping variables can instead be shown in rows if specified in \code{group_rows}, +\code{group_rows_numeric}, or \code{group_rows_categorical}.} + +\item{group_rows}{<\code{\link[dplyr:dplyr_tidy_select]{tidy-select}}> Column or columns to group +by in rows on grouped summary tabs. All columns must also be specified in \code{group_by}. 
+Will apply to both numeric and categorical summary tabs unless otherwise specified +in \code{group_rows_numeric} or \code{group_rows_categorical}.} -\item{group_rows_numeric}{<\code{\link[dplyr:dplyr_tidy_select]{tidy-select}}> Column or columns -to group by in rows on the grouped numeric summary tab. All columns must also -be specified in \code{group_by}.} +\item{group_rows_numeric, group_rows_categorical}{<\code{\link[dplyr:dplyr_tidy_select]{tidy-select}}> +Column or columns to group by in rows on grouped numeric or categorical summary +tab.} \item{detail_missing}{Include detailed missing value information on ungrouped categorical and text summary tabs? (Detailed missing information for grouped @@ -50,6 +51,12 @@ summary tabs is not currently supported.)} should be included for each variable? If there are more than \code{n_text_vals} + 1 unique values, the \code{n_text_vals} most common non-missing values will be included.} +\item{incl_date, incl_dims}{Should the date and/or dataset dimensions be included +in the Overview tab header?} + +\item{hyperlinks}{If \code{TRUE}, variable names on the Overview sheet will link +to corresponding rows on summary tabs and vice versa.} + \item{overwrite}{Overwrite existing file?} } \value{ diff --git a/man/figures/README-categorical.png b/man/figures/README-categorical.png new file mode 100644 index 0000000..3766ac3 Binary files /dev/null and b/man/figures/README-categorical.png differ diff --git a/man/figures/README-numeric.png b/man/figures/README-numeric.png new file mode 100644 index 0000000..4599abe Binary files /dev/null and b/man/figures/README-numeric.png differ diff --git a/man/figures/README-overview.png b/man/figures/README-overview.png new file mode 100644 index 0000000..dbad3da Binary files /dev/null and b/man/figures/README-overview.png differ diff --git a/man/figures/README-text.png b/man/figures/README-text.png new file mode 100644 index 0000000..ccb966a Binary files /dev/null and 
b/man/figures/README-text.png differ diff --git a/vignettes/.gitignore b/vignettes/.gitignore new file mode 100644 index 0000000..097b241 --- /dev/null +++ b/vignettes/.gitignore @@ -0,0 +1,2 @@ +*.html +*.R diff --git a/vignettes/lighthouse-codebook.Rmd b/vignettes/lighthouse-codebook.Rmd new file mode 100644 index 0000000..678a4f8 --- /dev/null +++ b/vignettes/lighthouse-codebook.Rmd @@ -0,0 +1,406 @@ +--- +title: "Introduction to lighthouse.codebook" +output: rmarkdown::html_vignette +vignette: > + %\VignetteIndexEntry{Introduction to lighthouse.codebook} + %\VignetteEngine{knitr::rmarkdown} + %\VignetteEncoding{UTF-8} +--- + +```{r, include = FALSE} +knitr::opts_chunk$set( + eval = FALSE, + comment = "#", + out.width = "100%" +) +``` + +## Creating codebooks + +Creating a codebook involves two general steps: + +1. Create a “codebook” object in R from a data frame (and, optionally, metadata), + using `cb_create()` or a specialized variant (such as `cb_create_spss()` or `cb_create_redcap()`). + +2. Write the codebook to disk using `cb_write()`. + +```r +library(lighthouse.codebook) + +# create and write a codebook without metadata +dat |> + cb_create() |> + cb_write("cb.xlsx") + +# with metadata +dat |> + cb_create(metadata = dat1_metadata) |> + cb_write("cb.xlsx") + +# from SPSS data +dat_spss <- haven::read_sav("dat_spss.sav", user_na = TRUE) + +dat_spss |> + cb_create_spss() |> + cb_write("cb_spss.xlsx") + +# from REDCap data +dat_rc <- REDCapR::redcap_read(redcap_uri = rc_uri, token = rc_token) +meta_rc <- REDCapR::redcap_metadata_read(redcap_uri = rc_uri, token = rc_token) + +dat_rc$data |> + cb_create_redcap(metadata = meta_rc$data) |> + cb_write("cb_rc.xlsx") +``` +The codebook written to disk will include an overview tab listing all variables +in the dataset; summary tabs for numeric, categorical, and text variables; and, +if grouping variables are specified, grouped summary tabs for numeric and categorical +variables. 
+ +## Customizing codebooks + +There are many options for controlling how data is interpreted, summarized, and +presented. This section shows a few of the most useful options. Further options +are detailed in the documentation for `cb_create()` and `cb_write()`. + +### Grouped summaries + +Numeric and categorical data summaries can be grouped by one or more variables by +specifying them in the `group_by` argument to `cb_write()`. +```r +cb_create(data, metadata) |> + cb_write("cb.xlsx", group_by = treatment_group) + +cb_create(data, metadata) |> + cb_write("cb.xlsx", group_by = c(treatment_group, timepoint, age_group)) +``` + +By default, values for each subgroup are shown in separate columns, with decked +heads if more than one grouping variable is specified. However, some or all grouping +variables can instead be shown in rows using the `group_rows` argument. +```r +# show `treatment_group` in columns and `timepoint` in rows +cb_create(data, metadata) |> + cb_write( + "cb.xlsx", + group_by = c(treatment_group, timepoint), + group_rows = timepoint + ) +``` +Different row grouping behavior can be specified for numeric versus categorical +summary tabs using the `group_rows_numeric` and `group_rows_categorical` arguments. 
+```r +# for numeric summary, show `treatment_group` in columns and `timepoint` in rows; +# for categorical summary, show all grouping variables in columns +cb_create(data, metadata) |> + cb_write( + "cb.xlsx", + group_by = c(treatment_group, timepoint), + group_rows_numeric = timepoint + ) + +# for numeric summary, show all grouping variables in rows; +# for categorical summary, show `treatment_group` in rows +cb_create(data, metadata) |> + cb_write( + "cb.xlsx", + group_by = c(treatment_group, timepoint), + group_rows_numeric = c(treatment_group, timepoint), + group_rows_categorical = treatment_group + ) +``` + +### User missing values + +User missing values (also known as nonresponse codes, reserve codes, or special +values) can be specified using the `.user_missing` argument to `cb_create()`. +Missing values are specified using a formula or list of formulas, with variables +on the left-hand side (as names or [tidyselect](https://dplyr.tidyverse.org/reference/dplyr_tidy_select.html) +expressions) and values on the right-hand side. If the left-hand side is empty, +user missings will be set for all compatible variables in the dataset. + +```r +# set a single missing value for a single variable +cb <- cb_create(data, metadata, .user_missing = var1 ~ 99) + +# for variables `var1` through `var5` +cb <- cb_create(data, metadata, .user_missing = var1:var5 ~ c(98, 99)) + +# for all numeric variables, plus `var6` and `var7` +cb <- cb_create( + data, + metadata, + .user_missing = c(where(is.numeric), var6, var7) ~ c(-9, -8, -7) +) + +# for all compatible variables in dataset +cb <- cb_create(data, metadata, .user_missing = ~ c(98, 99)) +``` + +If the user missing values are named, the names will be treated as value labels +in data summaries. + +```r +cb <- cb_create( + data, + metadata, + .user_missing = var1:var5 ~ c("Declined" = 98, "Not applicable" = 99) +) +``` + +To apply different user missings for different variables, pass a list of formulas. 
+```r +cb <- cb_create( + data, + metadata, + .user_missing = list( + starts_with("status") ~ c("Declined" = 98, "Not applicable" = 99), + var7:var10 ~ c("Don't know" = -4, "Not applicable" = -5) + ) +) +``` +### Missing value handling + +On numeric summary tabs, missing values (including both user missing values and `NA`) +are dropped for computation of summary statistics. + +On ungrouped categorical and text summary tabs, by default, user missing values +are individually tabulated. For example, if `.user_missing = ~ c("Declined" = 98, +"Not applicable" = 99)`, then categorical and text summary tabs will include rows +giving counts for `"[98] Declined"` and `"[99] Not applicable"`. All user missing +values and `NA` can instead be collapsed into a single `"(Missing)"` row using the +`detail_missing` argument to `cb_write()`. +```r +dat |> + cb_create(.user_missing = ~ c("Declined" = 98, "Not applicable" = 99)) |> + cb_write("cb.xlsx", detail_missing = "no") +``` +Finally, user missing values are always collapsed (as though `detail_missing = "no"`) +on _grouped_ summary tabs. + +### Splitting long variable labels + +Variable labels for sets of related variables sometimes share a common prefix. Using +the `.split_var_labels` argument to `cb_create()`, this prefix can be extracted +into a separate column, making it easier to see at a glance what is unique about +each variable. + +For example, given a set of variable labels that all begin with `"What colors do +you like? Select all that apply: "`: + +| Name | Label | +| ---- | ----- | +| age | How old are you today? | +| colors1 | What colors do you like? Select all that apply: Red | +| colors2 | What colors do you like? Select all that apply: Green | +| colors3 | What colors do you like? Select all that apply: Blue | +| colors4 | What colors do you like? Select all that apply: Orange | +| height | What is your height in inches?
| + +You can split the labels for these variables, specifying them using a [tidyselect](https://dplyr.tidyverse.org/reference/dplyr_tidy_select.html) +expression: + +```r +cb_create( + data, + metadata, + .split_var_labels = starts_with("colors") + ) |> + cb_write("cb.xlsx") +``` + +| Name | Label Stem | Label | +| ---- | ---------- | ----- | +| age | | How old are you today? | +| colors1 | What colors do you like? Select all that apply: | Red | +| colors2 | What colors do you like? Select all that apply: | Green | +| colors3 | What colors do you like? Select all that apply: | Blue | +| colors4 | What colors do you like? Select all that apply: | Orange | +| height | | What is your height in inches? | + +Multiple sets of variables with common prefixes can be specified by passing a +list of tidyselect expressions. + +```r +cb_create( + data, + metadata, + .split_var_labels = list( + starts_with("colors"), + fav_food2:fav_food9, + c(rating1, rating4:rating7, rating9) + ) + ) |> + cb_write("cb.xlsx") +``` + +## Variable typing +Data summaries are produced for "numeric," "categorical," and "text" variables. For a given variable `x`, +* `x` is treated as categorical if (1) it is a factor, ordered factor, or logical vector, _or_ (2) it has associated value labels other than missing value codes (specified in metadata or, for SPSS data, in a `"haven_labelled"` vector). +* `x` is treated as numeric if (1) it is numeric (i.e., `is.numeric(x)` is `TRUE`) _and_ (2) it has no associated value labels other than missing value codes. +* `x` is treated as text if (1) it is a character vector _and_ (2) it has no associated value labels other than missing value codes. + +Thus, you can change how a variable is summarized by changing its class. For +instance, to get complete frequencies for a numeric or character variable, convert +it to a factor; to get only the top frequencies for a factor with many levels, convert +it to character. 
+ +Variables of other classes, such as dates, datetimes, and lists, are not currently +included on summary tabs. Summaries for dates and datetimes are planned for a future +release. + +## Other uses for the codebook object +The `"li_codebook"` object created by `cb_create()` will most commonly be +used to write an Excel codebook to disk using `cb_write()`. However, it can also +be used to create other objects in R. + +```r +# example data +q4_subset <- gain_q4 |> + subset(select = c(XPID, XOBS, XRA, B17, SU4a, SU4b, SU1f99v)) + +# create codebook +cb <- cb_create( + q4_subset, + metadata = q4_metadata, + .user_missing = ~ c("Not Asked" = -3, + "Missing" = -4, + "Confidential" = -6, + "Refused" = -7, + "Don't Know" = -8, + "Legitimate Skip" = -9) +) + +cb +# # A tibble: 7 × 6 +# name type label values user_missings missing +# +# 1 XPID text Participant ID NA [-9] Legitim… 0 +# 2 XOBS categorical Observation Wave [0] I… [-9] Legitim… 0 +# 3 XRA categorical Random assignment [0] C… [-9] Legitim… 0 +# 4 B17 categorical Pregnant [0] N… [-9] Legitim… 0.221 +# 5 SU4a numeric PPS - P90 days alcohol use NA [-9] Legitim… 0.394 +# 6 SU4b numeric PPS - P90 days drunk or 5+ d… NA [-9] Legitim… 0.442 +# 7 SU1f99v text QCS - P90 Days Other AOD Tx … NA [-9] Legitim… 0.923 +``` +### Extract transformed data +Use `cb_get_data()` to extract transformed data in one of two formats. `format += "factors"` yields a dataset with all variables with value labels converted to +factors and user missings converted to `NA`.
+```r +cb_get_data(cb, format = "factors") +# # A tibble: 104 × 7 +# XPID XOBS XRA B17 SU4a SU4b SU1f99v +# +# 1 001 Intake Treatment No NA NA NA +# 2 002 Intake Control No 10 4 NA +# 3 003 Intake Treatment No 10 1 Peer counselor +# 4 003 3-month Treatment No 39 15 NA +# 5 003 6-month Treatment No NA NA NA +# 6 004 Intake Control No 10 2 NA +# 7 004 3-month Control No 55 63 NA +# 8 004 6-month Control No 10 1 NA +# 9 005 Intake Control Yes 35 0 Social worker +# 10 005 3-month Control Yes 55 39 NA +# # ℹ 94 more rows +``` +Whereas `format = "haven"` yields a dataset with SPSS-style variable labels, value +labels, and user missings encoded using the `"haven_labelled_spss"` class. +```r +cb_get_data(cb, format = "haven") +# # A tibble: 104 × 7 +# XPID XOBS XRA B17 SU4a SU4b SU1f99v +# +# 1 001 0 [Intake] 1 [Treatment] 0 [No] -9 (NA) -9 (NA) -9 (NA) +# 2 002 0 [Intake] 0 [Control] 0 [No] 10 4 -9 (NA) +# 3 003 0 [Intake] 1 [Treatment] 0 [No] 10 1 Peer couns… +# 4 003 1 [3-month] 1 [Treatment] 0 [No] 39 15 -4 (NA) +# 5 003 2 [6-month] 1 [Treatment] 0 [No] -4 (NA) -4 (NA) -4 (NA) +# 6 004 0 [Intake] 0 [Control] 0 [No] 10 2 -9 (NA) +# 7 004 1 [3-month] 0 [Control] 0 [No] 55 63 -9 (NA) +# 8 004 2 [6-month] 0 [Control] 0 [No] 10 1 -9 (NA) +# 9 005 0 [Intake] 0 [Control] 1 [Yes] 35 0 Social wor… +# 10 005 1 [3-month] 0 [Control] 1 [Yes] 55 39 -9 (NA) +# # ℹ 94 more rows +``` +### Get data summaries +`cb_summarize_numeric()`, `cb_summarize_categorical()`, and `cb_summarize_text()` +return summaries for all variables of their respective types. These are the basis +of the summary tabs generated by `cb_write()`. 
+```r +cb_summarize_numeric(cb) +# # A tibble: 2 × 8 +# name label valid_n valid_pct mean SD median MAD +# +# 1 SU4a PPS - P90 days alcohol use 63 0.606 23.4 21.2 20 25.2 +# 2 SU4b PPS - P90 days drunk or 5+ d… 58 0.558 8.09 13.6 2 2.97 +# # ℹ 5 more variables: min , max , range , skew , kurt + +cb_summarize_categorical(cb) +# # A tibble: 9 × 7 +# name label is_missing value n pct_of_all pct_of_valid +# +# 1 XOBS Observation Wave FALSE [0] Intake 42 0.404 0.404 +# 2 XOBS Observation Wave FALSE [1] 3-month 34 0.327 0.327 +# 3 XOBS Observation Wave FALSE [2] 6-month 28 0.269 0.269 +# 4 XRA Random assignment FALSE [0] Control 50 0.481 0.481 +# 5 XRA Random assignment FALSE [1] Treatment 54 0.519 0.519 +# 6 B17 Pregnant FALSE [1] Yes 6 0.0577 0.0741 +# 7 B17 Pregnant FALSE [0] No 75 0.721 0.926 +# 8 B17 Pregnant TRUE [-9] Legitim… 22 0.212 NA +# 9 B17 Pregnant TRUE [-4] Missing 1 0.00962 NA +# # ℹ 1 more variable: pct_of_missing + +cb_summarize_text(cb) +# # A tibble: 14 × 7 +# name label is_missing unique_n value n pct_of_all +# +# 1 XPID Participant ID FALSE 42 003 3 0.0288 +# 2 XPID Participant ID FALSE 42 004 3 0.0288 +# 3 XPID Participant ID FALSE 42 005 3 0.0288 +# 4 XPID Participant ID FALSE 42 006 3 0.0288 +# 5 XPID Participant ID FALSE 42 010 3 0.0288 +# 6 XPID Participant ID FALSE 42 (37 … 89 0.856 +# 7 SU1f99v QCS - P90 Days Other AOD … FALSE 8 AA 1 0.00962 +# 8 SU1f99v QCS - P90 Days Other AOD … FALSE 8 Alco… 1 0.00962 +# 9 SU1f99v QCS - P90 Days Other AOD … FALSE 8 Case… 1 0.00962 +# 10 SU1f99v QCS - P90 Days Other AOD … FALSE 8 Group 1 0.00962 +# 11 SU1f99v QCS - P90 Days Other AOD … FALSE 8 NA 1 0.00962 +# 12 SU1f99v QCS - P90 Days Other AOD … FALSE 8 (3 o… 3 0.0288 +# 13 SU1f99v QCS - P90 Days Other AOD … TRUE NA [-4]… 44 0.423 +# 14 SU1f99v QCS - P90 Days Other AOD … TRUE NA [-9]… 52 0.5 +# # ℹ 2 more variables: pct_of_valid , pct_of_missing +``` +`cb_summarize_numeric()` and `cb_summarize_categorical()` can also return grouped +summaries: +```r
+cb_summarize_numeric(cb, group_by = XOBS) +# # A tibble: 6 × 8 +# XOBS name label valid_n valid_pct mean SD median +# +# 1 Intake SU4a PPS - P90 days alcohol use 30 0.714 28.8 20.1 26.5 +# 2 3-month SU4a PPS - P90 days alcohol use 20 0.588 21.4 21.0 12.5 +# 3 6-month SU4a PPS - P90 days alcohol use 13 0.464 13.9 21.5 6 +# 4 Intake SU4b PPS - P90 days drunk or 5… 30 0.714 7.13 10.7 4 +# 5 3-month SU4b PPS - P90 days drunk or 5… 15 0.441 16.1 20.0 3 +# 6 6-month SU4b PPS - P90 days drunk or 5… 13 0.464 1 0.707 1 +# # ℹ 6 more variables: MAD , min , max , range , skew , +# # kurt + +cb_summarize_categorical(cb, group_by = XRA) +# # A tibble: 12 × 7 +# XRA name label value n pct_of_all pct_of_valid +# +# 1 Control XOBS Observation Wave [0] Intake 20 0.4 0.4 +# 2 Control XOBS Observation Wave [1] 3-month 16 0.32 0.32 +# 3 Control XOBS Observation Wave [2] 6-month 14 0.28 0.28 +# 4 Treatment XOBS Observation Wave [0] Intake 22 0.407 0.407 +# 5 Treatment XOBS Observation Wave [1] 3-month 18 0.333 0.333 +# 6 Treatment XOBS Observation Wave [2] 6-month 14 0.259 0.259 +# 7 Control B17 Pregnant [1] Yes 3 0.06 0.0811 +# 8 Control B17 Pregnant [0] No 34 0.68 0.919 +# 9 Control B17 Pregnant (Missing) 13 0.26 NA +# 10 Treatment B17 Pregnant [1] Yes 3 0.0556 0.0682 +# 11 Treatment B17 Pregnant [0] No 41 0.759 0.932 +# 12 Treatment B17 Pregnant (Missing) 10 0.185 NA +```