From 656ad7af52a7468dea5d10ed50a16fbdb3dccf50 Mon Sep 17 00:00:00 2001 From: statzhero Date: Tue, 31 Mar 2026 18:37:37 +0200 Subject: [PATCH 1/7] feat: Add tidy-r skill for modern tidyverse R development --- .claude-plugin/marketplace.json | 9 + tidyverse/README.md | 6 +- tidyverse/tidy-r/SKILL.md | 210 +++++++++++++++++ .../tidy-r/references/grouping-examples.md | 175 ++++++++++++++ tidyverse/tidy-r/references/join-examples.md | 123 ++++++++++ .../tidy-r/references/migration-examples.md | 165 ++++++++++++++ .../references/recode-replace-examples.md | 188 +++++++++++++++ .../tidy-r/references/stringr-examples.md | 102 +++++++++ .../tidy-r/references/tidyverse-style.md | 215 ++++++++++++++++++ 9 files changed, 1192 insertions(+), 1 deletion(-) create mode 100644 tidyverse/tidy-r/SKILL.md create mode 100644 tidyverse/tidy-r/references/grouping-examples.md create mode 100644 tidyverse/tidy-r/references/join-examples.md create mode 100644 tidyverse/tidy-r/references/migration-examples.md create mode 100644 tidyverse/tidy-r/references/recode-replace-examples.md create mode 100644 tidyverse/tidy-r/references/stringr-examples.md create mode 100644 tidyverse/tidy-r/references/tidyverse-style.md diff --git a/.claude-plugin/marketplace.json b/.claude-plugin/marketplace.json index 4edc581..fb80891 100644 --- a/.claude-plugin/marketplace.json +++ b/.claude-plugin/marketplace.json @@ -76,6 +76,15 @@ "./quarto/quarto-authoring", "./quarto/quarto-alt-text" ] + }, + { + "name": "tidyverse", + "description": "Collection of skills for tidyverse-style R development", + "source": "./", + "strict": false, + "skills": [ + "./tidyverse/tidy-r" + ] } ] } diff --git a/tidyverse/README.md b/tidyverse/README.md index e840b29..d58365d 100644 --- a/tidyverse/README.md +++ b/tidyverse/README.md @@ -1,6 +1,10 @@ # Tidyverse Skills -Skills specific to using tidyverse packages and tidyverse-specific package development patterns. 
+Skills for tidyverse-style R development, covering modern patterns, style guidelines, and best practices. + +## Skills + +- **[tidy-r](./tidy-r/)** - Modern tidyverse patterns, style guide, and migration guidance for R development. Covers native pipe usage, join_by() syntax, .by grouping, pick/across/reframe operations, filter_out/when_any/when_all, recode_values/replace_values/replace_when, tidy selection, stringr patterns, naming conventions, and migration from base R or older tidyverse APIs. ## Potential Skills diff --git a/tidyverse/tidy-r/SKILL.md b/tidyverse/tidy-r/SKILL.md new file mode 100644 index 0000000..325d5eb --- /dev/null +++ b/tidyverse/tidy-r/SKILL.md @@ -0,0 +1,210 @@ +--- +name: tidy-r +description: | + Modern tidyverse patterns, style guide, and migration guidance for R development. Use when writing, reviewing, or refactoring tidyverse code. Covers native pipe, join_by(), .by grouping, pick/across/reframe, filter_out/when_any/when_all, recode_values/replace_values/replace_when, tidy selection, stringr, naming conventions, and migration from base R or older tidyverse APIs. +metadata: + r_version: "4.5+" + tidyverse_version: "2.0+" + dplyr_version: "1.2+" +allowed-tools: Read, Edit, Write, Grep, Glob, Bash, mcp__r-btw__* +--- + +# Writing Modern Tidyverse R + +This skill covers modern tidyverse patterns for R 4.5+ and tidyverse 2.0+, style guidelines, and migration from legacy patterns. + +## Core philosophy + +R's tidyverse evolves. Code from blog posts and StackOverflow often uses deprecated APIs, magrittr pipes, or base R patterns where a modern tidyverse function exists. This skill encodes the current recommended approach so the model writes code that experienced R developers would recognize as idiomatic. 
+ +## When to use this skill + +- Writing new R code with dplyr, tidyr, stringr, purrr, or other tidyverse packages +- Reviewing or refactoring existing R code for modern patterns +- Migrating from base R, magrittr pipes, or older tidyverse APIs +- Applying tidyverse style conventions (naming, spacing, error handling) +- Choosing between similar functions (e.g., `case_when` vs `recode_values`) +- Working with joins, grouping, recoding, or string manipulation in R + +## When NOT to use this skill + +- Writing data.table code (different paradigm) +- Pure base R projects that intentionally avoid tidyverse +- Shiny UI/server logic (use a Shiny-specific skill) +- Package development internals (NAMESPACE, DESCRIPTION, roxygen) +- ggplot2 visualization (use the socviz skill) +- Statistical modeling or Bayesian analysis + +## Instructions + +When you receive a request, classify it and consult the appropriate reference: + +### Step 1: Classify the request + +| Category | Reference file | Trigger | +|----------|---------------|---------| +| **Joins** | [join-examples.md](references/join-examples.md) | Merging data, `*_join`, `join_by`, matching rows, lookup tables | +| **Grouping & columns** | [grouping-examples.md](references/grouping-examples.md) | `.by`, `group_by`, `across`, `pick`, `reframe`, column operations | +| **Recoding & replacing** | [recode-replace-examples.md](references/recode-replace-examples.md) | `case_when`, `recode_values`, `replace_values`, `replace_when`, `filter_out`, `when_any`, `when_all`, recoding, replacing, conditional updates | +| **Strings** | [stringr-examples.md](references/stringr-examples.md) | String manipulation, regex, `str_*` functions, text processing | +| **Style** | [tidyverse-style.md](references/tidyverse-style.md) | Naming, formatting, spacing, error messages, `cli::cli_abort` | +| **Migration** | [migration-examples.md](references/migration-examples.md) | Updating old code, base R conversion, deprecated functions | + +### Step 2: 
Read the reference file(s) + +Use the Read tool to load the relevant reference. For requests that span multiple categories (e.g., "rewrite this old code" touches migration + style), read multiple files. + +### Step 3: Apply core principles + +1. **Use modern tidyverse patterns** - Prioritize dplyr 1.2+ features, native pipe, and current APIs +2. **Write readable code first** - Optimize only when necessary +3. **Follow tidyverse style guide** - Consistent naming, spacing, and structure +4. **Use R MCP tools** - Automatically resolve function documentation and library references without being asked. If the `mcp__r-btw__*` tools are unavailable, fall back to running R help via Bash (see below) + +### R documentation lookup fallback + +When `mcp__r-btw__*` tools are available, use them to look up function signatures, help pages, and package docs. When they are not available (e.g., the r-btw MCP server is not configured), fall back to Bash: + +```bash +# Help page for a function +Rscript --vanilla -e '?dplyr::recode_values' 2>/dev/null || Rscript --vanilla -e 'utils::help("recode_values", package = "dplyr")' + +# Function signature / arguments +Rscript --vanilla -e 'args(dplyr::recode_values)' + +# List exported functions in a package +Rscript --vanilla -e 'ls("package:dplyr")' + +# Check if a package is installed +Rscript --vanilla -e 'requireNamespace("dplyr", quietly = TRUE)' +``` + +### Step 4: Write the code + +Follow the quick reference and anti-patterns below. When in doubt, consult the reference files. + +## Quick reference + +### Pipe and lambda + +- Always `|>`, never `%>%` +- Always `\(x)`, never `function(x)` or `~` in map/keep/etc. + +### Code organization + +Use newspaper style: high-level logic first, helpers below. Don't define functions inside other functions unless they are very brief. + +### Grouping + +- Use `.by` for per-operation grouping, never `group_by() |> ... 
|> ungroup()` +- Place `.by` on its own line for readability + +### Joins + +- Use `join_by()`, never `c("a" = "b")` +- Use `relationship`, `unmatched`, `na_matches` for quality control + +### Recoding and replacing (dplyr 1.2+) + +| Task | Function | +|------|----------| +| Recode values (new column) | `recode_values()` | +| Replace values in place | `replace_values()` | +| Conditional update in place | `replace_when()` | +| Complex conditional (new column) | `case_when()` | +| Drop rows (NA-safe) | `filter_out()` | +| OR conditions | `when_any()` | +| AND conditions | `when_all()` | + +### Error handling + +Use `cli::cli_abort()` with problem statement + bullets, never `stop()`. + +### R idioms + +- `TRUE`/`FALSE`, never `T`/`F` +- `message()` for info, never `cat()` +- `map_*()` over `sapply()` for type stability +- `set.seed()` with date-time, never 42 +- `qs2::qs_save()`/`qs2::qs_read()`, never `qs` + +## Anti-patterns + +| Avoid | Use instead | +|-------|-------------| +| `%>%` | `|>` | +| `function(x)` or `~` | `\(x)` | +| `by = c("a" = "b")` | `by = join_by(a == b)` | +| `multiple = "error"` in joins | `relationship = "many-to-one"` (or `"one-to-one"`) | +| `sapply()` | `map_*()` (type-stable) | +| `group_by() \|> ... 
\|> ungroup()` | `.by` argument | +| `cat()` for messages | `message()` or `cli::cli_inform()` | +| `stop()` for errors | `cli::cli_abort()` | +| `distinct(id)` | `distinct(id, .keep_all = TRUE)` | +| `mean(x, na.rm = TRUE)` | `mean(x)` with tidyna loaded | +| `case_match(x, ...)` | `recode_values(x, ...)` | +| `recode(x, ...)` | `recode_values(x, ...)` or `replace_values(x, ...)` | +| `filter(x != val \| is.na(x))` | `filter_out(x == val)` | +| `coalesce(x, default)` | `replace_values(x, NA ~ default)` | +| `na_if(x, val)` | `replace_values(x, val ~ NA)` | +| `qs::qsave()` / `qs::qread()` | `qs2::qs_save()` / `qs2::qs_read()` | + +## Complete workflow example + +```r +library(tidyverse) + +# Read and clean data +sales <- read_csv("data/sales.csv") |> + rename( + region = Region, + product = Product, + revenue = Revenue, + date = Date + ) |> + mutate( + quarter = quarter(date), + product = product |> + replace_values( + c("Widget A", "WidgetA") ~ "Widget A", + c("Widget B", "WidgetB") ~ "Widget B" + ) + ) |> + filter_out(is.na(revenue)) + +# Enrich with lookup table +sales_enriched <- sales |> + left_join( + regions, + by = join_by(region == region_code), + unmatched = "error" + ) + +# Summarise by group +quarterly <- sales_enriched |> + summarise( + total_revenue = sum(revenue), + avg_revenue = mean(revenue), + n_transactions = n(), + .by = c(region_name, quarter) + ) |> + mutate( + performance = case_when( + total_revenue > 100000 ~ "high", + total_revenue > 50000 ~ "medium", + .default = "low" + ) + ) |> + arrange(region_name, quarter) +``` + +## Best practices + +1. **Use `.unmatched = "error"`** in `case_when()` (`unmatched = "error"` in `recode_values()`) for defensive programming +4. **Place `.by` on its own line** for readability +5. **Prefer `filter_out()` over negated `filter()`** for NA-safe row removal +6. **Use `recode_values()` over `case_match()`** (dplyr 1.2+ preferred API) +7. **Use `replace_when()` over `case_when()` with `.default`** when updating a column in place +8. 
**Name variables as nouns, functions as verbs** in snake_case +9. **Explain "why" in comments**, not "what" +10. **Use `qs2` for serialization** with `.qs2` extension diff --git a/tidyverse/tidy-r/references/grouping-examples.md b/tidyverse/tidy-r/references/grouping-examples.md new file mode 100644 index 0000000..d5ed53d --- /dev/null +++ b/tidyverse/tidy-r/references/grouping-examples.md @@ -0,0 +1,175 @@ +# Modern Grouping and Column Operations (dplyr 1.1+) + +## Per-operation grouping with .by + +The `.by` argument replaces the old `group_by() |> ... |> ungroup()` pattern. Results are always ungrouped. + +### Basic usage + +```r +data |> + summarise( + mean_value = mean(value), + .by = category + ) +``` + +### Multiple grouping variables + +```r +data |> + summarise( + total = sum(revenue), + .by = c(company, year) + ) +``` + +### .by with mutate (window functions) + +```r +data |> + mutate( + pct_of_group = revenue / sum(revenue), + rank = row_number(desc(revenue)), + .by = region + ) +``` + +### .by with filter (group-level filtering) + +```r +data |> + filter( + revenue == max(revenue), + .by = region + ) +``` + +### Place .by on its own line + +```r +# Good - readable +data |> + summarise( + mean_value = mean(value), + .by = category + ) + +# Avoid - crammed +data |> + summarise(mean_value = mean(value), .by = category) +``` + +### Avoid - old persistent grouping pattern + +```r +# Avoid +data |> + group_by(category) |> + summarise(mean_value = mean(value)) |> + ungroup() +``` + +## pick() for column selection + +Use `pick()` inside data-masking functions to select columns by name or tidyselect helpers: + +```r +data |> + summarise( + n_x_cols = ncol(pick(starts_with("x"))), + n_y_cols = ncol(pick(starts_with("y"))) + ) +``` + +### pick() to pass selected columns to functions + +```r +data |> + mutate( + row_mean = rowMeans(pick(where(is.numeric))) + ) +``` + +## across() for applying functions + +Apply one or more functions to multiple columns: + +### 
Single function + +```r +data |> + summarise( + across(where(is.numeric), \(x) mean(x)), + .by = group + ) +``` + +### Multiple functions with naming + +```r +data |> + summarise( + across( + c(revenue, cost), + list(mean = \(x) mean(x), sd = \(x) sd(x)), + .names = "{.fn}_{.col}" + ), + .by = region + ) +``` + +### Conditional transformation + +```r +data |> + mutate( + across(where(is.character), str_to_lower) + ) +``` + +## reframe() for multi-row results + +When a summary returns multiple rows per group, use `reframe()` instead of `summarise()`: + +```r +data |> + reframe( + quantile = c(0.25, 0.50, 0.75), + value = quantile(x, c(0.25, 0.50, 0.75)), + .by = group + ) +``` + +## Data masking vs tidy selection + +Understand the difference for writing functions: + +- **Data masking** (`arrange`, `filter`, `mutate`, `summarise`): expressions evaluated in data context +- **Tidy selection** (`select`, `relocate`, `across`, `pick`): column selection helpers + +### Embrace with {{ }} for function arguments + +```r +my_summary <- function(data, summary_var) { + data |> + summarise(mean_val = mean({{ summary_var }})) +} +``` + +### Character vectors use .data[[]] + +```r +for (var in names(mtcars)) { + mtcars |> count(.data[[var]]) |> print() +} +``` + +### Multiple columns use across() + +```r +my_summary <- function(data, summary_vars) { + data |> + summarise(across({{ summary_vars }}, \(x) mean(x))) +} +``` diff --git a/tidyverse/tidy-r/references/join-examples.md b/tidyverse/tidy-r/references/join-examples.md new file mode 100644 index 0000000..ffb0a7a --- /dev/null +++ b/tidyverse/tidy-r/references/join-examples.md @@ -0,0 +1,123 @@ +# Modern Join Syntax (dplyr 1.1+) + +## Use join_by() instead of character vectors + +### Equality joins + +```r +transactions |> + inner_join(companies, by = join_by(company == id)) +``` + +### Same-name columns + +```r +# When both tables share a column name, use a single name +orders |> + left_join(customers, by = 
join_by(customer_id)) +``` + +### Inequality joins + +```r +transactions |> + inner_join(companies, by = join_by(company == id, year >= since)) +``` + +### Rolling joins (closest match) + +```r +transactions |> + inner_join(companies, by = join_by(company == id, closest(year >= since))) +``` + +### Overlap joins + +```r +# Find events during each interval +intervals |> + inner_join(events, by = join_by(start <= time, end >= time)) +``` + +### Avoid - Old character vector syntax + +```r +# Avoid +transactions |> + inner_join(companies, by = c("company" = "id")) +``` + +## Relationship and match handling + +### Enforce expected cardinality with relationship + +```r +# 1:1 - each row matches at most one row in the other table +inner_join(x, y, by = join_by(id), relationship = "one-to-one") + +# Many-to-one - many x rows can match one y row (lookup pattern) +left_join(x, y, by = join_by(id), relationship = "many-to-one") + +# One-to-many +inner_join(x, y, by = join_by(id), relationship = "one-to-many") +``` + +### Ensure all rows match + +```r +inner_join(x, y, by = join_by(id), unmatched = "error") +``` + +### Prevent NA matching (recommended) + +```r +# By default, NA matches NA in joins -- usually not desired +left_join(x, y, by = join_by(id), na_matches = "never") +``` + +### Combining guards for production code + +```r +sales |> + left_join( + products, + by = join_by(product_id), + relationship = "many-to-one", + unmatched = "error", + na_matches = "never" + ) +``` + +## Logging joins with tidylog + +Use `tidylog::` prefix for joins to verify expected behavior. Call directly without loading the package. 
+ +```r +result <- transactions |> + tidylog::left_join(companies, by = join_by(company == id)) + +# tidylog output: +# left_join: added 2 columns (name, region) +# > rows only in x 12 +# > rows only in y (3) +# > matched rows 988 +# > rows total 1000 +``` + +### Interpreting join output + +| Output | Meaning | +|--------|---------| +| `rows only in x` | Rows in left table with no match (kept as NA in left joins) | +| `rows only in y` | Rows in right table with no match (in parentheses, dropped in left joins) | +| `matched rows` | Rows that matched between tables | +| `rows total` | Final row count after join | + +### When to use tidylog + +- **Always for joins** to see how many rows matched, duplicated, or were dropped +- **Critical filters** with `tidylog::filter()` to verify expected row counts +- **Critical mutates** with `tidylog::mutate()` to verify expected changes +- **Any operation where silent data loss is a risk** + +Don't use tidylog in production code, inside functions, or loops where output would be too verbose. It's for interactive verification only. 
diff --git a/tidyverse/tidy-r/references/migration-examples.md b/tidyverse/tidy-r/references/migration-examples.md new file mode 100644 index 0000000..0cd6cf9 --- /dev/null +++ b/tidyverse/tidy-r/references/migration-examples.md @@ -0,0 +1,165 @@ +# Migration: Base R and Old Tidyverse to Modern Patterns + +## Base R to Modern Tidyverse + +### Data manipulation + +```r +subset(data, condition) # -> filter(data, condition) +data[order(data$x), ] # -> arrange(data, x) +aggregate(x ~ y, data, mean) # -> summarise(data, mean(x), .by = y) +merge(x, y, by = "id") # -> inner_join(x, y, by = join_by(id)) +``` + +### Functional programming + +```r +sapply(x, f) # -> map_dbl(x, f) # or map_chr()/map_int()/map_lgl(); type-stable +lapply(x, f) # -> map(x, f) +vapply(x, f, numeric(1)) # -> map_dbl(x, f) +``` + +### String manipulation + +```r +grepl("pattern", text) # -> str_detect(text, "pattern") +gsub("old", "new", text) # -> str_replace_all(text, "old", "new") +substr(text, 1, 5) # -> str_sub(text, 1, 5) +nchar(text) # -> str_length(text) +strsplit(text, ",") # -> str_split(text, ",") +tolower(text) # -> str_to_lower(text) +sprintf("Hello %s", name) # -> str_glue("Hello {name}") +``` + +## Old to New Tidyverse Patterns + +### Pipes + +```r +data %>% f() # -> data |> f() +``` + +### Anonymous functions + +```r +map(x, function(x) x + 1) # -> map(x, \(x) x + 1) +map(x, ~ .x + 1) # -> map(x, \(x) x + 1) +``` + +### Grouping (dplyr 1.1+) + +```r +group_by(data, x) |> + summarise(mean(y)) |> + ungroup() # -> summarise(data, mean(y), .by = x) +``` + +### Joins + +```r +by = c("a" = "b") # -> by = join_by(a == b) +``` + +### Column selection + +```r +across(starts_with("x")) # -> pick(starts_with("x")) # for selection only +``` + +### Multi-row summaries + +```r +summarise(data, x, .groups = "drop") # -> reframe(data, x) +``` + +### Data reshaping + +```r +gather()/spread() # -> pivot_longer()/pivot_wider() +``` + +### String separation (tidyr 1.3+) + +```r +separate(col, into = c("a", "b")) +# -> 
separate_wider_delim(col, delim = "_", names = c("a", "b")) + +extract(col, into = "x", regex) +# -> separate_wider_regex(col, patterns = c(x = regex)) +``` + +### Superseded purrr functions (purrr 1.0+) + +```r +map_dfr(x, f) # -> map(x, f) |> list_rbind() +map_dfc(x, f) # -> map(x, f) |> list_cbind() +map2_dfr(x, y, f) # -> map2(x, y, f) |> list_rbind() +pmap_dfr(list, f) # -> pmap(list, f) |> list_rbind() +imap_dfr(x, f) # -> imap(x, f) |> list_rbind() +``` + +### Recoding and replacing (dplyr 1.2+) + +```r +case_match(x, val ~ result) # -> recode_values(x, val ~ result) +recode(x, old = "new") # -> recode_values(x, "old" ~ "new") + # or replace_values(x, "old" ~ "new") + +# Conditional replacement: case_when with .default = x -> replace_when +case_when( + cond1 ~ val1, + cond2 ~ val2, + .default = x +) # -> x |> replace_when(cond1 ~ val1, cond2 ~ val2) + +# NA handling +coalesce(x, default) # -> replace_values(x, NA ~ default) +na_if(x, val) # -> replace_values(x, val ~ NA) +tidyr::replace_na(x, default) # -> replace_values(x, NA ~ default) +``` + +### Filter family (dplyr 1.2+) + +```r +# Dropping rows with NA-safe negation +filter(x != val | is.na(x)) # -> filter_out(x == val) + +# Combining conditions with OR +filter(cond1 | cond2 | cond3) # -> filter(when_any(cond1, cond2, cond3)) + +# Combining conditions with AND (explicit) +filter(cond1 & cond2 & cond3) # -> filter(when_all(cond1, cond2, cond3)) +``` + +### Serialization + +```r +qs::qsave(x, "file.qs") # -> qs2::qs_save(x, "file.qs2") +qs::qread("file.qs") # -> qs2::qs_read("file.qs2") +``` + +### Defunct in dplyr 1.2 (now errors) + +```r +# Underscored SE verbs (defunct since 1.2, deprecated since 0.7) +mutate_() # -> mutate() with modern programming +filter_() # -> filter() +summarise_() # -> summarise() +# ... 
all *_() variants + +# _each variants (defunct since 1.2, deprecated since 0.7) +mutate_each() # -> mutate(across(...)) +summarise_each() # -> summarise(across(...)) + +# Multi-row summarise (defunct since 1.2, deprecated since 1.1) +summarise(data, x) # -> reframe(data, x) for multi-row results +``` + +### For side effects + +```r +for (x in xs) write_file(x) # -> walk(xs, write_file) +for (i in seq_along(data)) { + write_csv(data[[i]], paths[[i]]) +} # -> walk2(data, paths, write_csv) +``` diff --git a/tidyverse/tidy-r/references/recode-replace-examples.md b/tidyverse/tidy-r/references/recode-replace-examples.md new file mode 100644 index 0000000..6c95616 --- /dev/null +++ b/tidyverse/tidy-r/references/recode-replace-examples.md @@ -0,0 +1,188 @@ +# Recoding, Replacing, and Filtering (dplyr 1.2+) + +dplyr 1.2 introduced a family of functions for recoding and replacing values, and for NA-safe filtering. These replace older patterns (`case_match`, `recode`, `coalesce`, `na_if`, negated filters). + +## The recode/replace family + +| | **Recoding** (new column) | **Replacing** (update in place) | +|---------------------------|---------------------------|---------------------------------| +| **Match with conditions** | `case_when()` | `replace_when()` | +| **Match with values** | `recode_values()` | `replace_values()` | + +## recode_values() + +Use instead of `case_match()` or repetitive `case_when()` with `==`. 
+ +### Formula interface + +```r +score |> + recode_values( + 1 ~ "Strongly disagree", + 2 ~ "Disagree", + 3 ~ "Neutral", + 4 ~ "Agree", + 5 ~ "Strongly agree" + ) +``` + +### Lookup table interface + +```r +likert |> + mutate(score = recode_values(score, from = lookup$from, to = lookup$to)) +``` + +### With unmatched = "error" for safety + +```r +# Errors if any value has no match +score |> + recode_values( + 1 ~ "Low", + 2 ~ "Medium", + 3 ~ "High", + unmatched = "error" + ) +``` + +### Avoid + +```r +# Avoid - repetitive case_when with == +case_when(score == 1 ~ "Strongly disagree", score == 2 ~ "Disagree", ...) + +# Avoid - case_match() is soft-deprecated in dplyr 1.2 +case_match(score, 1 ~ "Strongly disagree", 2 ~ "Disagree", ...) + +# Avoid - recode() is soft-deprecated +recode(score, `1` = "Strongly disagree", `2` = "Disagree", ...) +``` + +## replace_values() + +Use for partial updates by value. Unmatched values pass through unchanged. + +### Replace specific values + +```r +name |> + replace_values( + c("UNC", "Chapel Hill") ~ "UNC Chapel Hill", + c("Duke", "Duke University") ~ "Duke" + ) +``` + +### Replace NA (replaces coalesce/tidyr::replace_na) + +```r +x |> replace_values(NA ~ 0) +``` + +### Convert sentinel values to NA (replaces na_if) + +```r +x |> replace_values(from = c(0, -99), to = NA) +``` + +## replace_when() + +Use for conditional updates. Type-stable on the input; unmatched values pass through unchanged. + +### Conditional updates + +```r +racers |> + mutate( + time = time |> + replace_when( + id %in% id_banned ~ NA, + id %in% id_penalty ~ time + 1/3 + ) + ) +``` + +### Avoid - case_when with .default + +```r +# Avoid - buries the primary input, loses type info +mutate(time = case_when( + id %in% id_banned ~ NA, + id %in% id_penalty ~ time + 1/3, + .default = time +)) +``` + +## case_when() with .unmatched = "error" + +Still the right choice for complex conditional recoding into a new column. 
Use `.unmatched = "error"` for safety: + +```r +tier <- case_when( + time < 23 ~ "A", + time < 27 ~ "B", + time < 30 ~ "C", + .unmatched = "error" +) +``` + +## filter_out() + +NA-safe row removal. Treats `NA` as `FALSE`, so you don't accidentally drop NA rows: + +```r +# Good - clear intent, NA-safe +data |> filter_out(deceased, date < 2012) + +# Avoid - easy to get wrong with NA +data |> filter(!(deceased & date < 2012) | is.na(deceased) | is.na(date)) +``` + +## when_any() and when_all() + +Combine conditions with comma-separated syntax instead of `|` and `&`: + +### OR conditions + +```r +data |> + filter(when_any( + name %in% c("US", "CA") & between(score, 200, 300), + name %in% c("PR", "RU") & between(score, 100, 200) + )) +``` + +### Drop rows matching any condition + +```r +data |> + filter_out(when_any( + is.na(value), + status == "invalid" + )) +``` + +### AND conditions + +```r +data |> + filter(when_all( + score > 50, + !is.na(region), + status == "active" + )) +``` + +## Migration quick reference + +| Old pattern | New pattern | +|-------------|-------------| +| `case_match(x, val ~ result)` | `recode_values(x, val ~ result)` | +| `recode(x, old = "new")` | `recode_values(x, "old" ~ "new")` | +| `case_when(..., .default = x)` | `x \|> replace_when(...)` | +| `coalesce(x, default)` | `replace_values(x, NA ~ default)` | +| `na_if(x, val)` | `replace_values(x, val ~ NA)` | +| `tidyr::replace_na(x, default)` | `replace_values(x, NA ~ default)` | +| `filter(x != val \| is.na(x))` | `filter_out(x == val)` | +| `filter(c1 \| c2 \| c3)` | `filter(when_any(c1, c2, c3))` | +| `filter(c1 & c2 & c3)` | `filter(when_all(c1, c2, c3))` | diff --git a/tidyverse/tidy-r/references/stringr-examples.md b/tidyverse/tidy-r/references/stringr-examples.md new file mode 100644 index 0000000..a56c5a9 --- /dev/null +++ b/tidyverse/tidy-r/references/stringr-examples.md @@ -0,0 +1,102 @@ +# String Manipulation with stringr + +Use stringr over base R string functions. 
Benefits: consistent `str_` prefix, string-first argument order, pipe-friendly and vectorized. + +## Core patterns + +### Pipe-friendly chaining + +```r +text |> + str_to_lower() |> + str_trim() |> + str_replace_all("pattern", "replacement") |> + str_extract("\\d+") +``` + +### Detection and extraction + +```r +str_detect(text, "pattern") # logical: does it match? +str_which(text, "pattern") # integer: which elements match? +str_count(text, "pattern") # integer: how many matches? +str_extract(text, "pattern") # first match +str_extract_all(text, "pattern") # all matches (returns list) +str_match(text, "(\\w+)@(\\w+)") # capture groups as matrix +``` + +### Replacement + +```r +str_replace(text, "old", "new") # first occurrence +str_replace_all(text, "old", "new") # all occurrences +str_remove(text, "pattern") # remove first match +str_remove_all(text, "pattern") # remove all matches +``` + +### Splitting and combining + +```r +str_split(text, ",") # split into list +str_split_fixed(text, ",", n = 3) # split into matrix (fixed columns) +str_split_i(text, ",", i = 2) # extract ith piece directly +str_c("a", "b", "c", sep = "-") # combine with separator +str_flatten(words, collapse = ", ") # collapse vector to single string +``` + +### Substring operations + +```r +str_sub(text, 1, 5) # extract positions 1-5 +str_sub(text, -3) # last 3 characters +str_length(text) # character count +str_trunc(text, 20) # truncate with ellipsis +``` + +### Formatting + +```r +str_to_lower(text) # lowercase +str_to_upper(text) # uppercase +str_to_title(text) # title case +str_to_sentence(text) # sentence case +str_trim(text) # remove leading/trailing whitespace +str_squish(text) # trim + collapse internal whitespace +str_pad(text, 10, side = "left") # pad to fixed width +str_wrap(text, width = 80) # word wrap +``` + +### Interpolation + +```r +str_glue("Hello {name}, you scored {score}!") +str_glue_data(df, "{name}: {value}") +``` + +## Pattern helpers + +Use these for clarity about 
what kind of matching you intend: + +```r +str_detect(text, fixed("$")) # literal match (no regex) +str_detect(text, regex("\\d+")) # explicit regex (default) +str_detect(text, regex("hello", ignore_case = TRUE)) # case-insensitive +str_detect(text, coll("e", locale = "fr")) # locale-aware collation +str_detect(text, boundary("word")) # word boundaries +``` + +## stringr vs base R + +| stringr | base R | Notes | +|---------|--------|-------| +| `str_detect(text, "pat")` | `grepl("pat", text)` | Argument order differs | +| `str_extract(text, "pat")` | `regmatches(text, regexpr(...))` | Much simpler | +| `str_replace_all(text, "a", "b")` | `gsub("a", "b", text)` | Argument order differs | +| `str_split(text, ",")` | `strsplit(text, ",")` | | +| `str_length(text)` | `nchar(text)` | | +| `str_sub(text, 1, 5)` | `substr(text, 1, 5)` | | +| `str_to_lower(text)` | `tolower(text)` | | +| `str_to_upper(text)` | `toupper(text)` | | +| `str_to_title(text)` | `tools::toTitleCase(text)` | | +| `str_trim(text)` | `trimws(text)` | | +| `str_glue("Hello {x}")` | `sprintf("Hello %s", x)` | More readable | diff --git a/tidyverse/tidy-r/references/tidyverse-style.md b/tidyverse/tidy-r/references/tidyverse-style.md new file mode 100644 index 0000000..304b5bc --- /dev/null +++ b/tidyverse/tidy-r/references/tidyverse-style.md @@ -0,0 +1,215 @@ +# Tidyverse Style Guide Summary + +Based on https://style.tidyverse.org/ + +## Object Names + +- Use **snake_case**: lowercase letters, numbers, underscores only +- Variables = **nouns**, functions = **verbs** +- Avoid reusing common function/variable names +- Prefix non-standard function arguments with `.` (e.g., `.data`, `.by`) +- Avoid dots in names except for S3 methods + +```r +# Good +day_one +calculate_mean +user_data + +# Bad +DayOne +calculateMean +day.one +``` + +## Spacing + +**Commas**: space after, never before + +```r +# Good +x[, 1] +mean(x, na.rm = TRUE) + +# Bad +x[,1] +mean(x ,na.rm = TRUE) +``` + +**Infix operators**: surround 
with spaces (`==`, `+`, `-`, `<-`, etc.) + +```r +# Good +x == y +z <- 2 + 2 + +# Bad +x==y +z<-2+2 +``` + +**No spaces** for high-precedence operators: `::`, `$`, `@`, `[`, `[[`, `^`, `:` + +```r +# Good +sqrt(x^2 + y^2) +x <- 1:10 +pkg::fun() +``` + +## Assignment + +Use `<-`, not `=` + +```r +# Good +x <- 5 + +# Bad +x = 5 +``` + +## Quotes + +Use double quotes `"`; single `'` only when text contains double quotes + +```r +# Good +"Text here" +'They said "hello"' +``` + +## Line Length + +Limit to **80 characters**. For long function calls, put each argument on its own line: + +```r +# Good +do_something( + arg1 = "value", + arg2 = "value", + arg3 = "value" +) +``` + +## Braces + +- `{` ends a line +- Contents indented by **2 spaces** +- `}` starts a line +- `else` on same line as `}` + +```r +if (condition) { + do_this() +} else { + do_that() +} +``` + +## Functions + +**Anonymous functions**: use `\(x)` for short lambdas + +```r +# Good +map(x, \(x) x + 1) + +# Bad +map(x, function(x) x + 1) +``` + +**Return**: use `return()` only for early returns; rely on implicit return otherwise + +```r +# Good +add_one <- function(x) { + x + 1 +} + +# Early return +check_input <- function(x) { + if (is.null(x)) { + return(NULL) + } + process(x) +} +``` + +**Multi-line definitions**: single-indent style preferred + +```r +long_function_name <- function( + a = "argument", + b = "argument" +) { + # body +} +``` + +## Pipes + +- Use `|>` (not `%>%`) +- Space before pipe, newline after +- Indent continuation by 2 spaces + +```r +# Good +data |> + filter(x > 0) |> + mutate(y = x * 2) |> + summarise(mean(y)) + +# Bad +data |> filter(x > 0) |> mutate(y = x * 2) +``` + +**Avoid pipes when**: +- Manipulating multiple objects +- Meaningful intermediate objects deserve names + +## Comments + +- Start with `# ` (hash + space) +- Explain **why**, not what +- Use sentence case + +```r +# Skip NA values because downstream analysis requires complete cases +data <- data |> 
filter(!is.na(value)) +``` + +## Control Flow + +- Use `&&` and `||` in conditions (not `&` and `|`) +- Use `TRUE`/`FALSE` (not `T`/`F`) +- Never use semicolons + +## Error Messages + +Use `cli::cli_abort()` for errors. See https://style.tidyverse.org/errors.html + +**Problem statement**: +- Start with concise problem in sentence case, ending with `.` +- Use **"must"** when cause is clear: `` `n` must be a numeric vector, not a character vector.`` +- Use **"can't"** when you cannot state what was expected: ``Can't find column `b` in `.data`.`` + +**Bullets**: +- `x` (cross) for problems +- `i` (info) for context +- `!` (warning) for warnings + +**Formatting**: +- Surround argument names in backticks: `` `x` `` +- Use "column" to disambiguate (avoid "variable") +- Keep under 80 characters; let cli wrap +- List up to 5 issues, truncate with `...` + +**Hints**: place last with `i` bullet, end with `?` + +```r +cli::cli_abort(c( + "{.arg x} must be a numeric vector, not {.obj_type_friendly {x}}.", + "i" = "Did you mean to use {.fn as.numeric}?" +)) +``` From 3740f7291a65a353a1e3ec617dd545a5ef881104 Mon Sep 17 00:00:00 2001 From: statzhero Date: Thu, 2 Apr 2026 14:35:12 +0200 Subject: [PATCH 2/7] Add redundant groupings --- tidyverse/tidy-r/SKILL.md | 4 + .../tidy-r/references/grouping-examples.md | 75 +++++++++++++++++++ 2 files changed, 79 insertions(+) diff --git a/tidyverse/tidy-r/SKILL.md b/tidyverse/tidy-r/SKILL.md index 325d5eb..5d672ad 100644 --- a/tidyverse/tidy-r/SKILL.md +++ b/tidyverse/tidy-r/SKILL.md @@ -97,6 +97,8 @@ Use newspaper style: high-level logic first, helpers below. Don't define functio ### Grouping - Use `.by` for per-operation grouping, never `group_by() |> ... 
|> ungroup()` +- Never add `ungroup()` before or after `.by` — it always returns ungrouped data +- Consolidate multiple `mutate(.by = x)` calls into one when they share the same `.by`; keep separate only when `.by` differs or a later column depends on an earlier one - Place `.by` on its own line for readability ### Joins @@ -138,6 +140,8 @@ Use `cli::cli_abort()` with problem statement + bullets, never `stop()`. | `multiple = "error"` in joins | `relationship = "many-to-one"` (or `"one-to-one"`) | | `sapply()` | `map_*()` (type-stable) | | `group_by() \|> ... \|> ungroup()` | `.by` argument | +| `ungroup() \|> mutate(..., .by = x)` | `mutate(..., .by = x)` (`.by` ignores existing groups) | +| Repeated `mutate(.by = x)` with same `.by` | Single `mutate()` with all columns and one `.by` | | `cat()` for messages | `message()` or `cli::cli_inform()` | | `stop()` for errors | `cli::cli_abort()` | | `distinct(id)` | `distinct(id, .keep_all = TRUE)` | diff --git a/tidyverse/tidy-r/references/grouping-examples.md b/tidyverse/tidy-r/references/grouping-examples.md index d5ed53d..5310ece 100644 --- a/tidyverse/tidy-r/references/grouping-examples.md +++ b/tidyverse/tidy-r/references/grouping-examples.md @@ -70,6 +70,81 @@ data |> ungroup() ``` +### Avoid - redundant ungroup() around .by + +`.by` always returns ungrouped data, so `ungroup()` before or after is a no-op. Remove it. + +```r +# Avoid - ungroup() is redundant +data |> + ungroup() |> + mutate( + centered = x - mean(x), + .by = group + ) + +# Good +data |> + mutate( + centered = x - mean(x), + .by = group + ) +``` + +### Consolidating mutate() calls + +When multiple columns share the same `.by`, combine them in a single `mutate()`. 
+ +```r +# Avoid - repeating .by = year across separate mutate() calls +data |> + mutate( + above_med_a = a > median(a), + .by = year + ) |> + mutate( + above_med_b = b > median(b), + .by = year + ) + +# Good - one mutate(), one .by +data |> + mutate( + above_med_a = a > median(a), + above_med_b = b > median(b), + .by = year + ) +``` + +**When to keep separate `mutate()` calls:** + +- **Different `.by` variables** between the calls +- **Sequential dependency**: a later column uses a column created in an earlier `mutate()` within the same grouped context (the new column must exist before the group-level aggregate can reference it) + +```r +# Separate calls needed: different .by variables +data |> + mutate( + x_lag = dplyr::lag(x), + .by = id + ) |> + mutate( + above_med = x_lag > median(x_lag), + .by = year + ) + +# Separate calls needed: b_rank depends on b_centered +data |> + mutate( + b_centered = b - mean(b), + .by = group + ) |> + mutate( + b_rank = row_number(desc(b_centered)), + .by = group + ) +``` + ## pick() for column selection Use `pick()` inside data-masking functions to select columns by name or tidyselect helpers: From 2ece3a3e402b93647b90cb237aa8f7a30619b009 Mon Sep 17 00:00:00 2001 From: statzhero Date: Tue, 7 Apr 2026 16:17:28 -0400 Subject: [PATCH 3/7] Fixes most comments --- tidyverse/tidy-r/SKILL.md | 116 +++++------------- .../{grouping-examples.md => grouping.md} | 2 +- .../references/{join-examples.md => joins.md} | 2 +- .../{migration-examples.md => migration.md} | 16 +-- ...-replace-examples.md => recode-replace.md} | 2 +- .../{stringr-examples.md => stringr.md} | 0 6 files changed, 45 insertions(+), 93 deletions(-) rename tidyverse/tidy-r/references/{grouping-examples.md => grouping.md} (98%) rename tidyverse/tidy-r/references/{join-examples.md => joins.md} (98%) rename tidyverse/tidy-r/references/{migration-examples.md => migration.md} (91%) rename tidyverse/tidy-r/references/{recode-replace-examples.md => recode-replace.md} (98%) rename 
tidyverse/tidy-r/references/{stringr-examples.md => stringr.md} (100%) diff --git a/tidyverse/tidy-r/SKILL.md b/tidyverse/tidy-r/SKILL.md index 5d672ad..918b3d0 100644 --- a/tidyverse/tidy-r/SKILL.md +++ b/tidyverse/tidy-r/SKILL.md @@ -1,87 +1,42 @@ --- name: tidy-r -description: | - Modern tidyverse patterns, style guide, and migration guidance for R development. Use when writing, reviewing, or refactoring tidyverse code. Covers native pipe, join_by(), .by grouping, pick/across/reframe, filter_out/when_any/when_all, recode_values/replace_values/replace_when, tidy selection, stringr, naming conventions, and migration from base R or older tidyverse APIs. +description: > + Modern tidyverse patterns, style guide, and migration guidance for R development. + Use this skill when writing R code, reviewing tidyverse code, updating legacy R + code to modern patterns, or enforcing consistent style. Covers native pipe usage, + join_by() syntax, .by grouping, pick/across/reframe, filter_out/when_any/when_all, + recode_values/replace_values/replace_when, tidy selection, stringr, naming + conventions, and migration from base R or older tidyverse APIs. metadata: - r_version: "4.5+" - tidyverse_version: "2.0+" - dplyr_version: "1.2+" -allowed-tools: Read, Edit, Write, Grep, Glob, Bash, mcp__r-btw__* + r_version: ">=4.5.0" + tidyverse_version: ">=2.0.0" + dplyr_version: ">=1.2.0" --- -# Writing Modern Tidyverse R +# Modern Tidyverse R Reference -This skill covers modern tidyverse patterns for R 4.5+ and tidyverse 2.0+, style guidelines, and migration from legacy patterns. +Code from blog posts and StackOverflow often uses deprecated APIs, magrittr pipes, or base R patterns where a modern tidyverse function exists. This guide encodes the current recommended approach. -## Core philosophy +## Reference files -R's tidyverse evolves. Code from blog posts and StackOverflow often uses deprecated APIs, magrittr pipes, or base R patterns where a modern tidyverse function exists. 
This skill encodes the current recommended approach so the model writes code that experienced R developers would recognize as idiomatic. +Consult the appropriate reference file for detailed patterns and examples: -## When to use this skill - -- Writing new R code with dplyr, tidyr, stringr, purrr, or other tidyverse packages -- Reviewing or refactoring existing R code for modern patterns -- Migrating from base R, magrittr pipes, or older tidyverse APIs -- Applying tidyverse style conventions (naming, spacing, error handling) -- Choosing between similar functions (e.g., `case_when` vs `recode_values`) -- Working with joins, grouping, recoding, or string manipulation in R - -## When NOT to use this skill - -- Writing data.table code (different paradigm) -- Pure base R projects that intentionally avoid tidyverse -- Shiny UI/server logic (use a Shiny-specific skill) -- Package development internals (NAMESPACE, DESCRIPTION, roxygen) -- ggplot2 visualization (use the socviz skill) -- Statistical modeling or Bayesian analysis - -## Instructions - -When you receive a request, classify it and consult the appropriate reference: - -### Step 1: Classify the request - -| Category | Reference file | Trigger | -|----------|---------------|---------| -| **Joins** | [join-examples.md](references/join-examples.md) | Merging data, `*_join`, `join_by`, matching rows, lookup tables | -| **Grouping & columns** | [grouping-examples.md](references/grouping-examples.md) | `.by`, `group_by`, `across`, `pick`, `reframe`, column operations | -| **Recoding & replacing** | [recode-replace-examples.md](references/recode-replace-examples.md) | `case_when`, `recode_values`, `replace_values`, `replace_when`, `filter_out`, `when_any`, `when_all`, recoding, replacing, conditional updates | -| **Strings** | [stringr-examples.md](references/stringr-examples.md) | String manipulation, regex, `str_*` functions, text processing | +| Topic | Reference file | When to consult | 
+|-------|---------------|-----------------| +| **Joins** | [joins.md](references/joins.md) | Merging data, `*_join`, `join_by`, matching rows, lookup tables | +| **Grouping & columns** | [grouping.md](references/grouping.md) | `.by`, `group_by`, `across`, `pick`, `reframe`, column operations | +| **Recoding & replacing** | [recode-replace.md](references/recode-replace.md) | `recode_values`, `replace_values`, `replace_when`, `filter_out`, `when_any`, `when_all` | +| **Strings** | [stringr.md](references/stringr.md) | String manipulation, regex, `str_*` functions, text processing | | **Style** | [tidyverse-style.md](references/tidyverse-style.md) | Naming, formatting, spacing, error messages, `cli::cli_abort` | -| **Migration** | [migration-examples.md](references/migration-examples.md) | Updating old code, base R conversion, deprecated functions | - -### Step 2: Read the reference file(s) - -Use the Read tool to load the relevant reference. For requests that span multiple categories (e.g., "rewrite this old code" touches migration + style), read multiple files. +| **Migration** | [migration.md](references/migration.md) | Updating old code, base R conversion, deprecated functions | -### Step 3: Apply core principles - -1. **Use modern tidyverse patterns** - Prioritize dplyr 1.2+ features, native pipe, and current APIs -2. **Write readable code first** - Optimize only when necessary -3. **Follow tidyverse style guide** - Consistent naming, spacing, and structure -4. **Use R MCP tools** - Automatically resolve function documentation and library references without being asked. If the `mcp__r-btw__*` tools are unavailable, fall back to running R help via Bash (see below) - -### R documentation lookup fallback - -When `mcp__r-btw__*` tools are available, use them to look up function signatures, help pages, and package docs. 
When they are not available (e.g., the r-btw MCP server is not configured), fall back to Bash: - -```bash -# Help page for a function -Rscript --vanilla -e '?dplyr::recode_values' 2>/dev/null || Rscript --vanilla -e 'utils::help("recode_values", package = "dplyr")' - -# Function signature / arguments -Rscript --vanilla -e 'args(dplyr::recode_values)' - -# List exported functions in a package -Rscript --vanilla -e 'ls("package:dplyr")' - -# Check if a package is installed -Rscript --vanilla -e 'requireNamespace("dplyr", quietly = TRUE)' -``` +For requests that span multiple topics (e.g., "rewrite this old code" touches migration + style), read multiple files. -### Step 4: Write the code +## Core principles -Follow the quick reference and anti-patterns below. When in doubt, consult the reference files. +1. **Use modern tidyverse patterns** -- Prioritize dplyr 1.2+ features, native pipe, and current APIs +2. **Write readable code first** -- Optimize only when necessary +3. **Follow tidyverse style guide** -- Consistent naming, spacing, and structure ## Quick reference @@ -97,7 +52,7 @@ Use newspaper style: high-level logic first, helpers below. Don't define functio ### Grouping - Use `.by` for per-operation grouping, never `group_by() |> ... |> ungroup()` -- Never add `ungroup()` before or after `.by` — it always returns ungrouped data +- Never add `ungroup()` before or after `.by` -- it always returns ungrouped data - Consolidate multiple `mutate(.by = x)` calls into one when they share the same `.by`; keep separate only when `.by` differs or a later column depends on an earlier one - Place `.by` on its own line for readability @@ -128,7 +83,6 @@ Use `cli::cli_abort()` with problem statement + bullets, never `stop()`. 
- `message()` for info, never `cat()` - `map_*()` over `sapply()` for type stability - `set.seed()` with date-time, never 42 -- `qs2::qs_save()`/`qs2::qs_read()`, never `qs` ## Anti-patterns @@ -151,9 +105,8 @@ Use `cli::cli_abort()` with problem statement + bullets, never `stop()`. | `filter(x != val \| is.na(x))` | `filter_out(x == val)` | | `coalesce(x, default)` | `replace_values(x, NA ~ default)` | | `na_if(x, val)` | `replace_values(x, val ~ NA)` | -| `qs::qsave()` / `qs::qread()` | `qs2::qs_save()` / `qs2::qs_read()` | -## Complete workflow example +## Example ```r library(tidyverse) @@ -205,10 +158,9 @@ quarterly <- sales_enriched |> ## Best practices 1. **Use `.unmatched = "error"`** in `case_when()` and `recode_values()` for defensive programming -4. **Place `.by` on its own line** for readability -5. **Prefer `filter_out()` over negated `filter()`** for NA-safe row removal -6. **Use `recode_values()` over `case_match()`** (dplyr 1.2+ preferred API) -7. **Use `replace_when()` over `case_when()` with `.default`** when updating a column in place -8. **Name variables as nouns, functions as verbs** in snake_case -9. **Explain "why" in comments**, not "what" -10. **Use `qs2` for serialization** with `.qs2` extension +2. **Place `.by` on its own line** for readability +3. **Prefer `filter_out()` over negated `filter()`** for NA-safe row removal +4. **Use `recode_values()` over `case_match()`** (dplyr 1.2+ preferred API) +5. **Use `replace_when()` over `case_when()` with `.default`** when updating a column in place +6. **Name variables as nouns, functions as verbs** in snake_case +7. 
**Explain "why" in comments**, not "what" diff --git a/tidyverse/tidy-r/references/grouping-examples.md b/tidyverse/tidy-r/references/grouping.md similarity index 98% rename from tidyverse/tidy-r/references/grouping-examples.md rename to tidyverse/tidy-r/references/grouping.md index 5310ece..633b1ee 100644 --- a/tidyverse/tidy-r/references/grouping-examples.md +++ b/tidyverse/tidy-r/references/grouping.md @@ -1,4 +1,4 @@ -# Modern Grouping and Column Operations (dplyr 1.1+) +# Modern Grouping and Column Operations (dplyr >=1.2.0) ## Per-operation grouping with .by diff --git a/tidyverse/tidy-r/references/join-examples.md b/tidyverse/tidy-r/references/joins.md similarity index 98% rename from tidyverse/tidy-r/references/join-examples.md rename to tidyverse/tidy-r/references/joins.md index ffb0a7a..5520ac4 100644 --- a/tidyverse/tidy-r/references/join-examples.md +++ b/tidyverse/tidy-r/references/joins.md @@ -1,4 +1,4 @@ -# Modern Join Syntax (dplyr 1.1+) +# Modern Join Syntax (dplyr >=1.2.0) ## Use join_by() instead of character vectors diff --git a/tidyverse/tidy-r/references/migration-examples.md b/tidyverse/tidy-r/references/migration.md similarity index 91% rename from tidyverse/tidy-r/references/migration-examples.md rename to tidyverse/tidy-r/references/migration.md index 0cd6cf9..41bf538 100644 --- a/tidyverse/tidy-r/references/migration-examples.md +++ b/tidyverse/tidy-r/references/migration.md @@ -1,4 +1,4 @@ -# Migration: Base R and Old Tidyverse to Modern Patterns +# Migration: Base R and Old Tidyverse to Modern Patterns (dplyr >=1.2.0) ## Base R to Modern Tidyverse @@ -28,7 +28,7 @@ substr(text, 1, 5) # -> str_sub(text, 1, 5) nchar(text) # -> str_length(text) strsplit(text, ",") # -> str_split(text, ",") tolower(text) # -> str_to_lower(text) -sprintf("Hello %s", name) # -> str_glue("Hello {name}") +sprintf("Hello %s", name) # -> str_glue("Hello {name}") ``` ## Old to New Tidyverse Patterns @@ -46,7 +46,7 @@ map(x, function(x) x + 1) # -> map(x, \(x) x + 
1) map(x, ~ .x + 1) # -> map(x, \(x) x + 1) ``` -### Grouping (dplyr 1.1+) +### Grouping (dplyr >=1.2.0) ```r group_by(data, x) |> @@ -78,7 +78,7 @@ summarise(data, x, .groups = "drop") # -> reframe(data, x) gather()/spread() # -> pivot_longer()/pivot_wider() ``` -### String separation (tidyr 1.3+) +### String separation (tidyr >=1.3.0) ```r separate(col, into = c("a", "b")) @@ -88,7 +88,7 @@ extract(col, into = "x", regex) # -> separate_wider_regex(col, patterns = c(x = regex)) ``` -### Superseded purrr functions (purrr 1.0+) +### Superseded purrr functions (purrr >=1.0.0) ```r map_dfr(x, f) # -> map(x, f) |> list_rbind() @@ -98,7 +98,7 @@ pmap_dfr(list, f) # -> pmap(list, f) |> list_rbind() imap_dfr(x, f) # -> imap(x, f) |> list_rbind() ``` -### Recoding and replacing (dplyr 1.2+) +### Recoding and replacing (dplyr >=1.2.0) ```r case_match(x, val ~ result) # -> recode_values(x, val ~ result) @@ -118,7 +118,7 @@ na_if(x, val) # -> replace_values(x, val ~ NA) tidyr::replace_na(x, default) # -> replace_values(x, NA ~ default) ``` -### Filter family (dplyr 1.2+) +### Filter family (dplyr >=1.2.0) ```r # Dropping rows with NA-safe negation @@ -138,7 +138,7 @@ qs::qsave(x, "file.qs") # -> qs2::qs_save(x, "file.qs2") qs::qread("file.qs") # -> qs2::qs_read("file.qs2") ``` -### Defunct in dplyr 1.2 (now errors) +### Defunct in dplyr >=1.2.0 (now errors) ```r # Underscored SE verbs (defunct since 1.2, deprecated since 0.7) diff --git a/tidyverse/tidy-r/references/recode-replace-examples.md b/tidyverse/tidy-r/references/recode-replace.md similarity index 98% rename from tidyverse/tidy-r/references/recode-replace-examples.md rename to tidyverse/tidy-r/references/recode-replace.md index 6c95616..9889a8b 100644 --- a/tidyverse/tidy-r/references/recode-replace-examples.md +++ b/tidyverse/tidy-r/references/recode-replace.md @@ -1,4 +1,4 @@ -# Recoding, Replacing, and Filtering (dplyr 1.2+) +# Recoding, Replacing, and Filtering (dplyr >=1.2.0) dplyr 1.2 introduced a family of 
functions for recoding and replacing values, and for NA-safe filtering. These replace older patterns (`case_match`, `recode`, `coalesce`, `na_if`, negated filters). diff --git a/tidyverse/tidy-r/references/stringr-examples.md b/tidyverse/tidy-r/references/stringr.md similarity index 100% rename from tidyverse/tidy-r/references/stringr-examples.md rename to tidyverse/tidy-r/references/stringr.md From 698c090445641a3916adc477bb57977d0cb6a470 Mon Sep 17 00:00:00 2001 From: statzhero Date: Tue, 7 Apr 2026 16:22:19 -0400 Subject: [PATCH 4/7] Remove linebreaks --- tidyverse/tidy-r/SKILL.md | 11 +++-------- 1 file changed, 3 insertions(+), 8 deletions(-) diff --git a/tidyverse/tidy-r/SKILL.md b/tidyverse/tidy-r/SKILL.md index 918b3d0..8e1fbea 100644 --- a/tidyverse/tidy-r/SKILL.md +++ b/tidyverse/tidy-r/SKILL.md @@ -1,12 +1,7 @@ --- name: tidy-r description: > - Modern tidyverse patterns, style guide, and migration guidance for R development. - Use this skill when writing R code, reviewing tidyverse code, updating legacy R - code to modern patterns, or enforcing consistent style. Covers native pipe usage, - join_by() syntax, .by grouping, pick/across/reframe, filter_out/when_any/when_all, - recode_values/replace_values/replace_when, tidy selection, stringr, naming - conventions, and migration from base R or older tidyverse APIs. + Modern tidyverse patterns, style guide, and migration guidance for R development. Use this skill when writing R code, reviewing tidyverse code, updating legacy R code to modern patterns, or enforcing consistent style. Covers native pipe usage, join_by() syntax, .by grouping, pick/across/reframe, filter_out/when_any/when_all, recode_values/replace_values/replace_when, tidy selection, stringr, naming conventions, and migration from base R or older tidyverse APIs. metadata: r_version: ">=4.5.0" tidyverse_version: ">=2.0.0" @@ -61,7 +56,7 @@ Use newspaper style: high-level logic first, helpers below. 
Don't define functio - Use `join_by()`, never `c("a" = "b")` - Use `relationship`, `unmatched`, `na_matches` for quality control -### Recoding and replacing (dplyr 1.2+) +### Recoding and replacing (dplyr >=1.2.0) | Task | Function | |------|----------| @@ -160,7 +155,7 @@ quarterly <- sales_enriched |> 1. **Use `.unmatched = "error"`** in `case_when()` and `recode_values()` for defensive programming 2. **Place `.by` on its own line** for readability 3. **Prefer `filter_out()` over negated `filter()`** for NA-safe row removal -4. **Use `recode_values()` over `case_match()`** (dplyr 1.2+ preferred API) +4. **Use `recode_values()` over `case_match()`** (dplyr >=1.2.0 preferred API) 5. **Use `replace_when()` over `case_when()` with `.default`** when updating a column in place 6. **Name variables as nouns, functions as verbs** in snake_case 7. **Explain "why" in comments**, not "what" From 73950f1ed18d943773c2ccf5a5e0387d1dc59fb9 Mon Sep 17 00:00:00 2001 From: statzhero Date: Tue, 7 Apr 2026 17:04:05 -0400 Subject: [PATCH 5/7] Soften grouping stance --- tidyverse/tidy-r/SKILL.md | 14 +++++++------- tidyverse/tidy-r/references/grouping.md | 4 ++-- tidyverse/tidy-r/references/migration.md | 2 +- 3 files changed, 10 insertions(+), 10 deletions(-) diff --git a/tidyverse/tidy-r/SKILL.md b/tidyverse/tidy-r/SKILL.md index 8e1fbea..228d2e7 100644 --- a/tidyverse/tidy-r/SKILL.md +++ b/tidyverse/tidy-r/SKILL.md @@ -46,7 +46,7 @@ Use newspaper style: high-level logic first, helpers below. Don't define functio ### Grouping -- Use `.by` for per-operation grouping, never `group_by() |> ... 
|> ungroup()` +- Prefer `.by` for per-operation grouping; use `group_by()` when grouping must persist across multiple operations - Never add `ungroup()` before or after `.by` -- it always returns ungrouped data - Consolidate multiple `mutate(.by = x)` calls into one when they share the same `.by`; keep separate only when `.by` differs or a later column depends on an earlier one - Place `.by` on its own line for readability @@ -88,12 +88,11 @@ Use `cli::cli_abort()` with problem statement + bullets, never `stop()`. | `by = c("a" = "b")` | `by = join_by(a == b)` | | `multiple = "error"` in joins | `relationship = "many-to-one"` (or `"one-to-one"`) | | `sapply()` | `map_*()` (type-stable) | -| `group_by() \|> ... \|> ungroup()` | `.by` argument | +| `group_by() \|> ... \|> ungroup()` for single operations | `.by` argument | | `ungroup() \|> mutate(..., .by = x)` | `mutate(..., .by = x)` (`.by` ignores existing groups) | | Repeated `mutate(.by = x)` with same `.by` | Single `mutate()` with all columns and one `.by` | | `cat()` for messages | `message()` or `cli::cli_inform()` | | `stop()` for errors | `cli::cli_abort()` | -| `distinct(id)` | `distinct(id, .keep_all = TRUE)` | | `mean(x, na.rm = TRUE)` | `mean(x)` with tidyna loaded | | `case_match(x, ...)` | `recode_values(x, ...)` | | `recode(x, ...)` | `recode_values(x, ...)` or `replace_values(x, ...)` | @@ -141,10 +140,11 @@ quarterly <- sales_enriched |> .by = c(region_name, quarter) ) |> mutate( - performance = revenue |> - replace_when( - total_revenue > 100000 ~ "high", - total_revenue > 50000 ~ "medium" + performance = total_revenue |> + recode_values( + \(x) x > 100000 ~ "high", + \(x) x > 50000 ~ "medium", + .default = "low" ) ) |> arrange(region_name, quarter) diff --git a/tidyverse/tidy-r/references/grouping.md b/tidyverse/tidy-r/references/grouping.md index 633b1ee..61830f9 100644 --- a/tidyverse/tidy-r/references/grouping.md +++ b/tidyverse/tidy-r/references/grouping.md @@ -2,7 +2,7 @@ ## Per-operation 
grouping with .by -The `.by` argument replaces the old `group_by() |> ... |> ungroup()` pattern. Results are always ungrouped. +The `.by` argument is preferred for per-operation grouping. Use `group_by()` when grouping must persist across multiple operations. `.by` results are always ungrouped. ### Basic usage @@ -60,7 +60,7 @@ data |> summarise(mean_value = mean(value), .by = category) ``` -### Avoid - old persistent grouping pattern +### Avoid for single operations - use .by instead ```r # Avoid diff --git a/tidyverse/tidy-r/references/migration.md b/tidyverse/tidy-r/references/migration.md index 41bf538..fb3e52a 100644 --- a/tidyverse/tidy-r/references/migration.md +++ b/tidyverse/tidy-r/references/migration.md @@ -8,7 +8,7 @@ subset(data, condition) # -> filter(data, condition) data[order(data$x), ] # -> arrange(data, x) aggregate(x ~ y, data, mean) # -> summarise(data, mean(x), .by = y) -merge(x, y, by = "id") # -> left_join(x, y, by = join_by(id)) +merge(x, y, by = "id") # -> inner_join(x, y, by = join_by(id)) ``` ### Functional programming From 2de0d63bca290195f656555e5ec1d81a93e7981a Mon Sep 17 00:00:00 2001 From: statzhero Date: Tue, 7 Apr 2026 17:22:45 -0400 Subject: [PATCH 6/7] Audit and improvements --- tidyverse/tidy-r/SKILL.md | 34 +---- tidyverse/tidy-r/references/grouping.md | 48 ++++++- tidyverse/tidy-r/references/joins.md | 33 ----- tidyverse/tidy-r/references/migration.md | 10 ++ tidyverse/tidy-r/references/stringr.md | 17 ++- tidyverse/tidy-r/references/tidyselect.md | 117 ++++++++++++++++++ .../tidy-r/references/tidyverse-style.md | 5 +- 7 files changed, 196 insertions(+), 68 deletions(-) create mode 100644 tidyverse/tidy-r/references/tidyselect.md diff --git a/tidyverse/tidy-r/SKILL.md b/tidyverse/tidy-r/SKILL.md index 228d2e7..8190087 100644 --- a/tidyverse/tidy-r/SKILL.md +++ b/tidyverse/tidy-r/SKILL.md @@ -1,7 +1,7 @@ --- name: tidy-r description: > - Modern tidyverse patterns, style guide, and migration guidance for R development. 
Use this skill when writing R code, reviewing tidyverse code, updating legacy R code to modern patterns, or enforcing consistent style. Covers native pipe usage, join_by() syntax, .by grouping, pick/across/reframe, filter_out/when_any/when_all, recode_values/replace_values/replace_when, tidy selection, stringr, naming conventions, and migration from base R or older tidyverse APIs. + Modern tidyverse patterns, style guide, and migration guidance for R development. Use this skill when writing R code, reviewing tidyverse code, updating legacy R code, or enforcing consistent style. Covers native pipe usage, join_by() syntax, .by grouping, pick/across/reframe, filter_out/when_any/when_all, recode_values/replace_values/replace_when, tidyselect helpers, .data/.env pronouns, stringr, naming conventions, and readr. metadata: r_version: ">=4.5.0" tidyverse_version: ">=2.0.0" @@ -22,6 +22,7 @@ Consult the appropriate reference file for detailed patterns and examples: | **Grouping & columns** | [grouping.md](references/grouping.md) | `.by`, `group_by`, `across`, `pick`, `reframe`, column operations | | **Recoding & replacing** | [recode-replace.md](references/recode-replace.md) | `recode_values`, `replace_values`, `replace_when`, `filter_out`, `when_any`, `when_all` | | **Strings** | [stringr.md](references/stringr.md) | String manipulation, regex, `str_*` functions, text processing | +| **Tidy selection** | [tidyselect.md](references/tidyselect.md) | Column selection helpers, `where()`, `all_of()`, `any_of()`, boolean ops, `.data`/`.env` pronouns | | **Style** | [tidyverse-style.md](references/tidyverse-style.md) | Naming, formatting, spacing, error messages, `cli::cli_abort` | | **Migration** | [migration.md](references/migration.md) | Updating old code, base R conversion, deprecated functions | @@ -38,6 +39,7 @@ For requests that span multiple topics (e.g., "rewrite this old code" touches mi ### Pipe and lambda - Always `|>`, never `%>%` +- Use `_` placeholder for non-first 
arguments: `x |> f(1, y = _)`. The placeholder must be named and used exactly once. - Always `\(x)`, never `function(x)` or `~` in map/keep/etc. ### Code organization @@ -79,27 +81,6 @@ Use `cli::cli_abort()` with problem statement + bullets, never `stop()`. - `map_*()` over `sapply()` for type stability - `set.seed()` with date-time, never 42 -## Anti-patterns - -| Avoid | Use instead | -|-------|-------------| -| `%>%` | `|>` | -| `function(x)` or `~` | `\(x)` | -| `by = c("a" = "b")` | `by = join_by(a == b)` | -| `multiple = "error"` in joins | `relationship = "many-to-one"` (or `"one-to-one"`) | -| `sapply()` | `map_*()` (type-stable) | -| `group_by() \|> ... \|> ungroup()` for single operations | `.by` argument | -| `ungroup() \|> mutate(..., .by = x)` | `mutate(..., .by = x)` (`.by` ignores existing groups) | -| Repeated `mutate(.by = x)` with same `.by` | Single `mutate()` with all columns and one `.by` | -| `cat()` for messages | `message()` or `cli::cli_inform()` | -| `stop()` for errors | `cli::cli_abort()` | -| `mean(x, na.rm = TRUE)` | `mean(x)` with tidyna loaded | -| `case_match(x, ...)` | `recode_values(x, ...)` | -| `recode(x, ...)` | `recode_values(x, ...)` or `replace_values(x, ...)` | -| `filter(x != val \| is.na(x))` | `filter_out(x == val)` | -| `coalesce(x, default)` | `replace_values(x, NA ~ default)` | -| `na_if(x, val)` | `replace_values(x, val ~ NA)` | - ## Example ```r @@ -150,12 +131,3 @@ quarterly <- sales_enriched |> arrange(region_name, quarter) ``` -## Best practices - -1. **Use `.unmatched = "error"`** in `case_when()` and `recode_values()` for defensive programming -2. **Place `.by` on its own line** for readability -3. **Prefer `filter_out()` over negated `filter()`** for NA-safe row removal -4. **Use `recode_values()` over `case_match()`** (dplyr >=1.2.0 preferred API) -5. **Use `replace_when()` over `case_when()` with `.default`** when updating a column in place -6. 
**Name variables as nouns, functions as verbs** in snake_case -7. **Explain "why" in comments**, not "what" diff --git a/tidyverse/tidy-r/references/grouping.md b/tidyverse/tidy-r/references/grouping.md index 61830f9..c576c8d 100644 --- a/tidyverse/tidy-r/references/grouping.md +++ b/tidyverse/tidy-r/references/grouping.md @@ -145,6 +145,22 @@ data |> ) ``` +## .by with tidyr::fill() + +tidyr supports `.by` in `fill()`, matching the dplyr pattern: + +```r +# Good - per-operation grouping +data |> + tidyr::fill(value, .by = group, .direction = "down") + +# Avoid - group_by/ungroup wrapper +data |> + group_by(group) |> + tidyr::fill(value, .direction = "down") |> + ungroup() +``` + ## pick() for column selection Use `pick()` inside data-masking functions to select columns by name or tidyselect helpers: @@ -232,7 +248,7 @@ my_summary <- function(data, summary_var) { } ``` -### Character vectors use .data[[]] +### Character vectors in data-masked contexts use .data[[]] ```r for (var in names(mtcars)) { @@ -240,6 +256,36 @@ for (var in names(mtcars)) { } ``` +### Character vectors in tidy-select contexts use all_of()/any_of() + +The `across(all_of())` bridge is the canonical pattern for passing character vectors into tidy-select: + +```r +vars <- c("mpg", "wt", "hp") + +# Good - across(all_of()) for character vectors +mtcars |> + summarise(across(all_of(vars), mean)) + +# Good - any_of() when some columns may not exist +mtcars |> + select(any_of(vars)) + +# Avoid - .data[[]] inside tidy-select (deprecated) +mtcars |> + select(.data[["mpg"]], .data[["wt"]]) +``` + +### Access calling-environment variables with .env + +Use `.env$var` to disambiguate when a local variable shares a name with a column: + +```r +threshold <- 10 +data |> + filter(value > .env$threshold) +``` + ### Multiple columns use across() ```r diff --git a/tidyverse/tidy-r/references/joins.md b/tidyverse/tidy-r/references/joins.md index 5520ac4..7ea7140 100644 --- a/tidyverse/tidy-r/references/joins.md 
+++ b/tidyverse/tidy-r/references/joins.md @@ -88,36 +88,3 @@ sales |> ) ``` -## Logging joins with tidylog - -Use `tidylog::` prefix for joins to verify expected behavior. Call directly without loading the package. - -```r -result <- transactions |> - tidylog::left_join(companies, by = join_by(company == id)) - -# tidylog output: -# left_join: added 2 columns (name, region) -# > rows only in x 12 -# > rows only in y (3) -# > matched rows 988 -# > rows total 1000 -``` - -### Interpreting join output - -| Output | Meaning | -|--------|---------| -| `rows only in x` | Rows in left table with no match (kept as NA in left joins) | -| `rows only in y` | Rows in right table with no match (in parentheses, dropped in left joins) | -| `matched rows` | Rows that matched between tables | -| `rows total` | Final row count after join | - -### When to use tidylog - -- **Always for joins** to see how many rows matched, duplicated, or were dropped -- **Critical filters** with `tidylog::filter()` to verify expected row counts -- **Critical mutates** with `tidylog::mutate()` to verify expected changes -- **Any operation where silent data loss is a risk** - -Don't use tidylog in production code, inside functions, or loops where output would be too verbose. It's for interactive verification only. 
diff --git a/tidyverse/tidy-r/references/migration.md b/tidyverse/tidy-r/references/migration.md index fb3e52a..65d732a 100644 --- a/tidyverse/tidy-r/references/migration.md +++ b/tidyverse/tidy-r/references/migration.md @@ -131,6 +131,16 @@ filter(cond1 | cond2 | cond3) # -> filter(when_any(cond1, cond2, cond3)) filter(cond1 & cond2 & cond3) # -> filter(when_all(cond1, cond2, cond3)) ``` +### Reading data + +```r +read.csv("file.csv") # -> read_csv("file.csv") # tibble, faster, better type detection +read.csv("file.csv", sep = "\t") # -> read_tsv("file.csv") +read.csv2("file.csv") # -> read_csv2("file.csv") # semicolon-delimited +``` + +For large files (>100 MB), `vroom::vroom()` is faster than `read_csv()`. For small files the difference is negligible. + ### Serialization ```r diff --git a/tidyverse/tidy-r/references/stringr.md b/tidyverse/tidy-r/references/stringr.md index a56c5a9..fecadfd 100644 --- a/tidyverse/tidy-r/references/stringr.md +++ b/tidyverse/tidy-r/references/stringr.md @@ -53,13 +53,16 @@ str_length(text) # character count str_trunc(text, 20) # truncate with ellipsis ``` -### Formatting +### Formatting and case conversion ```r str_to_lower(text) # lowercase str_to_upper(text) # uppercase str_to_title(text) # title case str_to_sentence(text) # sentence case +str_to_snake(text) # snake_case (stringr >=1.6.0) +str_to_camel(text) # camelCase (stringr >=1.6.0) +str_to_kebab(text) # kebab-case (stringr >=1.6.0) str_trim(text) # remove leading/trailing whitespace str_squish(text) # trim + collapse internal whitespace str_pad(text, 10, side = "left") # pad to fixed width @@ -73,6 +76,14 @@ str_glue("Hello {name}, you scored {score}!") str_glue_data(df, "{name}: {value}") ``` +### Case-insensitive matching (stringr >=1.6.0) + +```r +str_ilike(text, "hello%") # SQL ILIKE-style, case-insensitive; % matches any characters, _ matches one +# Replaces: str_like(text, "hello%", ignore_case = TRUE) +# str_like() ignore_case argument is deprecated; use str_ilike() instead +``` + ## Pattern helpers 
Use these for clarity about what kind of matching you intend: @@ -98,5 +109,9 @@ str_detect(text, boundary("word")) # word boundaries | `str_to_lower(text)` | `tolower(text)` | | | `str_to_upper(text)` | `toupper(text)` | | | `str_to_title(text)` | `tools::toTitleCase(text)` | | +| `str_to_snake(text)` | — | stringr >=1.6.0 | +| `str_to_camel(text)` | — | stringr >=1.6.0 | +| `str_to_kebab(text)` | — | stringr >=1.6.0 | +| `str_ilike(text, "pat%")` | — | case-insensitive SQL LIKE pattern (`%`, `_`), stringr >=1.6.0 | | `str_trim(text)` | `trimws(text)` | | | `str_glue("Hello {x}")` | `sprintf("Hello %s", x)` | More readable | diff --git a/tidyverse/tidy-r/references/tidyselect.md b/tidyverse/tidy-r/references/tidyselect.md new file mode 100644 index 0000000..64fbff6 --- /dev/null +++ b/tidyverse/tidy-r/references/tidyselect.md @@ -0,0 +1,117 @@ +# Tidy Selection + +Tidy selection is the column selection language used by `select()`, `relocate()`, `rename()`, `across()`, `pick()`, `pivot_longer()`, `pivot_wider()`, and other tidyverse functions that accept column specifications. 
+ +## Selection helpers + +```r +starts_with("x") # columns starting with "x" +ends_with("_id") # columns ending with "_id" +contains("score") # columns containing "score" +matches("^x\\d+$") # columns matching a regex +num_range("x", 1:5) # x1, x2, x3, x4, x5 +last_col() # rightmost column +everything() # all columns +where(is.numeric) # columns satisfying a predicate +``` + +## Selecting by name + +```r +data |> select(name, age) # by name +data |> select(name:age) # range +data |> select(!age) # exclude +data |> select(where(is.numeric) & !id) # boolean combination +``` + +## Boolean algebra on selections + +Selections support `!` (complement), `&` (intersection), and `|` (union): + +```r +data |> select(where(is.numeric) & !c(id, year)) +data |> select(starts_with("x") | ends_with("_total")) +data |> select(!where(is.character)) +``` + +## Character vectors: all_of() and any_of() + +Use `all_of()` for strict matching (errors if a name is missing) and `any_of()` for permissive matching (silently ignores missing names): + +```r +vars <- c("mpg", "wt", "hp") + +data |> select(all_of(vars)) # errors if any name absent +data |> select(any_of(vars)) # ignores missing names +``` + +### The across(all_of()) bridge pattern + +This is the canonical way to pass character vectors into data-masked contexts that use tidy selection: + +```r +vars <- c("revenue", "cost") + +data |> + summarise(across(all_of(vars), mean)) + +data |> + mutate(across(all_of(vars), \(x) x / 1000)) +``` + +## .data and .env pronouns + +### .data in data-masked contexts + +Use `.data[[var]]` when the column name is a string variable inside data-masked functions (`filter`, `mutate`, `summarise`): + +```r +var <- "mpg" +mtcars |> filter(.data[[var]] > 20) +``` + +### .data is deprecated in tidy-select contexts + +Do NOT use `.data$col` or `.data[[var]]` inside tidy-select functions (`select`, `across`, `pick`). 
Use string names or `all_of()`/`any_of()` instead: + +```r +var <- "mpg" + +# Good +data |> select(all_of(var)) +data |> select(any_of(var)) + +# Avoid (deprecated) +data |> select(.data[[var]]) +``` + +### .env for environment variables + +Use `.env$var` to access variables from the calling environment when they might collide with column names: + +```r +threshold <- 10 + +# Good - unambiguous +data |> filter(value > .env$threshold) + +# Risky - if data has a "threshold" column, it shadows the local variable +data |> filter(value > threshold) +``` + +`.env` is most useful inside functions where you cannot control what columns the data has: + +```r +filter_above <- function(data, col, cutoff) { + data |> filter({{ col }} > .env$cutoff) +} +``` + +## Tidy selection vs data masking + +| Context | Used by | Column selection | Character vector bridge | +|---------|---------|-----------------|----------------------| +| **Tidy selection** | `select`, `across`, `pick`, `relocate`, `pivot_*` | helpers like `where()`, `starts_with()` | `all_of(vars)` | +| **Data masking** | `filter`, `mutate`, `summarise`, `arrange` | `.data[[var]]` | `across(all_of(vars))` | + +The two contexts have different rules. Tidy selection uses helper functions; data masking evaluates R expressions in the data frame environment. `{{ }}` (embrace) works in both contexts for forwarding a single function argument. 
diff --git a/tidyverse/tidy-r/references/tidyverse-style.md b/tidyverse/tidy-r/references/tidyverse-style.md index 304b5bc..95c098f 100644 --- a/tidyverse/tidy-r/references/tidyverse-style.md +++ b/tidyverse/tidy-r/references/tidyverse-style.md @@ -140,8 +140,8 @@ check_input <- function(x) { ```r long_function_name <- function( - a = "argument", - b = "argument" + a = "argument", + b = "argument" ) { # body } @@ -184,6 +184,7 @@ data <- data |> filter(!is.na(value)) - Use `&&` and `||` in conditions (not `&` and `|`) - Use `TRUE`/`FALSE` (not `T`/`F`) - Never use semicolons +- With `tidyna` loaded, `na.rm = TRUE` is the default for common aggregation functions -- write `mean(x)` instead of `mean(x, na.rm = TRUE)` ## Error Messages From 416ab0c22ef4ad173d7e3a1c5aba04ffdb448c4b Mon Sep 17 00:00:00 2001 From: statzhero Date: Tue, 7 Apr 2026 18:49:03 -0400 Subject: [PATCH 7/7] Fix example --- tidyverse/tidy-r/SKILL.md | 64 +++++++++++++++------------------------ 1 file changed, 25 insertions(+), 39 deletions(-) diff --git a/tidyverse/tidy-r/SKILL.md b/tidyverse/tidy-r/SKILL.md index 8190087..4340cac 100644 --- a/tidyverse/tidy-r/SKILL.md +++ b/tidyverse/tidy-r/SKILL.md @@ -86,48 +86,34 @@ Use `cli::cli_abort()` with problem statement + bullets, never `stop()`. 
```r library(tidyverse) -# Read and clean data -sales <- read_csv("data/sales.csv") |> - rename( - region = Region, - product = Product, - revenue = Revenue, - date = Date - ) |> - mutate( - quarter = quarter(date), - product = product |> - replace_values( - c("Widget A", "WidgetA") ~ "Widget A", - c("Widget B", "WidgetB") ~ "Widget B" - ) +penguins <- penguins |> + filter_out(is.na(sex)) |> + mutate(size = case_when( + body_mass > 4500 ~ "large", + body_mass > 3500 ~ "medium", + .default = "small" + )) + +# Island latitude lookup table for the join below +island_coords <- tribble( + ~island, ~latitude, + "Biscoe", -65.5, + "Dream", -64.7, + "Torgersen", -64.8 +) + +island_summary <- penguins |> + summarise( + mean_flipper = mean(flipper_len), + mean_mass = mean(body_mass), + n = n(), + .by = c(species, island) ) |> - filter_out(is.na(revenue)) - -# Enrich with lookup table -sales_enriched <- sales |> left_join( - regions, - by = join_by(region == region_code), + island_coords, + by = join_by(island), unmatched = "error" - ) - -# Summarise by group -quarterly <- sales_enriched |> - summarise( - total_revenue = sum(revenue), - avg_revenue = mean(revenue), - n_transactions = n(), - .by = c(region_name, quarter) - ) |> - mutate( - performance = total_revenue |> - recode_values( - \(x) x > 100000 ~ "high", - \(x) x > 50000 ~ "medium", - .default = "low" - ) ) |> - arrange(region_name, quarter) + arrange(species, island) ```