diff --git a/docs/src/getting-started.md b/docs/src/getting-started.md index 4427492..bd0baa5 100644 --- a/docs/src/getting-started.md +++ b/docs/src/getting-started.md @@ -31,7 +31,7 @@ dumpling --help dumpling scaffold-config -i dump.sql -o .dumplingconf ``` - This **beta** subcommand streams the dump once and writes inferred `[rules]` from SQL column names (`CREATE TABLE`, `INSERT`, and PostgreSQL `COPY` column lists). Heuristics are **English-oriented**; output is **draft only**—review and edit every rule, add a top-level **`salt`** (for hashing) and any **`${…}`** secret placeholders before production use. + This **beta** subcommand streams the dump once and writes inferred `[rules]` from SQL column names (`CREATE TABLE`, `INSERT`, and PostgreSQL `COPY` column lists). It does **not** require an existing Dumpling config in the current directory (optional config is only merged for `pg_restore` / keep-original defaults). Heuristics are **English-oriented**; output is **draft only**—review and edit every rule, add a top-level **`salt`** (for hashing) and any **`${…}`** secret placeholders before production use. 
Useful flags: diff --git a/src/main.rs b/src/main.rs index c2cdcc5..390e8cf 100644 --- a/src/main.rs +++ b/src/main.rs @@ -6,6 +6,7 @@ use std::sync::mpsc::sync_channel; use std::thread::JoinHandle; use std::time::Instant; +use anyhow::Context; use clap::{ArgAction, Parser, Subcommand}; mod compressed_input; @@ -232,7 +233,15 @@ fn main() -> anyhow::Result<()> { other ), }; - let resolved_for_pg = settings::load_config(cli.config.as_ref(), false)?; + let resolved_for_pg = settings::load_config(cli.config.as_ref(), true).with_context(|| { + "loading config for scaffold-config (pg_restore / keep-original merge only; use -c if an explicit file fails to load)" + })?; + if cli.config.is_none() && resolved_for_pg.source_path.is_none() { + eprintln!( + "dumpling scaffold-config: note: no Dumpling config found in the current directory; \ + using defaults for pg_restore and keep-original hints" + ); + } let (pg_restore_path_eff, pg_restore_arg_eff) = settings::merge_pg_restore_cli( &resolved_for_pg.pg_restore, pg_restore_path.clone(), diff --git a/src/scaffold.rs b/src/scaffold.rs index df10fd2..94e1bd2 100644 --- a/src/scaffold.rs +++ b/src/scaffold.rs @@ -45,13 +45,10 @@ pub fn run_scaffold_config(opts: ScaffoldConfigOptions) -> anyhow::Result<()> { } = opts; eprintln!( - "dumpling scaffold-config: beta — draft rules from column names{}; review before use. \ + "dumpling scaffold-config: beta — draft rules from column names plus up to {} \ + reservoir rows per table for JSON path hints and name-like column value checks; review before use. 
\ Heuristics are English-oriented and miss opaque or non-English names.", - if infer_json_paths { - ", reservoir-sampled JSON paths (~5 rows/table)" - } else { - " only" - } + crate::sql::SCAFFOLD_JSON_RESERVOIR_SIZE ); let mut pg_restore_child: Option = None; @@ -104,10 +101,15 @@ pub fn run_scaffold_config(opts: ScaffoldConfigOptions) -> anyhow::Result<()> { eprintln!( "dumpling scaffold-config: warning: no rules inferred; emitted file contains header only" ); - } else if infer_json_paths { + } else { eprintln!( - "dumpling scaffold-config: reservoir sample ({} rows max per table) for JSON path hints", - crate::sql::SCAFFOLD_JSON_RESERVOIR_SIZE + "dumpling scaffold-config: reservoir sample ({} rows max per table) for name-like column value checks{}", + crate::sql::SCAFFOLD_JSON_RESERVOIR_SIZE, + if infer_json_paths { + " and nested JSON path hints" + } else { + "" + } ); } @@ -174,7 +176,9 @@ const SCAFFOLD_HEADER: &str = r#"# Dumpling starter config (beta) — generated # # Inferred [rules]: SQL column names (CREATE TABLE, INSERT, COPY) plus optional nested JSON paths # when generated with `--infer-json-paths` (dot-separated keys: `payload.profile.email`). -# Name heuristics are English-oriented; JSON leaf inference uses segment names and light literals. +# JSON path segments keep the casing from the sampled payload (e.g. camelCase API fields). +# Name heuristics are English-oriented: unmistakable column names map directly; other "*name*" +# columns only become `strategy = "name"` when sampled INSERT/COPY text looks person-shaped. # Review every rule; add salt for hash strategies and extend row_filters / column_cases as needed. 
# diff --git a/src/settings.rs b/src/settings.rs index 5239ab7..091466e 100644 --- a/src/settings.rs +++ b/src/settings.rs @@ -121,7 +121,8 @@ pub struct AnonymizerSpec { #[derive(Debug, Clone, Default)] pub struct ResolvedConfig { pub salt: Option, - /// Normalized rule map: lowercase keys for table and column names + /// Normalized rule map: lowercase table keys; column keys are lowercased for plain SQL columns, + /// but JSON path segments after the first `.` / `__` keep their authored casing for runtime JSON matching. pub rules: HashMap>, /// Normalized row filters per table pub row_filters: HashMap, @@ -529,7 +530,7 @@ fn resolve(raw: RawConfig, source_path: Option) -> ResolvedConfig { let mut col_map: HashMap = HashMap::new(); for (col, mut spec) in cols.into_iter() { spec.strategy = spec.strategy.to_ascii_lowercase(); - col_map.insert(col.to_lowercase(), spec); + col_map.insert(normalize_rules_column_key(&col), spec); } normalized_rules.insert(table_key_norm, col_map); } @@ -549,7 +550,7 @@ fn resolve(raw: RawConfig, source_path: Option) -> ResolvedConfig { c }) .collect(); - inner.insert(col.to_lowercase(), cases); + inner.insert(normalize_rules_column_key(&col), cases); } normalized_cases.insert(table_key_norm, inner); } @@ -949,11 +950,62 @@ fn validate_anonymizer_spec(spec: &AnonymizerSpec, path: &str) -> anyhow::Result Ok(()) } +/// Normalize a `[rules]` / `[column_cases]` column key for [`ResolvedConfig`]: the leading SQL +/// column identifier is lowercased; nested JSON path segments keep their authored casing so rules +/// match object keys in dump JSON (e.g. camelCase from APIs). +pub fn normalize_rules_column_key(col: &str) -> String { + let col = col.trim(); + if col.contains("__") { + let parts: Vec<&str> = col.split("__").collect(); + if parts.len() >= 2 { + let base = parts[0].trim(); + if !base.is_empty() { + let path: Vec<&str> = parts[1..] 
+ .iter() + .map(|p| (*p).trim()) + .filter(|p| !p.is_empty()) + .collect(); + if !path.is_empty() { + let mut out = base.to_ascii_lowercase(); + for p in path { + out.push_str("__"); + out.push_str(p); + } + return out; + } + } + } + } + if col.contains('.') { + let parts: Vec<&str> = col.split('.').collect(); + if parts.len() >= 2 { + let base = parts[0].trim(); + if !base.is_empty() { + let path: Vec<&str> = parts[1..] + .iter() + .map(|p| (*p).trim()) + .filter(|p| !p.is_empty()) + .collect(); + if !path.is_empty() { + let mut out = base.to_ascii_lowercase(); + for p in path { + out.push('.'); + out.push_str(p); + } + return out; + } + } + } + } + col.to_ascii_lowercase() +} + /// Split a rules column key into the SQL column name and optional JSON path segments. /// /// Nested paths use the same syntax as row-filter predicates: `payload.profile.email` or /// `payload__profile__email`. When no path is present, the entire key names one SQL column. -/// Keys are compared case-insensitively after normalization (lowercase). +/// The base SQL column is compared case-insensitively (lowercase); JSON path segments use the +/// casing stored in the resolved config key (must match JSON object keys in the cell). 
pub fn parse_json_column_key(column_key: &str) -> (String, Vec) { let trim_parts = |parts: &[&str]| -> Option<(String, Vec)> { if parts.len() < 2 { @@ -1251,7 +1303,9 @@ pub fn is_explicit_sensitive_column( #[cfg(test)] mod tests { - use super::{load_config, resolve_secrets_in_value, ConfigPathSegment}; + use super::{ + load_config, normalize_rules_column_key, resolve_secrets_in_value, ConfigPathSegment, + }; use std::fs; use std::path::{Path, PathBuf}; use std::time::{SystemTime, UNIX_EPOCH}; @@ -1816,4 +1870,35 @@ email = { strategy = "hash", locale = "fr_fr" } assert!(msg.contains("hash")); let _ = fs::remove_file(path); } + + #[test] + fn normalize_rules_column_key_preserves_json_path_case() { + assert_eq!( + normalize_rules_column_key("Payload.Profile.ContactEmail"), + "payload.Profile.ContactEmail" + ); + assert_eq!( + normalize_rules_column_key("payload__Profile__contactEmail"), + "payload__Profile__contactEmail" + ); + assert_eq!(normalize_rules_column_key("Email"), "email"); + } + + #[test] + fn load_config_preserves_json_path_case_in_resolved_rules() { + let path = write_temp_config( + r#" +salt = "testsalt" +[rules."public.orders"] +"payload.shipTo.fullName" = { strategy = "redact", as_string = true } +"#, + ); + let cfg = load_config(Some(&path), false).expect("load"); + let cols = cfg.rules.get("public.orders").expect("table"); + let spec = cols + .get("payload.shipTo.fullName") + .expect("expected camelCase path segments in resolved map key"); + assert_eq!(spec.strategy, "redact"); + let _ = fs::remove_file(path); + } } diff --git a/src/sql.rs b/src/sql.rs index 9e38c4c..adca275 100644 --- a/src/sql.rs +++ b/src/sql.rs @@ -2,7 +2,7 @@ use crate::filter::{rewrite_json_paths_with_rules, should_keep_row, when_matches use crate::report::Reporter; use crate::settings::{ is_explicit_sensitive_column, lookup_column_cases, lookup_column_rule, - lookup_json_path_rules_for_column, AnonymizerSpec, ResolvedConfig, + lookup_json_path_rules_for_column, 
normalize_rules_column_key, AnonymizerSpec, ResolvedConfig, }; use crate::transform::{apply_anonymizer, AnonymizerRegistry, Replacement}; use anyhow::Context; @@ -570,6 +570,14 @@ fn scaffold_table_key(schema: Option<&str>, table: &str) -> String { } } +/// Table portion of a `scaffold_table_key` (`schema.table` → `table`, else the whole key). +fn scaffold_bare_table_from_key(table_key: &str) -> &str { + table_key + .rsplit_once('.') + .map(|(_, t)| t) + .unwrap_or(table_key) +} + fn scaffold_address_like_segment(normalized: &str) -> bool { if normalized.contains("ip_address") || normalized.contains("mac_address") { return false; @@ -594,7 +602,12 @@ fn scaffold_address_like_segment(normalized: &str) -> bool { /// Heuristic strategy for starter config from a column name. These rules are **English-oriented** /// substring matches; other languages or opaque names need manual review. pub fn infer_scaffold_strategy(column: &str) -> Option { - infer_auto_strategy(column) + infer_scaffold_strategy_for_table("", column) +} + +/// Same as [`infer_scaffold_strategy`], with a table name for context-aware heuristics (scaffold only). +pub fn infer_scaffold_strategy_for_table(table: &str, column: &str) -> Option { + infer_auto_strategy_with_table(table, column) } /// Options for [`discover_scaffold_rules`]. 
@@ -626,7 +639,7 @@ fn scaffold_merge_rule( spec: AnonymizerSpec, ) { let cols = rules.entry(table_key.to_string()).or_default(); - let col_key = col_key.to_lowercase(); + let col_key = normalize_rules_column_key(col_key); match cols.get(&col_key) { None => { cols.insert(col_key, spec); @@ -677,25 +690,61 @@ impl TableRowReservoir { } fn flush_into_rules( - self, + &self, table_key: &str, max_json_depth: usize, rules: &mut HashMap>, ) { - let Some(columns) = self.columns else { + let Some(columns) = &self.columns else { return; }; - for row in self.rows { + for row in &self.rows { for (i, raw) in row.iter().enumerate() { let col = columns.get(i).map(|s| s.as_str()).unwrap_or(""); scaffold_consider_json_column_cell(table_key, col, raw, max_json_depth, rules); } } } + + /// For columns whose names are only weakly "name-like", add a `name` rule when sampled cell + /// text looks like a person-style string (English-oriented; many false negatives/positives). + fn flush_name_hints_from_samples( + &self, + table_key: &str, + rules: &mut HashMap>, + ) { + let Some(columns) = &self.columns else { + return; + }; + if self.rows.is_empty() { + return; + } + let table_only = scaffold_bare_table_from_key(table_key); + for (col_idx, col) in columns.iter().enumerate() { + let norm = col.to_ascii_lowercase().replace('-', "_"); + if !infer_ambiguous_name_column_for_sampling(table_only, &norm) { + continue; + } + let matched = self.rows.iter().any(|row| { + row.get(col_idx) + .map(|cell| looks_like_person_name_literal(cell)) + .unwrap_or(false) + }); + if matched { + scaffold_merge_rule( + rules, + table_key, + col.as_str(), + base_spec("name", Some(true)), + ); + } + } + } } -/// One streaming pass over a SQL dump: collect `[rules]` from column names and (optionally) sampled -/// row values. Conflicting rule keys keep the first strategy seen. 
+/// One streaming pass over a SQL dump: collect `[rules]` from column names and (when INSERT/COPY +/// data is present) up to [`SCAFFOLD_JSON_RESERVOIR_SIZE`] reservoir rows per table for JSON path +/// hints and for **value-checked** weak `name` columns. Conflicting rule keys keep the first strategy seen. pub fn discover_scaffold_rules( reader: &mut R, format: DumpFormat, @@ -722,7 +771,7 @@ pub fn discover_scaffold_rules( columns: &[String], ) { for column in columns { - if let Some(spec) = infer_scaffold_strategy(column) { + if let Some(spec) = infer_scaffold_strategy_for_table(table, column) { let table_key = scaffold_table_key(schema, table); scaffold_merge_rule(rules, &table_key, column, spec); } @@ -763,29 +812,22 @@ pub fn discover_scaffold_rules( &table, &columns, ); - if options.infer_json_paths { - let table_key = - scaffold_table_key(schema.as_deref(), &table); - let r = - reservoir_for_table(&mut table_reservoirs, &table_key); - r.set_columns(columns.clone()); - if let Some(idx) = - find_ignore_ascii_case(rest_after_cols, "VALUES") - { - let after_values = - &rest_after_cols[idx + "VALUES".len()..]; - let values_block = - strip_trailing_semicolon(after_values.trim()); - if let Ok(rows) = parse_values_rows(values_block) { - for row in rows { - let cells: Vec = row - .iter() - .map(|c| { - c.original.clone().unwrap_or_default() - }) - .collect(); - r.push_row(cells, &mut rng); - } + let table_key = scaffold_table_key(schema.as_deref(), &table); + let r = reservoir_for_table(&mut table_reservoirs, &table_key); + r.set_columns(columns.clone()); + if let Some(idx) = + find_ignore_ascii_case(rest_after_cols, "VALUES") + { + let after_values = &rest_after_cols[idx + "VALUES".len()..]; + let values_block = + strip_trailing_semicolon(after_values.trim()); + if let Ok(rows) = parse_values_rows(values_block) { + for row in rows { + let cells: Vec = row + .iter() + .map(|c| c.original.clone().unwrap_or_default()) + .collect(); + r.push_row(cells, &mut rng); } } } 
@@ -821,11 +863,9 @@ pub fn discover_scaffold_rules( &table, &columns, ); - if options.infer_json_paths { - let table_key = scaffold_table_key(schema.as_deref(), &table); - let r = reservoir_for_table(&mut table_reservoirs, &table_key); - r.set_columns(columns.clone()); - } + let table_key = scaffold_table_key(schema.as_deref(), &table); + let r = reservoir_for_table(&mut table_reservoirs, &table_key); + r.set_columns(columns.clone()); mode = Mode::InCopy { schema, table, @@ -850,24 +890,21 @@ pub fn discover_scaffold_rules( &table, &columns, ); - if options.infer_json_paths { - let table_key = scaffold_table_key(schema.as_deref(), &table); - let r = reservoir_for_table(&mut table_reservoirs, &table_key); - r.set_columns(columns.clone()); - if let Some(idx) = - find_ignore_ascii_case(rest_after_cols, "VALUES") - { - let after_values = &rest_after_cols[idx + "VALUES".len()..]; - let values_block = - strip_trailing_semicolon(after_values.trim()); - if let Ok(rows) = parse_values_rows(values_block) { - for row in rows { - let cells: Vec = row - .iter() - .map(|c| c.original.clone().unwrap_or_default()) - .collect(); - r.push_row(cells, &mut rng); - } + let table_key = scaffold_table_key(schema.as_deref(), &table); + let r = reservoir_for_table(&mut table_reservoirs, &table_key); + r.set_columns(columns.clone()); + if let Some(idx) = find_ignore_ascii_case(rest_after_cols, "VALUES") + { + let after_values = &rest_after_cols[idx + "VALUES".len()..]; + let values_block = + strip_trailing_semicolon(after_values.trim()); + if let Ok(rows) = parse_values_rows(values_block) { + for row in rows { + let cells: Vec = row + .iter() + .map(|c| c.original.clone().unwrap_or_default()) + .collect(); + r.push_row(cells, &mut rng); } } } @@ -885,7 +922,7 @@ pub fn discover_scaffold_rules( } => { if line.trim_end() == "\\." 
{ mode = Mode::Pass; - } else if options.infer_json_paths { + } else { let line_body = line.trim_end_matches(['\n', '\r']); let fields: Vec<&str> = line_body.split('\t').collect(); let table_key = scaffold_table_key(schema.as_deref(), table); @@ -919,16 +956,18 @@ pub fn discover_scaffold_rules( } } - if options.infer_json_paths { - for (table_key, reservoir) in table_reservoirs { + for (table_key, reservoir) in table_reservoirs { + if options.infer_json_paths { reservoir.flush_into_rules(&table_key, options.max_json_depth, &mut rules); } + reservoir.flush_name_hints_from_samples(&table_key, &mut rules); } Ok(rules) } -/// Same as [`discover_scaffold_rules`] with default options (name-based columns only, no row sampling). +/// Same as [`discover_scaffold_rules`] with default options (`infer_json_paths` off). Still reads +/// up to [`SCAFFOLD_JSON_RESERVOIR_SIZE`] INSERT/COPY rows per table when present for weak name hints. pub fn discover_scaffold_column_rules( reader: &mut R, format: DumpFormat, @@ -973,6 +1012,14 @@ fn infer_scaffold_from_leaf_segment_and_sample( segment_name: &str, sample: &str, ) -> Option { + let norm = segment_name.to_ascii_lowercase().replace('-', "_"); + if infer_strong_name_column(&norm) { + return Some(base_spec("name", Some(true))); + } + if infer_ambiguous_name_column_for_sampling("", &norm) && looks_like_person_name_literal(sample) + { + return Some(base_spec("name", Some(true))); + } infer_scaffold_strategy(segment_name) .or_else(|| infer_scaffold_from_address_like_literal(sample)) .or_else(|| infer_scaffold_from_literal_sample(sample)) @@ -1999,7 +2046,7 @@ fn is_sensitive_candidate( column: &str, ) -> bool { is_explicit_sensitive_column(cfg, schema, table, column) - || infer_auto_strategy(column).is_some() + || infer_auto_strategy_with_table(table, column).is_some() } fn is_explicitly_covered_column( @@ -2024,7 +2071,7 @@ fn qualified_column_name(schema: Option<&str>, table: &str, column: &str) -> Str } } -fn 
infer_auto_strategy(column: &str) -> Option { +fn infer_auto_strategy_with_table(_table: &str, column: &str) -> Option { let normalized = column.to_ascii_lowercase().replace('-', "_"); let spec = if normalized.contains("email") { base_spec("email", Some(true)) @@ -2039,16 +2086,12 @@ fn infer_auto_strategy(column: &str) -> Option { || normalized.contains("family_name") { base_spec("last_name", Some(true)) - } else if normalized.contains("name") { + } else if infer_strong_name_column(&normalized) { base_spec("name", Some(true)) - } else if normalized.contains("phone") - || normalized.contains("mobile") - || normalized.contains("cell") - { + } else if infer_phone_strategy_tokens(&normalized) { base_spec("phone", Some(true)) - } else if scaffold_address_like_segment(&normalized) { - base_spec("redact", Some(true)) - } else if normalized.contains("password") + } else if scaffold_address_like_segment(&normalized) + || normalized.contains("password") || normalized == "pass" || normalized.contains("secret") || normalized.contains("token") @@ -2061,7 +2104,7 @@ fn infer_auto_strategy(column: &str) -> Option { || normalized.contains("routing") || normalized.contains("account_number") { - base_spec("hash", Some(true)) + base_spec("redact", Some(true)) } else if normalized == "dob" || normalized.contains("date_of_birth") || normalized.contains("birth_date") @@ -2082,6 +2125,155 @@ fn infer_auto_strategy(column: &str) -> Option { Some(spec) } +fn scaffold_table_skips_bare_name(table: &str) -> bool { + let t = table.to_ascii_lowercase(); + t == "auth_group" + || t.starts_with("wagtail") + || t.starts_with("waffle_") + || t.contains("wagtailembeds") +} + +/// Column names that almost always refer to a human display name; no cell sampling required. 
+fn infer_strong_name_column(normalized: &str) -> bool {
+    // Substring markers that identify a human display name without any cell sampling.
+    // The visible callers pass a lowercased column name with `-` already replaced by `_`.
+    const STRONG_NAME_MARKERS: [&str; 9] = [
+        "full_name",
+        "fullname",
+        "display_name",
+        "displayname",
+        "legal_name",
+        "legalname",
+        "maiden_name",
+        "cardholder_name",
+        "account_holder_name",
+    ];
+    STRONG_NAME_MARKERS.iter().any(|marker| normalized.contains(*marker))
+}
+
+/// Weak "name-ish" columns: confirm with [`looks_like_person_name_literal`] on sampled cells.
+///
+/// Returns `false` for strong name columns, for known non-person `*name*` substrings, for a
+/// bare `name` column on skip-listed tables or tables ending in `_grade`, and for any column
+/// with a `mime` / `mimetype` token; otherwise `true` when `normalized` contains `name`.
+fn infer_ambiguous_name_column_for_sampling(table: &str, normalized: &str) -> bool {
+    if infer_strong_name_column(normalized) || name_substring_false_positive(normalized) {
+        return false;
+    }
+    // Split once; the bare-name and `mime` checks both work on snake_case tokens.
+    let segs: Vec<&str> = normalized.split('_').filter(|s| !s.is_empty()).collect();
+    // A column that is exactly `name` (ignoring stray underscores) on a skip-listed table.
+    if segs == ["name"] && scaffold_table_skips_bare_name(table) {
+        return false;
+    }
+    // A bare `name` column on a table whose name ends in `_grade`.
+    if normalized == "name" && table.to_ascii_lowercase().ends_with("_grade") {
+        return false;
+    }
+    // Any `mime` / `mimetype` token rules the column out.
+    if segs.iter().any(|s| matches!(*s, "mime" | "mimetype")) {
+        return false;
+    }
+    // A plain substring test decides the rest: it also covers `name` / `names` tokens and
+    // single tokens ending in `name`, so no separate token-level checks are needed.
+    normalized.contains("name")
+}
+
+/// Heuristic: free-text that plausibly holds a personal or display name (not email, not URL-like).
+fn looks_like_person_name_literal(sample: &str) -> bool {
+    // Returns `true` only when the trimmed sample survives every rejection guard below and
+    // then satisfies one of two acceptance rules: at least two words that are not all
+    // written in ASCII capitals, or a mostly-alphabetic value of 8 to 32 non-whitespace
+    // characters that contains an ASCII vowel or `y`.
+    let t = sample.trim();
+    // Cheap guards on the trimmed text: too short, too long (byte lengths), or multi-line.
+    if t.len() < 2 || t.len() > 200 || t.contains('\n') {
+        return false;
+    }
+    // A value that another literal heuristic already classifies is not treated as a name.
+    if infer_scaffold_from_literal_sample(t).is_some() {
+        return false;
+    }
+    // Placeholder values, compared ASCII-case-insensitively.
+    let tl = t.to_ascii_lowercase();
+    if matches!(
+        tl.as_str(),
+        "n/a" | "na" | "null" | "none" | "tbd" | "unknown" | "undefined"
+    ) {
+        return false;
+    }
+    // Email-like or URL-like text; `://` already covers `http://` and `https://` prefixes.
+    if t.contains('@') || t.contains("://") {
+        return false;
+    }
+    // The ratio checks below count `char`s on both sides of each comparison: `str::len` is a
+    // byte length, so mixing it with character counts skews the shares for non-ASCII text.
+    let non_ws_chars = t.chars().filter(|c| !c.is_whitespace()).count();
+    if non_ws_chars == 0 {
+        return false;
+    }
+    // Reject values in which more than a third of the non-whitespace characters are digits.
+    let digit_count = t.chars().filter(|c| c.is_ascii_digit()).count();
+    if digit_count * 3 > non_ws_chars {
+        return false;
+    }
+    // Whitespace-separated tokens with at least two alphabetic characters count as words.
+    let letter_tokens: Vec<&str> = t
+        .split_whitespace()
+        .filter(|tok| tok.chars().filter(|c| c.is_alphabetic()).count() >= 2)
+        .collect();
+    if letter_tokens.len() >= 2 {
+        // Two or more words: accept unless every word is written in ASCII capitals only.
+        let all_caps_words = letter_tokens.iter().all(|tok| {
+            // ASCII letters only, so the byte length equals the letter count here.
+            let letters: String = tok.chars().filter(|c| c.is_ascii_alphabetic()).collect();
+            letters.len() >= 2 && letters.chars().all(|c| c.is_ascii_uppercase())
+        });
+        return !all_caps_words;
+    }
+    // Fewer than two words: accept only 8..=32 non-whitespace characters of which at least
+    // 80% are letters, hyphens, or apostrophes, and which include an ASCII vowel or `y`.
+    let name_chars = t
+        .chars()
+        .filter(|c| c.is_alphabetic() || matches!(c, '-' | '\''))
+        .count();
+    (8..=32).contains(&non_ws_chars)
+        && name_chars * 10 >= non_ws_chars * 8
+        && t.chars().any(|c| {
+            matches!(
+                c,
+                'a' | 'e' | 'i' | 'o' | 'u' | 'y' | 'A' | 'E' | 'I' | 'O' | 'U' | 'Y'
+            )
+        })
+}
+
+/// True when `normalized` has a snake_case token that clearly denotes a phone field (avoids
+/// matching `cell` inside `cancelled`, `cancellation`, etc.).
+fn infer_phone_strategy_tokens(normalized: &str) -> bool { + for seg in normalized.split('_').filter(|s| !s.is_empty()) { + if matches!( + seg, + "phone" | "phones" | "mobile" | "cell" | "tel" | "cellphone" | "telephone" | "fax" + ) { + return true; + } + } + false +} + +fn name_substring_false_positive(norm: &str) -> bool { + norm.contains("hostname") + || norm.contains("mimetype") + || norm.contains("namespace") + || norm.contains("classname") + || norm.contains("typename") + || norm.contains("codename") + || norm.contains("filename") + || norm.ends_with("rename") + || norm.contains("microphone") + || norm.contains("headphone") +} + fn base_spec(strategy: &str, as_string: Option) -> AnonymizerSpec { AnonymizerSpec { strategy: strategy.to_string(), @@ -4079,9 +4271,29 @@ COPY public.users (id, user_email, notes) FROM stdin; assert_eq!(email.strategy, "email"); } + #[test] + fn discover_scaffold_infers_ambiguous_name_from_row_samples_without_json_flag() { + let input = r#" +INSERT INTO public.contacts (id, provider_name, customer_name) VALUES + (1, 'YouTube', 'Ada Lovelace'); +COPY public.contacts (id, provider_name, customer_name) FROM stdin; +2 Vimeo Bob Smith +\. 
+"#; + let mut reader = std::io::BufReader::new(input.as_bytes()); + let rules = discover_scaffold_column_rules(&mut reader, DumpFormat::Postgres).unwrap(); + let t = rules.get("public.contacts").expect("public.contacts"); + assert!( + !t.contains_key("provider_name"), + "short provider tokens must not satisfy person-name literal heuristic: {:?}", + t.keys().collect::>() + ); + assert_eq!(t.get("customer_name").unwrap().strategy, "name"); + } + #[test] fn discover_scaffold_rules_infer_json_paths() { - let input = r#"INSERT INTO app.events (id, payload) VALUES (1, '{"profile":{"contact_email":"x@y.z"},"meta":"y"}'); + let input = r#"INSERT INTO app.events (id, payload) VALUES (1, '{"profile":{"contactEmail":"x@y.z"},"meta":"y"}'); "#; let mut reader = std::io::BufReader::new(input.as_bytes()); let opts = ScaffoldDiscoverOptions { @@ -4091,12 +4303,12 @@ COPY public.users (id, user_email, notes) FROM stdin; let rules = discover_scaffold_rules(&mut reader, DumpFormat::Postgres, &opts).unwrap(); let t = rules.get("app.events").expect("app.events"); assert!( - t.contains_key("payload.profile.contact_email"), - "expected nested JSON rule key, got {:?}", + t.contains_key("payload.profile.contactEmail"), + "expected nested JSON rule key preserving JSON key case, got {:?}", t.keys().collect::>() ); assert_eq!( - t.get("payload.profile.contact_email").unwrap().strategy, + t.get("payload.profile.contactEmail").unwrap().strategy, "email" ); assert!( @@ -4126,4 +4338,88 @@ COPY public.users (id, user_email, notes) FROM stdin; let note = rules.get("t").unwrap().get("note").unwrap(); assert_eq!(note.strategy, "redact"); } + + #[test] + fn scaffold_infer_cancelled_at_datetime_not_phone() { + let spec = + infer_scaffold_strategy_for_table("shopify_shopifyorder", "cancelled_at").unwrap(); + assert_eq!(spec.strategy, "datetime_fuzz"); + } + + #[test] + fn scaffold_infer_secrets_default_to_redact() { + let spec = infer_scaffold_strategy_for_table("users", "password_hash").unwrap(); + 
assert_eq!(spec.strategy, "redact"); + } + + #[test] + fn scaffold_infer_auth_group_name_skipped() { + assert!(infer_scaffold_strategy_for_table("auth_group", "name").is_none()); + } + + #[test] + fn scaffold_infer_wagtail_provider_column_skipped() { + assert!( + infer_scaffold_strategy_for_table("wagtailembeds_embed", "provider_name").is_none() + ); + } + + #[test] + fn pipeline_anonymizes_nested_json_paths_with_camel_case_rule_keys() { + use crate::settings::normalize_rules_column_key; + let mut rules: HashMap> = HashMap::new(); + let mut cols: HashMap = HashMap::new(); + cols.insert( + normalize_rules_column_key("payload.Profile.secretToken"), + AnonymizerSpec { + strategy: "string".to_string(), + salt: None, + min: None, + max: None, + scale: None, + length: Some(8), + min_days: None, + max_days: None, + min_seconds: None, + max_seconds: None, + domain: Some("secrets".to_string()), + unique_within_domain: None, + as_string: Some(true), + locale: None, + faker: None, + format: None, + }, + ); + rules.insert("public.events".to_string(), cols); + let cfg = ResolvedConfig { + salt: None, + rules, + row_filters: HashMap::new(), + column_cases: HashMap::new(), + sensitive_columns: HashMap::new(), + output_scan: crate::settings::OutputScanConfig::default(), + pg_restore: crate::settings::PgRestoreConfig::default(), + keep_original: None, + source_path: None, + }; + let reg = AnonymizerRegistry::from_config(&cfg); + let mut proc = SqlStreamProcessor::new(reg, cfg, None, DumpFormat::Postgres); + let input = r#" +CREATE TABLE public.events (id int, payload jsonb); +INSERT INTO public.events (id, payload) VALUES + (1, '{"Profile":{"secretToken":"alpha"}}'); + +COPY public.events (id, payload) FROM stdin; +2 {"Profile":{"secretToken":"alpha"}} +\. 
+"#; + let mut reader = std::io::BufReader::new(input.as_bytes()); + let mut out = Vec::new(); + proc.process(&mut reader, &mut out).unwrap(); + let s = String::from_utf8(out).unwrap(); + assert!( + !s.contains("alpha"), + "nested camelCase path should be anonymized, got:\n{s}" + ); + } }