Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 1 addition & 1 deletion docs/src/getting-started.md
Original file line number Diff line number Diff line change
Expand Up @@ -31,7 +31,7 @@ dumpling --help
dumpling scaffold-config -i dump.sql -o .dumplingconf
```

This **beta** subcommand streams the dump once and writes inferred `[rules]` from SQL column names (`CREATE TABLE`, `INSERT`, and PostgreSQL `COPY` column lists). Heuristics are **English-oriented**; output is **draft only**—review and edit every rule, add a top-level **`salt`** (for hashing) and any **`${…}`** secret placeholders before production use.
This **beta** subcommand streams the dump once and writes inferred `[rules]` from SQL column names (`CREATE TABLE`, `INSERT`, and PostgreSQL `COPY` column lists). It does **not** require an existing Dumpling config in the current directory; if one is found (or passed explicitly), it is used only to merge `pg_restore` and keep-original defaults. Heuristics are **English-oriented**; output is **draft only**—review and edit every rule, add a top-level **`salt`** (for hashing) and any **`${…}`** secret placeholders before production use.

Useful flags:

Expand Down
11 changes: 10 additions & 1 deletion src/main.rs
Original file line number Diff line number Diff line change
Expand Up @@ -6,6 +6,7 @@ use std::sync::mpsc::sync_channel;
use std::thread::JoinHandle;
use std::time::Instant;

use anyhow::Context;
use clap::{ArgAction, Parser, Subcommand};

mod compressed_input;
Expand Down Expand Up @@ -232,7 +233,15 @@ fn main() -> anyhow::Result<()> {
other
),
};
let resolved_for_pg = settings::load_config(cli.config.as_ref(), false)?;
let resolved_for_pg = settings::load_config(cli.config.as_ref(), true).with_context(|| {
"loading config for scaffold-config (pg_restore / keep-original merge only; use -c if an explicit file fails to load)"
})?;
if cli.config.is_none() && resolved_for_pg.source_path.is_none() {
eprintln!(
"dumpling scaffold-config: note: no Dumpling config found in the current directory; \
using defaults for pg_restore and keep-original hints"
);
}
let (pg_restore_path_eff, pg_restore_arg_eff) = settings::merge_pg_restore_cli(
&resolved_for_pg.pg_restore,
pg_restore_path.clone(),
Expand Down
24 changes: 14 additions & 10 deletions src/scaffold.rs
Original file line number Diff line number Diff line change
Expand Up @@ -45,13 +45,10 @@ pub fn run_scaffold_config(opts: ScaffoldConfigOptions) -> anyhow::Result<()> {
} = opts;

eprintln!(
"dumpling scaffold-config: beta — draft rules from column names{}; review before use. \
"dumpling scaffold-config: beta — draft rules from column names plus up to {} \
reservoir rows per table for JSON path hints and name-like column value checks; review before use. \
Heuristics are English-oriented and miss opaque or non-English names.",
if infer_json_paths {
", reservoir-sampled JSON paths (~5 rows/table)"
} else {
" only"
}
crate::sql::SCAFFOLD_JSON_RESERVOIR_SIZE
);

let mut pg_restore_child: Option<pg_restore_decode::PgRestoreDecodeProcess> = None;
Expand Down Expand Up @@ -104,10 +101,15 @@ pub fn run_scaffold_config(opts: ScaffoldConfigOptions) -> anyhow::Result<()> {
eprintln!(
"dumpling scaffold-config: warning: no rules inferred; emitted file contains header only"
);
} else if infer_json_paths {
} else {
eprintln!(
"dumpling scaffold-config: reservoir sample ({} rows max per table) for JSON path hints",
crate::sql::SCAFFOLD_JSON_RESERVOIR_SIZE
"dumpling scaffold-config: reservoir sample ({} rows max per table) for name-like column value checks{}",
crate::sql::SCAFFOLD_JSON_RESERVOIR_SIZE,
if infer_json_paths {
" and nested JSON path hints"
} else {
""
}
);
}

Expand Down Expand Up @@ -174,7 +176,9 @@ const SCAFFOLD_HEADER: &str = r#"# Dumpling starter config (beta) — generated
#
# Inferred [rules]: SQL column names (CREATE TABLE, INSERT, COPY) plus optional nested JSON paths
# when generated with `--infer-json-paths` (dot-separated keys: `payload.profile.email`).
# Name heuristics are English-oriented; JSON leaf inference uses segment names and light literals.
# JSON path segments keep the casing from the sampled payload (e.g. camelCase API fields).
# Name heuristics are English-oriented: unmistakable column names map directly; other "*name*"
# columns only become `strategy = "name"` when sampled INSERT/COPY text looks person-shaped.
# Review every rule; add salt for hash strategies and extend row_filters / column_cases as needed.
#

Expand Down
95 changes: 90 additions & 5 deletions src/settings.rs
Original file line number Diff line number Diff line change
Expand Up @@ -121,7 +121,8 @@ pub struct AnonymizerSpec {
#[derive(Debug, Clone, Default)]
pub struct ResolvedConfig {
pub salt: Option<String>,
/// Normalized rule map: lowercase keys for table and column names
/// Normalized rule map: lowercase table keys; column keys are lowercased for plain SQL columns,
/// but JSON path segments after the first `.` / `__` keep their authored casing for runtime JSON matching.
pub rules: HashMap<String, HashMap<String, AnonymizerSpec>>,
/// Normalized row filters per table
pub row_filters: HashMap<String, RowFilterSet>,
Expand Down Expand Up @@ -529,7 +530,7 @@ fn resolve(raw: RawConfig, source_path: Option<PathBuf>) -> ResolvedConfig {
let mut col_map: HashMap<String, AnonymizerSpec> = HashMap::new();
for (col, mut spec) in cols.into_iter() {
spec.strategy = spec.strategy.to_ascii_lowercase();
col_map.insert(col.to_lowercase(), spec);
col_map.insert(normalize_rules_column_key(&col), spec);
}
normalized_rules.insert(table_key_norm, col_map);
}
Expand All @@ -549,7 +550,7 @@ fn resolve(raw: RawConfig, source_path: Option<PathBuf>) -> ResolvedConfig {
c
})
.collect();
inner.insert(col.to_lowercase(), cases);
inner.insert(normalize_rules_column_key(&col), cases);
}
normalized_cases.insert(table_key_norm, inner);
}
Expand Down Expand Up @@ -949,11 +950,62 @@ fn validate_anonymizer_spec(spec: &AnonymizerSpec, path: &str) -> anyhow::Result
Ok(())
}

/// Normalize a `[rules]` / `[column_cases]` column key for [`ResolvedConfig`]: the leading SQL
/// column identifier is lowercased; nested JSON path segments keep their authored casing so rules
/// match object keys in dump JSON (e.g. camelCase from APIs).
///
/// The key is trimmed first. The `__` separator is tried before `.`; the first separator that
/// yields a non-empty base column and at least one non-empty path segment wins. Each segment is
/// trimmed and empty segments are dropped, so `" Payload . Profile "` becomes `payload.Profile`.
/// Keys without a usable JSON path are ASCII-lowercased as a whole.
pub fn normalize_rules_column_key(col: &str) -> String {
    let col = col.trim();
    normalize_json_path_key(col, "__")
        .or_else(|| normalize_json_path_key(col, "."))
        .unwrap_or_else(|| col.to_ascii_lowercase())
}

/// Rebuild `col` as `<lowercased base><sep><segment>...` when splitting on `sep` produces a
/// non-empty base and at least one non-empty path segment; otherwise return `None`.
fn normalize_json_path_key(col: &str, sep: &str) -> Option<String> {
    let mut pieces = col.split(sep).map(str::trim);
    // `split` always yields at least one item, so only an empty base can bail out here.
    let base = pieces.next().filter(|b| !b.is_empty())?;
    let path: Vec<&str> = pieces.filter(|p| !p.is_empty()).collect();
    if path.is_empty() {
        // Either `sep` is absent or every trailing segment was blank: not a JSON path key.
        return None;
    }
    let mut out = base.to_ascii_lowercase();
    for segment in path {
        out.push_str(sep);
        out.push_str(segment);
    }
    Some(out)
}

/// Split a rules column key into the SQL column name and optional JSON path segments.
///
/// Nested paths use the same syntax as row-filter predicates: `payload.profile.email` or
/// `payload__profile__email`. When no path is present, the entire key names one SQL column.
/// Keys are compared case-insensitively after normalization (lowercase).
/// The base SQL column is compared case-insensitively (lowercase); JSON path segments use the
/// casing stored in the resolved config key (must match JSON object keys in the cell).
pub fn parse_json_column_key(column_key: &str) -> (String, Vec<String>) {
let trim_parts = |parts: &[&str]| -> Option<(String, Vec<String>)> {
if parts.len() < 2 {
Expand Down Expand Up @@ -1251,7 +1303,9 @@ pub fn is_explicit_sensitive_column(

#[cfg(test)]
mod tests {
use super::{load_config, resolve_secrets_in_value, ConfigPathSegment};
use super::{
load_config, normalize_rules_column_key, resolve_secrets_in_value, ConfigPathSegment,
};
use std::fs;
use std::path::{Path, PathBuf};
use std::time::{SystemTime, UNIX_EPOCH};
Expand Down Expand Up @@ -1816,4 +1870,35 @@ email = { strategy = "hash", locale = "fr_fr" }
assert!(msg.contains("hash"));
let _ = fs::remove_file(path);
}

#[test]
fn normalize_rules_column_key_preserves_json_path_case() {
assert_eq!(
normalize_rules_column_key("Payload.Profile.ContactEmail"),
"payload.Profile.ContactEmail"
);
assert_eq!(
normalize_rules_column_key("payload__Profile__contactEmail"),
"payload__Profile__contactEmail"
);
assert_eq!(normalize_rules_column_key("Email"), "email");
}

#[test]
fn load_config_preserves_json_path_case_in_resolved_rules() {
let path = write_temp_config(
r#"
salt = "testsalt"
[rules."public.orders"]
"payload.shipTo.fullName" = { strategy = "redact", as_string = true }
"#,
);
let cfg = load_config(Some(&path), false).expect("load");
let cols = cfg.rules.get("public.orders").expect("table");
let spec = cols
.get("payload.shipTo.fullName")
.expect("expected camelCase path segments in resolved map key");
assert_eq!(spec.strategy, "redact");
let _ = fs::remove_file(path);
}
}
Loading
Loading