diff --git a/.github/workflows/rust.yaml b/.github/workflows/rust.yaml index d4bae52..ae264ae 100644 --- a/.github/workflows/rust.yaml +++ b/.github/workflows/rust.yaml @@ -4,7 +4,6 @@ on: push: branches: [ "master" ] pull_request: - branches: [ "master" ] env: CARGO_TERM_COLOR: always diff --git a/.gitignore b/.gitignore index 3f87625..1191268 100644 --- a/.gitignore +++ b/.gitignore @@ -2,3 +2,5 @@ /Cargo.lock .DS_Store .idea +tmp/ +coverage/ diff --git a/CLAUDE.md b/CLAUDE.md new file mode 100644 index 0000000..0824eca --- /dev/null +++ b/CLAUDE.md @@ -0,0 +1,169 @@ +# CLAUDE.md + +## Project + +Rust workspace: `sql-insight` library + `sql-insight-cli`. SQL parsing is +built on `sqlparser-rs`; always work against its AST, never re-parse SQL +by hand. + +## Commands + +- Format: `cargo fmt` +- Test: `cargo test --all` +- Lint: `cargo clippy --all-targets -- -D warnings` (zero-warning policy) + +## Architecture + +- The `resolver` module walks a `Statement` once and produces a + `Resolution`: + - a scope arena of `Binding`s (`Table` / `Cte` / `DerivedTable` / + `TableFunction`), + - a buffer of `RawColumnRef`s captured at walk time with + resolved-table + synthetic-vs-real + clause-kind metadata, + - a buffer of `FlowEdge`s emitted directly during the walk. + Two post-passes on `into_resolution` compose the flow graph + end-to-end through CTE / derived intermediates and filter reads + down to references whose walk-time owner was a real `Table`. + Sub-modules are split by responsibility: `binding` (scope arena), + `context` (`VisitContext`), `column_ref`, `projection`, `flow`, + `composition`, `rename`; walker files (`expr` / `query` / + `statement` / `table`) live as siblings and add `visit_*` methods + via `impl Resolver` blocks. +- Pull-style design: `resolve_query` returns a `ResolvedQuery` + carrying the body's `projections: Vec`. Callers + (visit_insert / CTAS / scalar subqueries / etc.) 
decide what to do + with them — pair with target columns, emit `QueryOutput` edges, + bubble up through `SetExpr::Query`, etc. +- The resolver takes an optional `&dyn Catalog`. With a catalog, + Table bindings come back with `Known` schemas and unqualified + column resolution becomes strict (typos surface as `table: None`). + Without a catalog the resolver is best-effort. +- Extractors consume the resolver's output: + - `table_extractor` — flat list of `TableReference`s (legacy API). + - `crud_table_extractor` — CRUD-bucketed tables (legacy API). + - `table_operation_extractor` — `extract_table_operations` returns + `TableOperation { statement_kind, reads, writes, + lineage, diagnostics }` per parsed statement. + - `column_operation_extractor` — `extract_column_operations` + returns `ColumnOperation { statement_kind, reads, + writes, lineage, diagnostics }` at column granularity. `reads` / + `writes` are plain occurrence lists; `lineage` edges carry + `kind: ColumnLineageKind`. +- Per-statement output convention: extractors return + `Vec<Result<…>>` so one bad statement does not kill the + rest. + +## Vocabulary + +- `TableOperation` carries three parallel surfaces: + - `reads: Vec<TableReference>` — every table the statement reads + from (occurrence-based; a table read more than once appears more + than once). + - `writes: Vec<TableReference>` — every table the statement writes + to. + - `lineage: Vec<…>` — directed `source → target` + edges, only for statements that physically move data (INSERT / + UPDATE / MERGE / CTAS / CREATE VIEW). A table that plays both + roles (e.g. `DELETE t1 FROM t1`) appears in both `reads` and + `writes`. +- `ColumnOperation` mirrors the same surfaces at column + granularity: + - `reads: Vec<ColumnReference>` — every column reference, as a + plain occurrence list with no clause tag. References whose + walk-time owning binding was synthetic (CTE / derived / table + function) are dropped — only real-storage references and + unresolved names surface. 
+ - `writes: Vec<ColumnReference>` — INSERT column lists, UPDATE SET + targets, CTAS / CREATE VIEW / ALTER VIEW columns, MERGE + WHEN-clause writes. + - `lineage: Vec<…>` — `source → target` edges with + `kind: ColumnLineageKind` (`Passthrough` / `Transformation`). + Sources flowing through CTE / derived intermediates are composed + end-to-end; composition yields `Transformation` if any step + transforms. Targets: `QueryOutput { name, position }` for + transient SELECT outputs, `Relation(ColumnReference)` for + writes into a named relation (table or view). +- The value-vs-filter distinction is structural, not a tag: a value + contributor is a `lineage` source; a filter-only column is in + `reads` but not `lineage`. +- `StatementKind` — the verb of the statement; combined with the + `reads` / `writes` split recovers every granularity distinction. +- Internal-only `TableRole` (Read / Write) lives inside the resolver + for binding metadata. It is not exposed via the public API — + surface it through `reads` / `writes` instead. +- `TableReference` is identity-only (`catalog` / `schema` / `name`). + Alias is a use-site decoration, not part of a table's identity, + so `HashSet<TableReference>` dedup and cross-statement comparison + behave intuitively. Resolver bindings carry alias as a separate + field; the public API does not currently surface it. +- `ColumnReference` is identity-only too (`table: Option<TableReference>`, + `name: Ident`). `table` is `Option` for cases where resolution + fails (ambiguous, no candidate); the column name still surfaces. + +## Design conventions + +- Pull design: `resolve_query` collects facts (projections), callers + decide edge construction. Avoid pushing state from caller into + resolver via flag bags — instead expose helpers like + `with_filter_clause` / `with_branch_scope` for scoped, lexical + context. +- Walking-context state lives in `VisitContext` (just `scope_kind`) + — "in effect for the current visit", not "queued". 
Save / restore + goes through `with_context` (and the focused `with_branch_scope` / + `with_filter_clause` helpers) so the prior context is restored on + scope exit. `scope_kind` is preserved across a subquery boundary so + predicate-ness flows transitively. For owning per-query buffers + like `current_projections: Vec<…>`, `mem::replace` is used + instead. +- Wildcards (`SELECT *`, `t.*`) are not expanded at the parser + level — even with a catalog. The rigor cost (USING / NATURAL JOIN + merge, EXCLUDE / REPLACE / RENAME clauses, CTE column rename, + multi-segment qualifiers) is too high for a SQL-text-only library + to handle correctly. Wildcards contribute nothing to `reads` / + `lineage`; consumers needing per-column source → target lineage + either supply resolved query plans or do their own expansion. + +## Code conventions + +- Keep changes small and scoped. Preserve public API compatibility + unless an API change is intentional, and update doc comments when + it changes. +- **Public items deserve rustdoc** (`///` on items, `//!` on + modules / crates). State purpose, contract, edge cases, and + include examples where useful — rustdoc is the published API + surface and shows up in `cargo doc`, docs.rs, and IDE hovers. + Length is fine when it earns it. +- **Inline `//` comments**: keep them concise and well-structured. + Add a short example when it clarifies. +- Prefer private modules; export through explicit re-exports in + `lib.rs`. +- Avoid `bool` or ambiguous `Option` parameters in new public APIs. + Prefer enums, named methods, or small option structs. +- Avoid growing large modules. Split before a file becomes + unscannable. +- Keep `sqlparser-rs` AST `match` arms exhaustive in the resolver + and extractors — wildcard arms silently hide newly added variants. +- Public enums are **exhaustive (no `#[non_exhaustive]`) while pre-1.0** + (`StatementKind` / `ColumnLineageKind` / `ColumnTarget` / + `TableLevelDiagnosticKind` / `ColumnLevelDiagnosticKind`). 
Adding a + variant is therefore a breaking change on purpose — pre-1.0 that + rides a `0.x` bump and forces consumers to re-acknowledge the new + case rather than silently hitting a wildcard arm. Add + `#[non_exhaustive]` at the 1.0 freeze (removing it later is + non-breaking; adding it is breaking, so the 1.0 boundary is the + place). Keep internal `match`es exhaustive regardless. +- Diagnostics are split by extraction granularity: + `TableLevelDiagnostic` (only `UnsupportedStatement`) vs + `ColumnLevelDiagnostic` (adds `WildcardSuppressed` / + `AmbiguousColumn` / `UnresolvedColumn`). The resolver produces the + column-level superset; table-level surfaces project it down via + `ColumnLevelDiagnostic::to_table_level` (exhaustive match, so a new + column kind forces a table-level decision). +- For unsupported SQL, accumulate diagnostics instead of `?`-bailing + mid-walk. Reserve hard errors for genuinely unrecoverable + conditions. +- Tests: compare whole values (`assert_eq!(ops.reads, vec![...])`) + over field-by-field assertions. Use a layered helper convention + — `extract` → `extract_with(dialect)` → `extract_with_catalog( + dialect, catalog)` — so callsites stay terse and new parameters + fall through cleanly. diff --git a/README.md b/README.md index 7d77acb..b0c2cc6 100644 --- a/README.md +++ b/README.md @@ -1,7 +1,11 @@ # sql-insight -A utility for SQL query analysis, formatting, and transformation. -Leveraging the comprehensive parsing capabilities of [sqlparser-rs](https://github.com/sqlparser-rs/sqlparser-rs), it can handle various SQL dialects. +Operation extraction for SQL, built on +[sqlparser-rs](https://github.com/sqlparser-rs/sqlparser-rs). Turn a +SQL string into structured facts about what the statement does — +which tables and columns it reads, which it writes, and how data +moves from sources to targets — alongside utilities for formatting +and normalization. 
[![Crates.io](https://img.shields.io/crates/v/sql-insight.svg)](https://crates.io/crates/sql-insight) [![Docs.rs](https://docs.rs/sql-insight/badge.svg)](https://docs.rs/sql-insight) @@ -11,10 +15,30 @@ Leveraging the comprehensive parsing capabilities of [sqlparser-rs](https://gith ## Features -- **SQL Formatting**: Format SQL queries to standardized form, improving readability and maintainability. -- **SQL Normalization**: Convert SQL queries into a normalized form, making them easier to analyze and process. -- **Table Extraction**: Extract tables referenced in SQL queries, clarifying the data sources involved. -- **CRUD Table Extraction**: Identify the create, read, update, and delete operations, along with the tables involved in each operation within SQL queries. +- **Table-level Operation Extraction**: `reads` / `writes` / `lineage` + surfaces with statement-kind classification per parsed statement. +- **Column-level Operation Extraction**: the same three surfaces at + column granularity. `reads` / `writes` are plain occurrence lists + of column references; `lineage` forms a source → target graph, each + edge carrying a kind (`Passthrough` vs `Transformation`). The + value-vs-filter distinction is structural — a value contributor is a + `lineage` source, a filter-only column is in `reads` but not `lineage`. +- **Optional Catalog**: supply a schema provider to make resolution + strict — catch typos as unresolved references, pair INSERT + positional values with target columns. Every extractor still + works catalog-free in best-effort mode. +- **Diagnostics**: non-fatal issues (unsupported statements, + suppressed wildcards, ambiguous / unresolved columns) surface + alongside the result with optional source-location spans, rather + than failing the whole call. Split by granularity + (`TableLevelDiagnostic` / `ColumnLevelDiagnostic`) so a table-level + result never carries a column-only condition. 
+- **Table Extraction / CRUD Table Extraction**: flat or + CRUD-bucketed table sets — lightweight extraction when the + operation graph isn't needed. +- **SQL Formatting & Normalization**: pretty-print or normalize + queries (placeholder-substitute literals) for hashing and + comparison. ## Installation @@ -27,9 +51,69 @@ sql-insight = { version = "0.2.0" } ## Usage -### SQL Formatting +### Table-level Operation Extraction + +Get the statement kind plus `reads` / `writes` / `lineage` in one call: + +```rust +use sql_insight::sqlparser::dialect::GenericDialect; +use sql_insight::{extract_table_operations, StatementKind}; + +let dialect = GenericDialect {}; +let result = extract_table_operations( + &dialect, + "INSERT INTO orders (id) SELECT id FROM staging", + None, +).unwrap(); +let ops = result[0].as_ref().unwrap(); +assert_eq!(ops.statement_kind, StatementKind::Insert); +assert_eq!(ops.reads.len(), 1); // staging +assert_eq!(ops.writes.len(), 1); // orders +assert_eq!(ops.lineage.len(), 1); // staging → orders +``` + +### Column-level Operation Extraction + +Same surfaces, at column granularity. `reads` / `writes` are plain +occurrence lists of column references; `lineage` edges carry a kind +(`Passthrough` vs `Transformation`) describing how each source +reaches its target: + +```rust +use sql_insight::sqlparser::dialect::GenericDialect; +use sql_insight::extract_column_operations; + +let dialect = GenericDialect {}; +let result = extract_column_operations( + &dialect, + "INSERT INTO orders (id, total) SELECT id, SUM(amount) FROM staging GROUP BY id", + None, +).unwrap(); +let ops = result[0].as_ref().unwrap(); +// One lineage edge per target column: id → id (Passthrough), amount → total (Transformation, via SUM). +assert_eq!(ops.lineage.len(), 2); +``` -Format SQL queries according to different dialects: +### Diagnostics + +Non-fatal issues surface alongside the result. 
Each diagnostic carries +a `kind`, a human-readable `message`, and an optional source-location +`span`: + +```rust +use sql_insight::sqlparser::dialect::GenericDialect; +use sql_insight::{extract_column_operations, ColumnLevelDiagnosticKind}; + +let dialect = GenericDialect {}; +let result = extract_column_operations(&dialect, "SELECT * FROM users", None).unwrap(); +let ops = result[0].as_ref().unwrap(); +assert!(ops + .diagnostics + .iter() + .any(|d| matches!(d.kind, ColumnLevelDiagnosticKind::WildcardSuppressed))); +``` + +### SQL Formatting ```rust use sql_insight::sqlparser::dialect::GenericDialect; @@ -41,7 +125,8 @@ assert_eq!(formatted_sql, ["SELECT * FROM users WHERE id = 1"]); ### SQL Normalization -Normalize SQL queries to abstract away literals: +Substitute literals with placeholders so structurally identical +queries hash to the same shape: ```rust use sql_insight::sqlparser::dialect::GenericDialect; @@ -51,27 +136,21 @@ let normalized_sql = sql_insight::normalize(&dialect, "SELECT * \n from users assert_eq!(normalized_sql, ["SELECT * FROM users WHERE id = ?"]); ``` -### Table Extraction +### Table Extraction (lightweight) -Extract table references from SQL queries: +Flat list of table references touched by a statement: ```rust use sql_insight::sqlparser::dialect::GenericDialect; let dialect = GenericDialect {}; -let tables = sql_insight::extract_tables(&dialect, "SELECT * FROM catalog.schema.`users` as users_alias").unwrap(); -println!("{:?}", tables); -``` - -This outputs: - -``` -[Ok(Tables([TableReference { catalog: Some(Ident { value: "catalog", quote_style: None }), schema: Some(Ident { value: "schema", quote_style: None }), name: Ident { value: "users", quote_style: Some('`') }, alias: Some(Ident { value: "users_alias", quote_style: None }) }]))] +let extractions = sql_insight::extract_tables(&dialect, "SELECT * FROM catalog.schema.users").unwrap(); +println!("{:?}", extractions); ``` ### CRUD Table Extraction -Identify CRUD operations and the 
tables involved in each operation within SQL queries: +Bucket tables by create / read / update / delete role: ```rust use sql_insight::sqlparser::dialect::GenericDialect; @@ -81,11 +160,56 @@ let crud_tables = sql_insight::extract_crud_tables(&dialect, "INSERT INTO users println!("{:?}", crud_tables); ``` -This outputs: - -``` -[Ok(CrudTables { create_tables: [TableReference { catalog: None, schema: None, name: Ident { value: "users", quote_style: None }, alias: None }], read_tables: [TableReference { catalog: None, schema: None, name: Ident { value: "employees", quote_style: None }, alias: None }], update_tables: [], delete_tables: [] })] -``` +## Limitations and Behavior Notes + +A few intentional non-supports and behavior nuances that shape what +you can rely on: + +- **Wildcards (`SELECT *`, `t.*`) are not expanded** — they contribute + nothing to `reads` / `lineage` and surface as a `WildcardSuppressed` + diagnostic. +- **TableFunction schemas stay `Unknown`** (`UNNEST`, `JSON_TABLE`, + etc.) — catalog enrichment doesn't reach them yet. +- **Recursive CTE bodies** are pre-bound under a stub; lineage + composition through them is deferred. +- **Catalog is optional, but load-bearing for column lineage.** + Table-level extraction is robust catalog-free (a table's identity + comes straight from the FROM clause). Column-level extraction + degrades without one: an unqualified column across multiple + in-scope tables (`SELECT x FROM a JOIN b`) can't be attributed from + the SQL text alone, so it resolves to `table: None`. Qualified + (`t.col`) and single-table queries resolve fine catalog-free. The + ambiguous / unresolved-column diagnostics that would explain those + `None`s only fire *with* a catalog — without one they are + suppressed (every `Unknown` schema could contain anything, so + flagging would be noise). +- **No type checking** — the catalog is an enrichment input, not a + validator. 
+ +See the +[Limitations](https://docs.rs/sql-insight/latest/sql_insight/#limitations) +and +[Behavior notes](https://docs.rs/sql-insight/latest/sql_insight/#behavior-notes) +sections of the crate docs for the full set. + +## Examples + +Runnable examples under +[`sql-insight/examples/`](sql-insight/examples): + +- [`table_operations.rs`](sql-insight/examples/table_operations.rs) — + table-level `reads` / `writes` / `lineage` across a multi-statement + batch, with `StatementKind`-based dispatch. +- [`column_operations.rs`](sql-insight/examples/column_operations.rs) — + per-column reads and lineage classified by `ColumnLineageKind` + (Passthrough vs Transformation) into `Relation` vs `QueryOutput` + targets. +- [`with_catalog.rs`](sql-insight/examples/with_catalog.rs) — supplying + a `Catalog` enables INSERT positional column pairing and surfaces + `AmbiguousColumn` / `UnresolvedColumn` diagnostics that stay silent + without one. + +Run with `cargo run --example <name> -p sql-insight`. ## Supported SQL Dialects diff --git a/codecov.yml b/codecov.yml index 8251cfe..af7d8a5 100644 --- a/codecov.yml +++ b/codecov.yml @@ -4,6 +4,8 @@ coverage: default: target: 90% threshold: 10% + informational: true patch: default: threshold: 5% + informational: true diff --git a/sql-insight-cli/tests/integration.rs b/sql-insight-cli/tests/integration.rs index 2fe48b3..338371d 100644 --- a/sql-insight-cli/tests/integration.rs +++ b/sql-insight-cli/tests/integration.rs @@ -176,6 +176,17 @@ mod integration { .stderr(""); } + #[test] + fn test_extract_crud_tables_with_cte() { + sql_insight_cmd() + .arg("extract-crud") + .arg("with t2 as (select id from t1) select * from t2;") + .assert() + .success() + .stdout("Create: [], Read: [t1], Update: [], Delete: []\n") + .stderr(""); + } + #[test] fn test_extract_crud_tables_from_file() { let mut temp_file = NamedTempFile::new().unwrap(); @@ -215,7 +226,18 @@ mod integration { insert into catalog.schema.t1 (a) select b from catalog.schema.t2;") 
.assert() .success() - .stdout("catalog.schema.t1 AS t1, catalog.schema.t2 AS t2\ncatalog.schema.t1, catalog.schema.t2\n") + .stdout("catalog.schema.t1, catalog.schema.t2\ncatalog.schema.t1, catalog.schema.t2\n") + .stderr(""); + } + + #[test] + fn test_extract_tables_with_cte() { + sql_insight_cmd() + .arg("extract-tables") + .arg("with t2 as (select id from t1) select * from t2;") + .assert() + .success() + .stdout("t1\n") .stderr(""); } diff --git a/sql-insight/Cargo.toml b/sql-insight/Cargo.toml index 8befe77..9f64a51 100644 --- a/sql-insight/Cargo.toml +++ b/sql-insight/Cargo.toml @@ -25,3 +25,4 @@ path = "src/lib.rs" [dependencies] sqlparser = { version = "0.61.0", features = ["visitor"] } thiserror = "1.0.56" +indexmap = "2.6.0" diff --git a/sql-insight/examples/column_operations.rs b/sql-insight/examples/column_operations.rs new file mode 100644 index 0000000..422f2af --- /dev/null +++ b/sql-insight/examples/column_operations.rs @@ -0,0 +1,78 @@ +//! Column-level operation extraction. +//! +//! Run with: +//! +//! ```bash +//! cargo run --example column_operations -p sql-insight +//! ``` +//! +//! Demonstrates per-column lineage: classification by `ColumnLineageKind`, +//! `Relation` vs `QueryOutput` targets, and occurrence-based reads. 
+ +use sql_insight::sqlparser::dialect::GenericDialect; +use sql_insight::{extract_column_operations, ColumnLineageKind, ColumnTarget}; + +fn main() { + let dialect = GenericDialect {}; + let sql = "INSERT INTO orders (id, total) \ + SELECT order_id, SUM(amount) FROM staging GROUP BY order_id"; + + let results = extract_column_operations(&dialect, sql, None).unwrap(); + let ops = results[0].as_ref().expect("ok"); + + println!("--- {:?} ---", ops.statement_kind); + + println!("\nreads ({}):", ops.reads.len()); + for read in &ops.reads { + let table = read + .table + .as_ref() + .map(|t| t.name.value.as_str()) + .unwrap_or(""); + println!(" {}.{}", table, read.name.value); + } + + println!("\nlineage ({}):", ops.lineage.len()); + for edge in &ops.lineage { + let source = format!( + "{}.{}", + edge.source + .table + .as_ref() + .map(|t| t.name.value.as_str()) + .unwrap_or("?"), + edge.source.name.value + ); + let target = match &edge.target { + ColumnTarget::Relation(c) => format!( + "{}.{}", + c.table + .as_ref() + .map(|t| t.name.value.as_str()) + .unwrap_or("?"), + c.name.value + ), + ColumnTarget::QueryOutput { name, position } => format!( + "<output #{} {}>", + position, + name.as_ref().map(|n| n.value.as_str()).unwrap_or("anon") + ), + }; + println!(" {} -> {} ({:?})", source, target, edge.kind); + } + + // Bucket lineage by kind: is the value forwarded unchanged, or + // derived? (`direct copy` vs `transformed`). + let mut passthrough = 0usize; + let mut transformation = 0usize; + for edge in &ops.lineage { + match edge.kind { + ColumnLineageKind::Passthrough => passthrough += 1, + ColumnLineageKind::Transformation => transformation += 1, + } + } + println!( + "\nlineage kinds — Passthrough={}, Transformation={}", + passthrough, transformation + ); +} diff --git a/sql-insight/examples/table_operations.rs b/sql-insight/examples/table_operations.rs new file mode 100644 index 0000000..6b96c5c --- /dev/null +++ b/sql-insight/examples/table_operations.rs @@ -0,0 +1,56 @@ +//! 
Table-level operation extraction. +//! +//! Run with: +//! +//! ```bash +//! cargo run --example table_operations -p sql-insight +//! ``` +//! +//! Shows how a single call yields the statement kind plus the +//! `reads` / `writes` / `lineage` surfaces for each parsed statement. + +use sql_insight::sqlparser::dialect::GenericDialect; +use sql_insight::{extract_table_operations, StatementKind}; + +fn main() { + let dialect = GenericDialect {}; + let sql = "\ + INSERT INTO orders (id, total) \ + SELECT order_id, amount FROM staging; \ + DELETE FROM staging WHERE processed = true;"; + + let results = extract_table_operations(&dialect, sql, None).unwrap(); + + for (i, result) in results.iter().enumerate() { + let ops = result.as_ref().expect("parse + resolve succeeded"); + println!("--- statement {} ({:?}) ---", i + 1, ops.statement_kind); + let reads: Vec<&str> = ops.reads.iter().map(|r| r.name.value.as_str()).collect(); + let writes: Vec<&str> = ops.writes.iter().map(|w| w.name.value.as_str()).collect(); + println!("reads: {:?}", reads); + println!("writes: {:?}", writes); + println!("lineage: {} edge(s)", ops.lineage.len()); + for edge in &ops.lineage { + println!(" {} -> {}", edge.source.name.value, edge.target.name.value); + } + if !ops.diagnostics.is_empty() { + println!("diagnostics: {} non-fatal item(s)", ops.diagnostics.len()); + } + } + + // Programmatic dispatch on StatementKind — count statements that + // physically write to a relation. + let writers = results + .iter() + .filter_map(|r| r.as_ref().ok()) + .filter(|ops| { + matches!( + ops.statement_kind, + StatementKind::Insert + | StatementKind::Update + | StatementKind::Delete + | StatementKind::Merge + ) + }) + .count(); + println!("\n{} write statement(s) total", writers); +} diff --git a/sql-insight/examples/with_catalog.rs b/sql-insight/examples/with_catalog.rs new file mode 100644 index 0000000..2989f35 --- /dev/null +++ b/sql-insight/examples/with_catalog.rs @@ -0,0 +1,133 @@ +//! 
Operation extraction with a `Catalog`. +//! +//! Run with: +//! +//! ```bash +//! cargo run --example with_catalog -p sql-insight +//! ``` +//! +//! Shows how supplying a catalog changes resolver behaviour: +//! +//! 1. INSERT without an explicit column list pairs source projections +//! with the target table's catalog-supplied columns. +//! 2. `AmbiguousColumn` fires when two `Known` schemas both confirm an +//! unqualified column; it stays silent without a catalog. +//! 3. `UnresolvedColumn` fires when the column is not found in any +//! in-scope `Known` schema; the same silence rule applies without +//! a catalog. + +use sql_insight::sqlparser::dialect::GenericDialect; +use sql_insight::{ + extract_column_operations, Catalog, ColumnLevelDiagnosticKind, ColumnSchema, ColumnTarget, + TableReference, +}; +use std::collections::HashMap; + +#[derive(Debug, Default)] +struct InMemoryCatalog { + tables: HashMap<String, Vec<String>>, +} + +impl InMemoryCatalog { + fn with(mut self, name: &str, columns: &[&str]) -> Self { + self.tables.insert( + name.to_string(), + columns.iter().map(|c| c.to_string()).collect(), + ); + self + } +} + +impl Catalog for InMemoryCatalog { + fn columns(&self, table: &TableReference) -> Option<Vec<ColumnSchema>> { + self.tables.get(table.name.value.as_str()).map(|cols| { + cols.iter() + .map(|c| ColumnSchema { name: c.clone() }) + .collect() + }) + } +} + +fn main() { + let dialect = GenericDialect {}; + let catalog = InMemoryCatalog::default() + .with("orders", &["id", "total"]) + .with("staging", &["order_id", "amount"]) + .with("t1", &["a"]) + .with("t2", &["a"]); + + // 1) INSERT without explicit columns — the catalog supplies the + // target column list so source projections pair positionally. + { + let sql = "INSERT INTO orders SELECT order_id, amount FROM staging"; + let results = extract_column_operations(&dialect, sql, Some(&catalog)).unwrap(); + let ops = results[0].as_ref().unwrap(); + println!("--- 1. 
INSERT without explicit column list ---"); + for edge in &ops.lineage { + if let ColumnTarget::Relation(target) = &edge.target { + println!( + " {} -> orders.{} ({:?})", + edge.source.name.value, target.name.value, edge.kind + ); + } + } + } + + // 2) Ambiguous column — both `t1` and `t2` declare `a` via the + // catalog, so `SELECT a FROM t1 JOIN t2 ...` is genuinely + // ambiguous and the diagnostic fires. + { + let sql = "SELECT a FROM t1 JOIN t2 ON t1.a = t2.a"; + let with = extract_column_operations(&dialect, sql, Some(&catalog)).unwrap(); + let without = extract_column_operations(&dialect, sql, None).unwrap(); + let with_count = count_kind( + &with[0].as_ref().unwrap().diagnostics, + ColumnLevelDiagnosticKind::AmbiguousColumn, + ); + let without_count = count_kind( + &without[0].as_ref().unwrap().diagnostics, + ColumnLevelDiagnosticKind::AmbiguousColumn, + ); + println!( + "\n--- 2. ambiguous column: with catalog={}, without={} ---", + with_count, without_count + ); + for diag in &with[0].as_ref().unwrap().diagnostics { + if matches!(diag.kind, ColumnLevelDiagnosticKind::AmbiguousColumn) { + println!(" {}", diag.message); + } + } + } + + // 3) Unresolved column — `t1` catalog says columns are [a]; `z` + // does not exist in any in-scope Known schema. + { + let sql = "SELECT z FROM t1"; + let with = extract_column_operations(&dialect, sql, Some(&catalog)).unwrap(); + let without = extract_column_operations(&dialect, sql, None).unwrap(); + let with_count = count_kind( + &with[0].as_ref().unwrap().diagnostics, + ColumnLevelDiagnosticKind::UnresolvedColumn, + ); + let without_count = count_kind( + &without[0].as_ref().unwrap().diagnostics, + ColumnLevelDiagnosticKind::UnresolvedColumn, + ); + println!( + "\n--- 3. 
unresolved column: with catalog={}, without={} ---", + with_count, without_count + ); + for diag in &with[0].as_ref().unwrap().diagnostics { + if matches!(diag.kind, ColumnLevelDiagnosticKind::UnresolvedColumn) { + println!(" {}", diag.message); + } + } + } +} + +fn count_kind( + diagnostics: &[sql_insight::ColumnLevelDiagnostic], + kind: ColumnLevelDiagnosticKind, +) -> usize { + diagnostics.iter().filter(|d| d.kind == kind).count() +} diff --git a/sql-insight/src/catalog.rs b/sql-insight/src/catalog.rs new file mode 100644 index 0000000..c9392f8 --- /dev/null +++ b/sql-insight/src/catalog.rs @@ -0,0 +1,68 @@ +//! Optional schema provider plugged into the resolver. +//! +//! The resolver uses [`Catalog`] purely as an *enrichment* input: structural +//! resolution (CTE / derived table schemas, FROM alias bindings) works +//! catalog-free, and a catalog only fills in the columns of tables +//! that the resolver could not derive from the SQL alone. When no catalog is +//! provided, those holes stay `RelationSchema::Unknown` and surface as diagnostics +//! once consumers (e.g. column-level operations) start reading them. +//! +//! The catalog is treated as **open-world**: a table it returns no columns +//! for is taken as *schema unknown*, not *nonexistent*. A misspelled or +//! unknown table name is therefore never flagged — it surfaces as an +//! ordinary read / write carrying an unknown schema. Strictness is +//! column-level and local: `UnresolvedColumn` / `AmbiguousColumn` only fire +//! where a known schema is in scope. (Treating absence as nonexistence +//! would require promising the catalog is exhaustive, which most providers +//! cannot, so it is not the default.) +//! +//! Implementations typically wrap an `information_schema` query, an ORM +//! model registry, or a static map produced from `CREATE TABLE` statements. + +use std::fmt; + +use crate::reference::TableReference; + +/// Provides the column list of a table. 
+/// +/// Implementations return `None` when the table is unknown to the catalog; +/// the resolver treats this the same as "no catalog" for that table and may +/// emit a diagnostic instead of failing the whole resolution. +/// +/// The trait is object-safe so it can be passed as `&dyn Catalog`. `Debug` +/// is a supertrait so that resolver state containing `&dyn Catalog` can +/// derive `Debug` — implementations are expected to `#[derive(Debug)]` or +/// provide a manual implementation. +pub trait Catalog: fmt::Debug { + /// Resolve a table to its column list. The `table` argument may + /// carry an alias, but implementations should treat the catalog/schema/ + /// name triplet as the identity — the alias is callsite-only metadata. + /// + /// Identifier case-folding *for this table lookup* is the + /// implementation's responsibility: the resolver passes the table name + /// as written in the SQL and does not normalize it, so an + /// implementation wanting case-insensitive lookup (most dialects) must + /// fold both its stored keys and the incoming `table` name. + /// + /// That is the only matching the implementation governs. The returned + /// column names are then matched against the SQL's column references + /// by the resolver's own fixed normalization rule (unquoted folds to + /// lowercase, quoted is exact) — independent of this implementation + /// and of the dialect. So supplying a catalog changes *which columns + /// exist*, never *how a column name compares*. + fn columns(&self, table: &TableReference) -> Option<Vec<ColumnSchema>>; +} + +/// A column entry returned by a [`Catalog`]. Intentionally minimal: starts +/// with `name` only and grows along the project roadmap (see the resolver +/// memory note). Type/nullability/comment fields are deliberately deferred +/// until a downstream consumer needs them. 
+/// +/// `name` is a plain `String`: a catalog provides column identities, and +/// matching against SQL refs is case-insensitive by default (quoting / +/// case-sensitivity is not modelled per-column — see `BindingKey`), so +/// there is no need to carry `sqlparser`'s `Ident` (quote style / span). +#[derive(Clone, Debug, PartialEq, Eq)] +pub struct ColumnSchema { + pub name: String, +} diff --git a/sql-insight/src/diagnostic.rs b/sql-insight/src/diagnostic.rs new file mode 100644 index 0000000..af7c837 --- /dev/null +++ b/sql-insight/src/diagnostic.rs @@ -0,0 +1,115 @@ +//! Diagnostics reported during SQL inspection. +//! +//! Diagnostics are split by extraction granularity: +//! [`TableLevelDiagnostic`] for the table-level surfaces +//! (`extract_tables` / `extract_table_operations` / `extract_crud_tables`) +//! and [`ColumnLevelDiagnostic`] for `extract_column_operations`. The split +//! is by *type* so a table-level result cannot even represent a column-only +//! condition — e.g. a suppressed wildcard, which leaves column lineage +//! incomplete but doesn't affect table-level completeness at all. + +use sqlparser::tokenizer::Span; + +/// A non-fatal diagnostic from table-level extraction. +/// +/// Carried by the table-level surfaces. `message` is human-readable and, +/// when a [`span`](Self::span) is available, also embeds the location for +/// log-line display. +#[derive(Clone, Debug, PartialEq, Eq)] +pub struct TableLevelDiagnostic { + pub kind: TableLevelDiagnosticKind, + pub message: String, + /// Source location of the offending token, when available. `None` when + /// the originating AST node carries no span. + pub span: Option, +} + +/// Why a table-level extraction is incomplete. +/// +/// Only one condition arises at table granularity: a whole statement the +/// extractor can't process. 
Column-resolution gaps (ambiguity, unresolved +/// names) and suppressed wildcards don't apply — a table's identity comes +/// straight from the FROM clause and is unaffected by them. +#[derive(Clone, Debug, PartialEq, Eq)] +pub enum TableLevelDiagnosticKind { + /// Statement variant the resolver / extractor does not understand well + /// enough to extract operations from. `message` names the statement. + UnsupportedStatement, +} + +/// A non-fatal diagnostic from column-level extraction +/// ([`extract_column_operations`](crate::extract_column_operations)). +/// +/// Carries the same `message` / `span` shape as [`TableLevelDiagnostic`]. +#[derive(Clone, Debug, PartialEq, Eq)] +pub struct ColumnLevelDiagnostic { + pub kind: ColumnLevelDiagnosticKind, + pub message: String, + /// Source location of the offending token, when available. `None` when + /// the originating AST node carries no span (sqlparser-rs coverage is + /// patchy outside `Ident` / `Value` / tokens), or when the resolver + /// couldn't reasonably attribute the diagnostic to a single span. + pub span: Option, +} + +/// Why a column-level extraction is incomplete. Two flavours, by *which +/// side* the gap is on: +/// +/// - **Tool-side coverage gap** — sql-insight didn't fully analyze this; a +/// more capable analyzer could do more. +/// [`UnsupportedStatement`](Self::UnsupportedStatement), +/// [`WildcardSuppressed`](Self::WildcardSuppressed). +/// - **Input-side resolution gap** — the SQL (+ catalog) doesn't determine +/// it, so the reference was left `table: None`. A real engine would also +/// reject these. [`AmbiguousColumn`](Self::AmbiguousColumn), +/// [`UnresolvedColumn`](Self::UnresolvedColumn). +#[derive(Clone, Debug, PartialEq, Eq)] +pub enum ColumnLevelDiagnosticKind { + /// (tool-side) Statement variant the resolver / extractor does not + /// understand well enough to extract operations from. `message` names + /// the statement. 
+ UnsupportedStatement, + /// (tool-side) `SELECT *` / `t.*` left unexpanded — the resolver does + /// not perform wildcard expansion (see crate docs), so column lineage + /// is incomplete for projections that include a wildcard. + WildcardSuppressed, + /// (input-side) Unqualified column reference matched multiple in-scope + /// bindings whose schemas definitively contain the name. The reference + /// is recorded with `table: None`. Only emitted in catalog-aware mode + /// (i.e. when at least two `Known` schemas confirm the column); without + /// catalog enrichment the resolver suppresses this to avoid false + /// positives over `Unknown` schemas. + AmbiguousColumn, + /// (input-side) Unqualified column reference found no in-scope binding + /// that contains the name. Only emitted in catalog-aware mode (i.e. when + /// the scope has at least one `Known` schema and none of them holds the + /// column); without catalog enrichment, every `Unknown` schema could + /// contain anything and silence is the safer default. + UnresolvedColumn, +} + +impl ColumnLevelDiagnostic { + /// Project to a [`TableLevelDiagnostic`] when this diagnostic is also + /// meaningful at table granularity, else `None`. + /// + /// Only [`UnsupportedStatement`](ColumnLevelDiagnosticKind::UnsupportedStatement) + /// carries over — wildcard suppression and column-resolution gaps don't + /// affect table-level completeness. The `match` is exhaustive so a new + /// `ColumnLevelDiagnosticKind` variant forces an explicit table-level + /// decision here. 
+ pub(crate) fn to_table_level(&self) -> Option { + let kind = match self.kind { + ColumnLevelDiagnosticKind::UnsupportedStatement => { + TableLevelDiagnosticKind::UnsupportedStatement + } + ColumnLevelDiagnosticKind::WildcardSuppressed + | ColumnLevelDiagnosticKind::AmbiguousColumn + | ColumnLevelDiagnosticKind::UnresolvedColumn => return None, + }; + Some(TableLevelDiagnostic { + kind, + message: self.message.clone(), + span: self.span, + }) + } +} diff --git a/sql-insight/src/extractor.rs b/sql-insight/src/extractor.rs index 2183a4e..84da73e 100644 --- a/sql-insight/src/extractor.rs +++ b/sql-insight/src/extractor.rs @@ -1,6 +1,9 @@ +pub mod column_operation_extractor; pub mod crud_table_extractor; -pub mod helper; pub mod table_extractor; +pub mod table_operation_extractor; +pub use column_operation_extractor::*; pub use crud_table_extractor::*; pub use table_extractor::*; +pub use table_operation_extractor::*; diff --git a/sql-insight/src/extractor/column_operation_extractor.rs b/sql-insight/src/extractor/column_operation_extractor.rs new file mode 100644 index 0000000..929b687 --- /dev/null +++ b/sql-insight/src/extractor/column_operation_extractor.rs @@ -0,0 +1,4219 @@ +//! Extracts the column-level operations a SQL statement performs. +//! +//! Where [`extract_table_operations`](crate::extract_table_operations) +//! answers "what tables does this statement touch / write / lineage", this +//! module answers the same questions at column granularity. +//! +//! The output mirrors `TableOperation` — three parallel +//! surfaces (`reads`, `writes`, `lineage`) — plus a small enrichment on +//! lineage edges to distinguish passthrough projections from +//! value-changing transformations. +//! +//! **Current coverage** (column tracking is rolling in incrementally): +//! - `reads`: qualified column references decompose directly to +//! `TableReference + name`; unqualified ones are resolved against +//! the scope chain at walk time. 
A unique candidate binding wins; +//! 0 or 2+ candidates leave `table: None` (the column name still +//! surfaces). References whose walk-time owning binding was a CTE, +//! derived table, or table function (synthetic intermediates, not +//! real storage) are dropped from reads — only references to real +//! tables or unresolved names surface. `reads` is a plain +//! occurrence list of `ColumnReference`s in walk order: a column +//! referenced more than once appears more than once, with no +//! syntactic clause tag. (Whether a reference contributes a value +//! or merely influences the result — e.g. a `WHERE` predicate — is +//! recovered structurally: value contributors are `lineage` sources, +//! filter-only columns are in `reads` but not `lineage`.) +//! - `writes`: INSERT target columns (explicit list when given; +//! when omitted and the catalog provides the target's schema, +//! the columns the resolver paired with source projections via +//! the catalog), UPDATE SET targets scoped to the UPDATE table, +//! CTAS / CREATE VIEW / ALTER VIEW target columns (explicit +//! column list when provided, else the names the resolver derived +//! from the source projection), and MERGE WHEN-clause writes +//! (UPDATE SET targets and INSERT column lists, with the same +//! catalog fallback for column-list-less INSERT). +//! - `lineage`: per-projection-item edges for SELECT (target = +//! `QueryOutput { name, position }`), positionally paired +//! `source-column → target-column` edges for INSERT (explicit +//! column list, or — when the catalog provides the target's +//! schema — the catalog columns; one ProjectionGroup per UNION +//! branch, each paired against the same target columns), and +//! per-assignment edges for +//! UPDATE SET. Sources that reference CTEs or derived tables are +//! composed end-to-end — references substitute through the +//! intermediate's body projections recursively, so a SELECT through +//! 
a chain of CTEs surfaces lineage whose sources are the underlying +//! base tables. Each edge is tagged with a `ColumnLineageKind`: +//! `Passthrough` (the value is forwarded unchanged — a bare column +//! ref, rename included) or `Transformation` (any expression that +//! changes the value: arithmetic, function calls, aggregates, +//! window functions, CASE, casts, …). Composition yields +//! `Transformation` whenever any step in a CTE / derived chain is a +//! transformation. CTAS / CREATE +//! VIEW / ALTER VIEW emit `Relation`-target lineage from source +//! projections to the created relation's columns. MERGE emits +//! per-clause `Relation`-target lineage for WHEN MATCHED UPDATE +//! (per assignment) and +//! WHEN NOT MATCHED INSERT VALUES (positional pair with the INSERT +//! column list); DELETE actions emit nothing. Column-list-less +//! INSERT SELECT is deferred. +//! +//! **Strictness scales with the catalog.** Without a catalog, Table +//! bindings have `Unknown` schemas and unqualified refs to a +//! single-table scope resolve unconditionally (best-effort, matches +//! the implicit promise of `catalog: None`). With a catalog, Table +//! schemas come back `Known(cols)` and unqualified refs only resolve +//! when the candidate's schema actually lists the column — column +//! typos that would otherwise silently resolve become unresolved. + +use crate::catalog::Catalog; +use crate::diagnostic::{ColumnLevelDiagnostic, ColumnLevelDiagnosticKind}; +use crate::error::Error; +use crate::extractor::table_operation_extractor::StatementKind; +use crate::reference::{ColumnReference, TableReference}; +use crate::resolver::{LineageTargetSpec, RawColumnRef, Resolution, Resolver}; +use sqlparser::ast::{ + AlterTableOperation, AssignmentTarget, Ident, OnConflictAction, OnInsert, Statement, + TableFactor, +}; +use sqlparser::dialect::Dialect; +use sqlparser::parser::Parser; + +/// Convenience function to extract column-level operations from SQL. 
+/// +/// `catalog` is forwarded to the resolver: with one, table schemas come +/// back `Known` and unqualified column refs resolve only when a +/// candidate's schema lists the column (`SELECT *` expansion is not +/// performed either way). Pass `None` for the lightest, best-effort +/// path — table schemas then stay `Unknown`. +/// +/// ## Example +/// +/// ```rust +/// use sql_insight::sqlparser::dialect::GenericDialect; +/// use sql_insight::{ +/// extract_column_operations, ColumnLineageKind, ColumnTarget, StatementKind, +/// }; +/// +/// let dialect = GenericDialect {}; +/// let result = +/// extract_column_operations(&dialect, "SELECT a FROM t1", None).unwrap(); +/// let ops = result[0].as_ref().unwrap(); +/// +/// // SELECT contributes reads + lineage but no writes. +/// assert_eq!(ops.statement_kind, StatementKind::Select); +/// assert!(ops.writes.is_empty()); +/// +/// // `t1.a` surfaces as a single read, walk-time resolved to t1. +/// assert_eq!(ops.reads.len(), 1); +/// let read = &ops.reads[0]; +/// assert_eq!(read.name.value, "a"); +/// assert_eq!(read.table.as_ref().unwrap().name.value, "t1"); +/// +/// // The projection emits one lineage edge into the SELECT's QueryOutput slot, +/// // marked Passthrough (no expression wrapping the column). +/// assert_eq!(ops.lineage.len(), 1); +/// let edge = &ops.lineage[0]; +/// assert_eq!(edge.kind, ColumnLineageKind::Passthrough); +/// match &edge.target { +/// ColumnTarget::QueryOutput { name, position } => { +/// assert_eq!(name.as_ref().unwrap().value, "a"); +/// assert_eq!(*position, 0); +/// } +/// other => panic!("expected QueryOutput, got {other:?}"), +/// } +/// ``` +pub fn extract_column_operations( + dialect: &dyn Dialect, + sql: &str, + catalog: Option<&dyn Catalog>, +) -> Result>, Error> { + ColumnOperationExtractor::extract(dialect, sql, catalog) +} + +/// Column-level operations performed by a single SQL statement. 
+/// +/// Mirrors [`TableOperation`](crate::TableOperation) +/// with the same three surfaces — `reads`, `writes`, `lineage` — at +/// column granularity. +#[derive(Debug, Clone, PartialEq, Eq)] +pub struct ColumnOperation { + pub statement_kind: StatementKind, + /// Columns read by the statement, in walk order. Occurrence-based: + /// a column referenced more than once appears more than once + /// (e.g. `SELECT a FROM t WHERE a > 0` yields `t.a` twice). A + /// consumer wanting the distinct set dedups via a `HashSet`. + pub reads: Vec, + /// Columns written by the statement, in walk order. Occurrence-based + /// like `reads`. + pub writes: Vec, + pub lineage: Vec, + pub diagnostics: Vec, +} + +/// A column-level lineage edge: data from `source` contributes to +/// `target`. Emitted for both relation-target statements (INSERT / +/// UPDATE / MERGE / CTAS / CREATE VIEW, target = `ColumnTarget::Relation`) +/// and bare SELECT (target = `ColumnTarget::QueryOutput`). +/// +/// One edge per (source, target) pair: `SELECT a + b FROM t1` emits two +/// edges, from `t1.a` and `t1.b` to the same query-output target, each +/// tagged `Transformation`. +/// +/// Statements that physically move data emit composed end-to-end lineage +/// — `INSERT INTO t1 (col) SELECT b FROM t2` emits `t2.b → t1.col` +/// directly, with no intermediate query-output entry. +#[derive(Debug, Clone, PartialEq, Eq, Hash)] +pub struct ColumnLineageEdge { + pub source: ColumnReference, + pub target: ColumnTarget, + pub kind: ColumnLineageKind, +} + +/// The target endpoint of a [`ColumnLineageEdge`]. +/// +/// `Relation` covers columns that live in a named relation — a table +/// or a view, both modelled identically as a `table`-qualified +/// `ColumnReference` — and receive a value from the statement (INSERT +/// target, UPDATE SET target, MERGE INSERT/UPDATE target, CTAS / CREATE +/// VIEW output column). 
+/// +/// `QueryOutput` covers transient columns produced by a top-level +/// SELECT projection that is not piped into a named relation. `name` +/// follows the projection: the alias if explicit, the bare column name +/// if the projection is a single column, otherwise `None`. `position` +/// is always set so anonymous outputs can be identified. +#[derive(Clone, Debug, PartialEq, Eq, Hash)] +pub enum ColumnTarget { + /// A column in a real relation receiving the inbound lineage edge — INSERT / + /// UPDATE / MERGE target columns, or columns of the new relation + /// produced by CTAS / CREATE VIEW / ALTER VIEW. + Relation(ColumnReference), + /// A transient column produced by a top-level SELECT projection + /// that is not piped into a named relation. `name` follows + /// the projection's explicit alias or inferred single-column name + /// (`None` for expressions without a clear name); `position` is + /// always set so anonymous outputs remain identifiable. + QueryOutput { + name: Option, + position: usize, + }, +} + +/// How a source column contributes to its target — the one clean, +/// exclusive distinction: is the value forwarded unchanged, or +/// derived? +/// +/// - `Passthrough` — the source value is forwarded unchanged +/// (`SELECT a FROM t1`, `INSERT INTO t1 (a) SELECT b FROM t2`). A +/// rename (`SELECT a AS b`) is still `Passthrough`; detect it by +/// comparing the source `name` to the target `name`. +/// - `Transformation` — the source feeds any expression that changes +/// the value: arithmetic, function calls, CASE branches, casts, +/// aggregates (`SUM`, `STRING_AGG`), window functions, etc. +/// +/// Finer sub-classification of `Transformation` (aggregate vs scalar, +/// cardinality, etc.) is deliberately not modelled here — it is lossy +/// for edge cases (window aggregates, value-preserving `STRING_AGG`) +/// and not load-bearing for the core dependency / impact-analysis use +/// case. 
A finer variant can be added later if a concrete consumer +/// needs it (a breaking change while the crate is pre-1.0). +#[derive(Clone, Copy, Debug, PartialEq, Eq, Hash)] +pub enum ColumnLineageKind { + /// Source value is forwarded unchanged. Composition stays + /// `Passthrough` only when every step in the chain is also + /// `Passthrough`. + Passthrough, + /// Source feeds an expression that changes the value. Composition + /// yields `Transformation` whenever any step in the chain is a + /// transformation. + Transformation, +} + +/// Extracts column-level operations from SQL. +#[derive(Default, Debug)] +pub struct ColumnOperationExtractor; + +impl ColumnOperationExtractor { + pub fn extract( + dialect: &dyn Dialect, + sql: &str, + catalog: Option<&dyn Catalog>, + ) -> Result>, Error> { + let statements = Parser::parse_sql(dialect, sql)?; + Ok(statements + .iter() + .map(|s| Self::extract_from_statement(s, catalog)) + .collect()) + } + + pub fn extract_from_statement( + statement: &Statement, + catalog: Option<&dyn Catalog>, + ) -> Result { + let kind = super::table_operation_extractor::classify_statement(statement); + let resolution = Resolver::resolve_statement(catalog, statement)?; + + // Start from resolver-level diagnostics; extractor adds its own + // only when classify_statement detects an unsupported case the + // resolver did not already report. 
+ let mut diagnostics = resolution.diagnostics.clone(); + + if matches!(kind, StatementKind::Unsupported) { + if !diagnostics + .iter() + .any(|d| matches!(d.kind, ColumnLevelDiagnosticKind::UnsupportedStatement)) + { + diagnostics.push(ColumnLevelDiagnostic { + kind: ColumnLevelDiagnosticKind::UnsupportedStatement, + message: format!( + "Unsupported statement for column operation extraction: {}", + statement + ), + span: None, + }); + } + return Ok(ColumnOperation { + statement_kind: kind, + reads: Vec::new(), + writes: Vec::new(), + lineage: Vec::new(), + diagnostics, + }); + } + + let reads = collect_reads(&resolution); + let writes = collect_writes(statement, &resolution)?; + let lineage = extract_lineage(&resolution); + + Ok(ColumnOperation { + statement_kind: kind, + reads, + writes, + lineage, + diagnostics, + }) + } +} + +/// Map the resolver's pre-built `lineage_edges` 1:1 to public +/// `ColumnLineageEdge`. Sources go through scope-chain resolution; targets +/// are already fully spec'd by the resolver. +fn extract_lineage(resolution: &Resolution) -> Vec { + resolution + .lineage_edges + .iter() + .filter_map(|edge| { + let source = resolve_raw_ref(&edge.source)?; + let target = match &edge.target { + LineageTargetSpec::QueryOutput { name, position } => ColumnTarget::QueryOutput { + name: name.clone(), + position: *position, + }, + LineageTargetSpec::Relation { table, column } => { + ColumnTarget::Relation(ColumnReference { + table: Some(table.clone()), + name: column.clone(), + }) + } + }; + Some(ColumnLineageEdge { + source, + target, + kind: edge.kind, + }) + }) + .collect() +} + +/// Build a `ColumnReference` from a resolver-captured raw ref. The +/// resolver records owning-table resolution at walk time, so this is +/// a 1:1 read of `(resolved, parts.last())`. Refs whose owning +/// binding was synthetic at walk time are dropped upstream by the +/// resolver itself before they reach the extractor — see +/// `Resolution::real_column_refs`. 
+fn resolve_raw_ref(raw: &RawColumnRef) -> Option { + let name = raw.parts.last()?.clone(); + Some(ColumnReference { + table: raw.resolved.clone(), + name, + }) +} + +fn collect_reads(resolution: &Resolution) -> Vec { + resolution + .column_refs + .iter() + .filter_map(resolve_raw_ref) + .collect() +} + +/// Build a `ColumnReference` from a `CompoundIdentifier`'s parts — +/// used by UPDATE SET target parsing where the target's qualifier +/// hasn't been resolver-walked. The last part is the column name; +/// preceding parts decode into `TableReference` by length (1 / 2 / 3). +fn column_ref_from_parts(parts: &[Ident]) -> Option { + let (col, table_parts) = match parts.split_last() { + Some((col, rest)) if !rest.is_empty() => (col.clone(), rest), + _ => return None, + }; + let table = match table_parts.len() { + 1 => TableReference { + catalog: None, + schema: None, + name: table_parts[0].clone(), + }, + 2 => TableReference { + catalog: None, + schema: Some(table_parts[0].clone()), + name: table_parts[1].clone(), + }, + 3 => TableReference { + catalog: Some(table_parts[0].clone()), + schema: Some(table_parts[1].clone()), + name: table_parts[2].clone(), + }, + _ => return None, + }; + Some(ColumnReference { + table: Some(table), + name: col, + }) +} + +/// Statement-specific write extraction. Covered: +/// - INSERT explicit column list → writes scoped to the INSERT target. +/// - UPDATE SET targets → writes scoped to the UPDATE target table +/// (qualifier is honored when the SET target is qualified, otherwise +/// the UPDATE head provides the table). +/// - CTAS / CREATE VIEW / ALTER VIEW → writes follow the created +/// relation's columns (explicit list when given, otherwise the +/// columns the resolver derived from the source projection — read +/// off the resolution's `Relation` lineage edges to that target). +/// +/// Also covered: MERGE WHEN-clause writes, ALTER TABLE column ops, INSERT on-conflict SET targets. 
+fn collect_writes( + statement: &Statement, + resolution: &Resolution, +) -> Result, Error> { + // `WITH cte AS (...) ` parses as a top-level `Statement::Query` + // wrapping a `SetExpr::{Insert|Update|Delete|Merge}` around the + // real DML statement. Unwrap that here so writes follow the inner + // verb, matching what `classify_statement` already does for kind. + if let Statement::Query(query) = statement { + use sqlparser::ast::SetExpr; + if let SetExpr::Insert(inner) + | SetExpr::Update(inner) + | SetExpr::Delete(inner) + | SetExpr::Merge(inner) = query.body.as_ref() + { + return collect_writes(inner, resolution); + } + } + let mut writes = Vec::new(); + match statement { + Statement::Insert(insert) => { + let target = TableReference::try_from(insert)?; + if !insert.columns.is_empty() { + for col in &insert.columns { + writes.push(ColumnReference { + table: Some(target.clone()), + name: col.clone(), + }); + } + } else { + // INSERT without an explicit column list — when the + // catalog provided the target schema, the resolver + // emitted Relation lineage to each paired column. Read + // those off to surface the implicit writes. + writes.extend(relation_target_writes(&target, resolution)); + } + // ON CONFLICT DO UPDATE SET / ON DUPLICATE KEY UPDATE + // assignment targets become writes too — each SET column + // is updated on conflict, same role as a standalone UPDATE + // SET target. + writes.extend(insert_on_action_writes(insert, &target)); + } + Statement::Update(update) => { + let default_table = match &update.table.relation { + TableFactor::Table { .. } => { + Some(TableReference::try_from(&update.table.relation)?) + } + _ => None, + }; + for assignment in &update.assignments { + if let Some(column) = + column_ref_from_assignment_target(&assignment.target, default_table.as_ref()) + { + writes.push(column); + } + } + } + // Only CTAS (`CREATE TABLE ... 
AS query`) writes data; plain + // `CREATE TABLE t (a INT, ...)` is pure DDL and falls through to + // the no-op arm below. + Statement::CreateTable(ct) if ct.query.is_some() => { + let target = TableReference::try_from(&ct.name)?; + let explicit: Vec = ct.columns.iter().map(|c| c.name.clone()).collect(); + writes.extend(created_writes(&target, &explicit, resolution)); + } + Statement::CreateView(cv) => { + let target = TableReference::try_from(&cv.name)?; + let explicit: Vec = cv.columns.iter().map(|c| c.name.clone()).collect(); + writes.extend(created_writes(&target, &explicit, resolution)); + } + Statement::AlterView { name, columns, .. } => { + let target = TableReference::try_from(name)?; + writes.extend(created_writes(&target, columns, resolution)); + } + Statement::AlterTable(alter) => { + let target = TableReference::try_from(&alter.name)?; + for op in &alter.operations { + for col_name in alter_table_op_target_columns(op) { + writes.push(ColumnReference { + table: Some(target.clone()), + name: col_name, + }); + } + } + } + Statement::Merge(merge) => { + use sqlparser::ast::MergeAction; + let target = match &merge.table { + TableFactor::Table { .. } => TableReference::try_from(&merge.table).ok(), + _ => None, + }; + for clause in &merge.clauses { + match &clause.action { + MergeAction::Insert(insert_expr) => { + let Some(target) = &target else { continue }; + for col_obj in &insert_expr.columns { + let Some(ident) = col_obj.0.last().and_then(|p| p.as_ident()) else { + continue; + }; + writes.push(ColumnReference { + table: Some(target.clone()), + name: ident.clone(), + }); + } + } + MergeAction::Update(update_expr) => { + for assignment in &update_expr.assignments { + if let Some(column) = column_ref_from_assignment_target( + &assignment.target, + target.as_ref(), + ) { + writes.push(column); + } + } + } + MergeAction::Delete { .. 
} => {} + } + } + } + _ => {} + } + Ok(writes) +} + +/// Writes for a CREATE-as-style target: when an explicit column list +/// is given, use it verbatim; otherwise delegate to +/// [`relation_target_writes`] to recover the columns from the +/// resolver's lineage edges. +fn created_writes( + target: &TableReference, + explicit: &[Ident], + resolution: &Resolution, +) -> Vec { + if !explicit.is_empty() { + return explicit + .iter() + .map(|c| ColumnReference { + table: Some(target.clone()), + name: c.clone(), + }) + .collect(); + } + relation_target_writes(target, resolution) +} + +/// Scan the resolution's `Relation` lineage edges for any pointing at +/// `target`, returning a deduped `ColumnReference` per unique column +/// name. Used by both CREATE-as-style writes derivation and INSERT +/// without an explicit column list (where the catalog-provided +/// schema let the resolver pair source projections positionally). +fn relation_target_writes( + target: &TableReference, + resolution: &Resolution, +) -> Vec { + let mut seen: Vec = Vec::new(); + for edge in &resolution.lineage_edges { + if let LineageTargetSpec::Relation { table, column } = &edge.target { + if table == target && !seen.iter().any(|n| n.value == column.value) { + seen.push(column.clone()); + } + } + } + seen.into_iter() + .map(|name| ColumnReference { + table: Some(target.clone()), + name, + }) + .collect() +} + +/// Extract the column names an ALTER TABLE operation writes to. +/// Schema-level changes (AddConstraint, DropConstraint, partition / +/// projection ops, RENAME TABLE, etc.) return empty — they don't +/// affect named columns. Rename / change return BOTH the old and new +/// names so the writes surface records both ends of the rename. +fn alter_table_op_target_columns(op: &AlterTableOperation) -> Vec { + match op { + AlterTableOperation::AddColumn { column_def, .. } => vec![column_def.name.clone()], + AlterTableOperation::DropColumn { column_names, .. 
} => column_names.clone(), + AlterTableOperation::RenameColumn { + old_column_name, + new_column_name, + } => vec![old_column_name.clone(), new_column_name.clone()], + AlterTableOperation::ChangeColumn { + old_name, new_name, .. + } => { + if old_name == new_name { + vec![old_name.clone()] + } else { + vec![old_name.clone(), new_name.clone()] + } + } + AlterTableOperation::ModifyColumn { col_name, .. } => vec![col_name.clone()], + AlterTableOperation::AlterColumn { column_name, .. } => vec![column_name.clone()], + _ => Vec::new(), + } +} + +/// Surface ON CONFLICT DO UPDATE SET / ON DUPLICATE KEY UPDATE +/// assignment targets as writes on the INSERT target table. +/// Returns an empty `Vec` when the INSERT carries no on-clause, or +/// when the on-clause is `DO NOTHING` (no SET targets to surface). +fn insert_on_action_writes( + insert: &sqlparser::ast::Insert, + target: &TableReference, +) -> Vec { + let assignments: &[sqlparser::ast::Assignment] = match insert.on.as_ref() { + Some(OnInsert::DuplicateKeyUpdate(a)) => a, + Some(OnInsert::OnConflict(c)) => match &c.action { + OnConflictAction::DoUpdate(do_update) => &do_update.assignments, + OnConflictAction::DoNothing => return Vec::new(), + }, + // `OnInsert` is `#[non_exhaustive]` — unknown variants + // surface no writes until we model them explicitly. + Some(_) => return Vec::new(), + None => return Vec::new(), + }; + assignments + .iter() + .filter_map(|a| column_ref_from_assignment_target(&a.target, Some(target))) + .collect() +} + +/// Resolve a SET assignment target to a `ColumnReference`. If the +/// target is qualified (`t1.a`), the qualifier wins; otherwise the +/// `default_table` (the UPDATE head) provides the table. 
+fn column_ref_from_assignment_target( + target: &AssignmentTarget, + default_table: Option<&TableReference>, +) -> Option { + let name = match target { + AssignmentTarget::ColumnName(name) => name, + AssignmentTarget::Tuple(_) => return None, + }; + let idents: Vec = name + .0 + .iter() + .map(|p| p.as_ident().cloned()) + .collect::>>()?; + match idents.len() { + 1 => Some(ColumnReference { + table: default_table.cloned(), + name: idents.into_iter().next().unwrap(), + }), + 2..=4 => column_ref_from_parts(&idents), + _ => None, + } +} + +#[cfg(test)] +mod tests { + use super::*; + use sqlparser::dialect::GenericDialect; + + fn extract(sql: &str) -> ColumnOperation { + let mut result = extract_column_operations(&GenericDialect {}, sql, None).unwrap(); + result.remove(0).unwrap() + } + + fn table(name: &str) -> TableReference { + TableReference { + catalog: None, + schema: None, + name: name.into(), + } + } + + // reads / writes are now plain `Vec` (occurrence + // based, no clause kind), so all the read/write builders return a + // `ColumnReference`. `read` and `col` are interchangeable; both are + // kept for callsite readability (`read` in reads lists, `col` as a + // lineage source / target inner). 
+ fn read(table_name: &str, col: &str) -> ColumnReference { + ColumnReference { + table: Some(table(table_name)), + name: col.into(), + } + } + + fn write(table_name: &str, col: &str) -> ColumnReference { + ColumnReference { + table: Some(table(table_name)), + name: col.into(), + } + } + + fn unresolved(col: &str) -> ColumnReference { + ColumnReference { + table: None, + name: col.into(), + } + } + + fn out(name: &str, position: usize) -> ColumnTarget { + ColumnTarget::QueryOutput { + name: Some(name.into()), + position, + } + } + + fn out_anon(position: usize) -> ColumnTarget { + ColumnTarget::QueryOutput { + name: None, + position, + } + } + + fn relation(table_name: &str, col: &str) -> ColumnTarget { + ColumnTarget::Relation(ColumnReference { + table: Some(table(table_name)), + name: col.into(), + }) + } + + fn col(table_name: &str, name: &str) -> ColumnReference { + ColumnReference { + table: Some(table(table_name)), + name: name.into(), + } + } + + fn passthrough(source: ColumnReference, target: ColumnTarget) -> ColumnLineageEdge { + ColumnLineageEdge { + source, + target, + kind: ColumnLineageKind::Passthrough, + } + } + + fn transformation(source: ColumnReference, target: ColumnTarget) -> ColumnLineageEdge { + ColumnLineageEdge { + source, + target, + kind: ColumnLineageKind::Transformation, + } + } + + /// Whole-value-ish assertion: pin down the full + /// `ColumnOperation` for `sql`. reads / writes / lineage / + /// statement_kind compare strictly; diagnostics compare by **kind + /// sequence only** so message wording and span coordinates aren't + /// baked into the expected value. + fn assert_column_ops(sql: &str, expected: ColumnOperation) { + assert_nth_column_ops(sql, 0, expected); + } + + /// Like `assert_column_ops` but for multi-statement batches — + /// targets the statement at `index`. Compose multiple calls to + /// pin down each statement in a batch independently. 
+    fn assert_nth_column_ops(sql: &str, index: usize, expected: ColumnOperation) {
+        let results = extract_column_operations(&GenericDialect {}, sql, None).unwrap();
+        // A missing statement gets a dedicated message; an errored statement
+        // panics through `unwrap`.
+        let actual = match results.into_iter().nth(index) {
+            Some(result) => result.unwrap(),
+            None => panic!("statement {index} missing in result for SQL: {sql}"),
+        };
+        assert_column_ops_inner(sql, index, actual, expected);
+    }
+
+    /// Field-by-field comparison of `actual` against `expected`.
+    /// statement_kind, reads, writes and lineage must be equal;
+    /// diagnostics are reduced to their `kind` sequence before comparing.
+    /// `sql` and `index` only feed the failure messages.
+    fn assert_column_ops_inner(
+        sql: &str,
+        index: usize,
+        actual: ColumnOperation,
+        expected: ColumnOperation,
+    ) {
+        // The pattern has no `..`, so it names every field of `ColumnOperation`.
+        let ColumnOperation {
+            statement_kind: want_kind,
+            reads: want_reads,
+            writes: want_writes,
+            lineage: want_lineage,
+            diagnostics: want_diagnostics,
+        } = expected;
+        assert_eq!(
+            actual.statement_kind, want_kind,
+            "kind for SQL: {sql} (statement {index})"
+        );
+        assert_eq!(
+            actual.reads, want_reads,
+            "reads for SQL: {sql} (statement {index})"
+        );
+        assert_eq!(
+            actual.writes, want_writes,
+            "writes for SQL: {sql} (statement {index})"
+        );
+        assert_eq!(
+            actual.lineage, want_lineage,
+            "lineage for SQL: {sql} (statement {index})"
+        );
+        let actual_kinds = actual
+            .diagnostics
+            .iter()
+            .map(|d| d.kind.clone())
+            .collect::<Vec<_>>();
+        let expected_kinds = want_diagnostics
+            .iter()
+            .map(|d| d.kind.clone())
+            .collect::<Vec<_>>();
+        assert_eq!(
+            actual_kinds, expected_kinds,
+            "diagnostic kinds for SQL: {sql} (statement {index})"
+        );
+    }
+
+    /// Placeholder `ColumnLevelDiagnostic` for `assert_column_ops.expected.diagnostics`.
+    /// Only the kind is compared; message and span are placeholders.
+    fn diag(kind: ColumnLevelDiagnosticKind) -> ColumnLevelDiagnostic {
+        ColumnLevelDiagnostic {
+            kind,
+            message: String::new(),
+            span: None,
+        }
+    }
+
+    // Which column references surface in `reads`, and what table each one
+    // is attributed to (qualified, aliased, unqualified, CTE / derived).
+    mod reads {
+        use super::*;
+
+        #[test]
+        fn qualified_select_collects_qualified_reads() {
+            assert_column_ops(
+                "SELECT t1.a, t1.b FROM t1",
+                ColumnOperation {
+                    statement_kind: StatementKind::Select,
+                    reads: vec![read("t1", "a"), read("t1", "b")],
+                    writes: vec![],
+                    lineage: vec![
+                        passthrough(col("t1", "a"), out("a", 0)),
+                        passthrough(col("t1", "b"), out("b", 1)),
+                    ],
+                    diagnostics: vec![],
+                },
+            );
+        }
+
+        #[test]
+        fn qualified_join_collects_reads_from_both_sides() {
+            // Resolver walks FROM (including JOIN ON) before the projection,
+            // so the predicate columns appear ahead of the projected ones.
+            // `reads` carries no clause tag; only the order reflects the
+            // walk. The JOIN ON columns do not appear in `lineage`.
+            assert_column_ops(
+                "SELECT t1.a, t2.b FROM t1 JOIN t2 ON t1.id = t2.id",
+                ColumnOperation {
+                    statement_kind: StatementKind::Select,
+                    reads: vec![
+                        read("t1", "id"),
+                        read("t2", "id"),
+                        read("t1", "a"),
+                        read("t2", "b"),
+                    ],
+                    writes: vec![],
+                    lineage: vec![
+                        passthrough(col("t1", "a"), out("a", 0)),
+                        passthrough(col("t2", "b"), out("b", 1)),
+                    ],
+                    diagnostics: vec![],
+                },
+            );
+        }
+
+        #[test]
+        fn qualified_ref_through_alias_resolves_to_real_table() {
+            // `u` is an alias of `t1`; the qualified ref `u.a` resolves
+            // to the alias-free real table `t1`, matching how an
+            // unqualified ref resolves. Alias is use-site decoration,
+            // not part of the column's identity.
+            assert_column_ops(
+                "SELECT u.a FROM t1 AS u",
+                ColumnOperation {
+                    statement_kind: StatementKind::Select,
+                    reads: vec![read("t1", "a")],
+                    writes: vec![],
+                    lineage: vec![passthrough(col("t1", "a"), out("a", 0))],
+                    diagnostics: vec![],
+                },
+            );
+        }
+
+        #[test]
+        fn qualified_refs_through_aliases_on_both_join_sides_resolve_to_real_tables() {
+            // Implicit aliases (`t1 a`, `t2 b`) on both join sides; every
+            // qualified ref canonicalizes to its real table. JOIN ON is
+            // walked during FROM, so the predicate reads precede the
+            // projection reads.
+            assert_column_ops(
+                "SELECT a.x, b.y FROM t1 a JOIN t2 b ON a.id = b.id",
+                ColumnOperation {
+                    statement_kind: StatementKind::Select,
+                    reads: vec![
+                        read("t1", "id"),
+                        read("t2", "id"),
+                        read("t1", "x"),
+                        read("t2", "y"),
+                    ],
+                    writes: vec![],
+                    lineage: vec![
+                        passthrough(col("t1", "x"), out("x", 0)),
+                        passthrough(col("t2", "y"), out("y", 1)),
+                    ],
+                    diagnostics: vec![],
+                },
+            );
+        }
+
+        #[test]
+        fn aliased_filter_ref_resolves_to_real_table_and_stays_out_of_lineage() {
+            // A WHERE-only column through an alias resolves to the real
+            // table for `reads`, but a filter column is not a value
+            // contributor, so it never appears in `lineage`.
+            assert_column_ops(
+                "SELECT u.a FROM t1 AS u WHERE u.b > 0",
+                ColumnOperation {
+                    statement_kind: StatementKind::Select,
+                    reads: vec![read("t1", "a"), read("t1", "b")],
+                    writes: vec![],
+                    lineage: vec![passthrough(col("t1", "a"), out("a", 0))],
+                    diagnostics: vec![],
+                },
+            );
+        }
+
+        #[test]
+        fn schema_qualified_ref_resolves_to_schema_dot_table() {
+            // `s1.t1.a` — 3-part ref: schema, table, column.
+            let table_ref = TableReference {
+                catalog: None,
+                schema: Some("s1".into()),
+                name: "t1".into(),
+            };
+            assert_column_ops(
+                "SELECT s1.t1.a FROM s1.t1",
+                ColumnOperation {
+                    statement_kind: StatementKind::Select,
+                    reads: vec![ColumnReference {
+                        table: Some(table_ref.clone()),
+                        name: "a".into(),
+                    }],
+                    writes: vec![],
+                    lineage: vec![passthrough(
+                        ColumnReference {
+                            table: Some(table_ref),
+                            name: "a".into(),
+                        },
+                        out("a", 0),
+                    )],
+                    diagnostics: vec![],
+                },
+            );
+        }
+
+        #[test]
+        fn catalog_qualified_ref_resolves_to_catalog_dot_schema_dot_table() {
+            // `c1.s1.t1.a` — 4-part ref. parts.last() is the column;
+            // the preceding 3 parts decode into TableReference's
+            // catalog / schema / name fields.
+            let table_ref = TableReference {
+                catalog: Some("c1".into()),
+                schema: Some("s1".into()),
+                name: "t1".into(),
+            };
+            assert_column_ops(
+                "SELECT c1.s1.t1.a FROM c1.s1.t1",
+                ColumnOperation {
+                    statement_kind: StatementKind::Select,
+                    reads: vec![ColumnReference {
+                        table: Some(table_ref.clone()),
+                        name: "a".into(),
+                    }],
+                    writes: vec![],
+                    lineage: vec![passthrough(
+                        ColumnReference {
+                            table: Some(table_ref),
+                            name: "a".into(),
+                        },
+                        out("a", 0),
+                    )],
+                    diagnostics: vec![],
+                },
+            );
+        }
+
+        #[test]
+        fn unqualified_ref_against_catalog_qualified_table_inherits_full_qualifier() {
+            // `SELECT a FROM c1.s1.t1` — the unqualified `a` resolves
+            // to the catalog-qualified binding, picking up the full
+            // qualifier in the ColumnReference.
+            let table_ref = TableReference {
+                catalog: Some("c1".into()),
+                schema: Some("s1".into()),
+                name: "t1".into(),
+            };
+            assert_column_ops(
+                "SELECT a FROM c1.s1.t1",
+                ColumnOperation {
+                    statement_kind: StatementKind::Select,
+                    reads: vec![ColumnReference {
+                        table: Some(table_ref.clone()),
+                        name: "a".into(),
+                    }],
+                    writes: vec![],
+                    lineage: vec![passthrough(
+                        ColumnReference {
+                            table: Some(table_ref),
+                            name: "a".into(),
+                        },
+                        out("a", 0),
+                    )],
+                    diagnostics: vec![],
+                },
+            );
+        }
+
+        #[test]
+        fn five_part_ref_overshoots_qualifier_decoder_and_is_unresolved() {
+            // sqlparser parses `extra.c1.s1.t1.a` into 5 parts. The
+            // qualifier decoder caps at 3 parts (catalog / schema /
+            // name) — anything longer is a struct-field access on a
+            // fully qualified column, which we don't model. The ref
+            // is recorded with `table: None`.
+            assert_column_ops(
+                "SELECT extra.c1.s1.t1.a FROM c1.s1.t1",
+                ColumnOperation {
+                    statement_kind: StatementKind::Select,
+                    reads: vec![unresolved("a")],
+                    writes: vec![],
+                    lineage: vec![ColumnLineageEdge {
+                        source: ColumnReference {
+                            table: None,
+                            name: "a".into(),
+                        },
+                        target: out("a", 0),
+                        kind: ColumnLineageKind::Passthrough,
+                    }],
+                    diagnostics: vec![],
+                },
+            );
+        }
+
+        #[test]
+        fn where_predicate_qualified_ref_is_a_read() {
+            assert_column_ops(
+                "SELECT t1.a FROM t1 WHERE t1.b > 0",
+                ColumnOperation {
+                    statement_kind: StatementKind::Select,
+                    reads: vec![read("t1", "a"), read("t1", "b")],
+                    writes: vec![],
+                    lineage: vec![passthrough(col("t1", "a"), out("a", 0))],
+                    diagnostics: vec![],
+                },
+            );
+        }
+
+        #[test]
+        fn unqualified_single_table_resolves_to_that_table() {
+            assert_column_ops(
+                "SELECT a, b FROM t1",
+                ColumnOperation {
+                    statement_kind: StatementKind::Select,
+                    reads: vec![read("t1", "a"), read("t1", "b")],
+                    writes: vec![],
+                    lineage: vec![
+                        passthrough(col("t1", "a"), out("a", 0)),
+                        passthrough(col("t1", "b"), out("b", 1)),
+                    ],
+                    diagnostics: vec![],
+                },
+            );
+        }
+
+        #[test]
+        fn unqualified_in_where_resolves_to_single_table() {
+            assert_column_ops(
+                "SELECT a FROM t1 WHERE b > 0",
+                ColumnOperation {
+                    statement_kind: StatementKind::Select,
+                    reads: vec![read("t1", "a"), read("t1", "b")],
+                    writes: vec![],
+                    lineage: vec![passthrough(col("t1", "a"), out("a", 0))],
+                    diagnostics: vec![],
+                },
+            );
+        }
+
+        #[test]
+        fn unqualified_with_multiple_tables_stays_unresolved() {
+            // Two `Unknown`-schema tables — without a catalog the resolver
+            // cannot tell which `a` belongs to, so the ref surfaces with
+            // `table: None`. The lineage source also stays unresolved.
+            assert_column_ops(
+                "SELECT a FROM t1 JOIN t2 ON t1.id = t2.id",
+                ColumnOperation {
+                    statement_kind: StatementKind::Select,
+                    reads: vec![read("t1", "id"), read("t2", "id"), unresolved("a")],
+                    writes: vec![],
+                    lineage: vec![ColumnLineageEdge {
+                        source: ColumnReference {
+                            table: None,
+                            name: "a".into(),
+                        },
+                        target: out("a", 0),
+                        kind: ColumnLineageKind::Passthrough,
+                    }],
+                    diagnostics: vec![],
+                },
+            );
+        }
+
+        #[test]
+        fn unqualified_uses_alias_binding_but_returns_real_table() {
+            // Alias is just a binding key; the resolver returns the
+            // alias-free TableReference of the binding's underlying table.
+            assert_column_ops(
+                "SELECT a FROM t1 AS u",
+                ColumnOperation {
+                    statement_kind: StatementKind::Select,
+                    reads: vec![read("t1", "a")],
+                    writes: vec![],
+                    lineage: vec![passthrough(col("t1", "a"), out("a", 0))],
+                    diagnostics: vec![],
+                },
+            );
+        }
+
+        #[test]
+        fn cte_ref_does_not_surface_in_reads() {
+            // The outer `id` resolves to the cte binding (a synthetic
+            // intermediate, not real storage), so it's dropped from reads.
+            // Reads surface only references with real Table owners or
+            // unresolved column names. `unknown_col` doesn't match the
+            // cte's Known schema [id], so it surfaces unresolved
+            // (table: None) AND fires an UnresolvedColumn diagnostic.
+            assert_column_ops(
+                "WITH cte AS (SELECT id FROM t1) SELECT id, unknown_col FROM cte",
+                ColumnOperation {
+                    statement_kind: StatementKind::Select,
+                    reads: vec![read("t1", "id"), unresolved("unknown_col")],
+                    writes: vec![],
+                    lineage: vec![
+                        passthrough(col("t1", "id"), out("id", 0)),
+                        ColumnLineageEdge {
+                            source: ColumnReference {
+                                table: None,
+                                name: "unknown_col".into(),
+                            },
+                            target: out("unknown_col", 1),
+                            kind: ColumnLineageKind::Passthrough,
+                        },
+                    ],
+                    diagnostics: vec![diag(ColumnLevelDiagnosticKind::UnresolvedColumn)],
+                },
+            );
+        }
+
+        #[test]
+        fn derived_table_ref_does_not_surface_in_reads() {
+            // Outer `id` resolves to derived alias `d` — synthetic, dropped.
+            // Only the inner SELECT's t1.id is a real read.
+            assert_column_ops(
+                "SELECT id FROM (SELECT id FROM t1) AS d",
+                ColumnOperation {
+                    statement_kind: StatementKind::Select,
+                    reads: vec![read("t1", "id")],
+                    writes: vec![],
+                    lineage: vec![passthrough(col("t1", "id"), out("id", 0))],
+                    diagnostics: vec![],
+                },
+            );
+        }
+
+        #[test]
+        fn unqualified_inner_scope_shadows_outer() {
+            // Inner subquery has its own t2 in scope; the unqualified `y`
+            // inside the IN-subquery resolves to t2 even though t1 is
+            // also in the outer scope. Standard SQL inner-shadows-outer.
+            // The predicate subquery emits no lineage (it feeds a filter);
+            // it still surfaces its refs in reads. The outer `*` is a
+            // suppressed wildcard, so there is no lineage at all.
+            assert_column_ops(
+                "SELECT * FROM t1 WHERE id IN (SELECT id FROM t2 WHERE y > 0)",
+                ColumnOperation {
+                    statement_kind: StatementKind::Select,
+                    reads: vec![read("t1", "id"), read("t2", "id"), read("t2", "y")],
+                    writes: vec![],
+                    lineage: vec![],
+                    diagnostics: vec![diag(ColumnLevelDiagnosticKind::WildcardSuppressed)],
+                },
+            );
+        }
+
+        #[test]
+        fn unqualified_correlated_walks_to_outer_when_inner_has_no_candidate() {
+            // Inner CTE has Known schema [zz]; `outer_col` doesn't fit it,
+            // so resolution walks to the outer scope and picks the t1
+            // (Unknown) binding. The predicate subquery emits no lineage;
+            // the outer `*` is a suppressed wildcard, so no lineage at all.
+            assert_column_ops(
+                "SELECT * FROM t1 WHERE id IN (\
+                 WITH inner_cte AS (SELECT zz FROM t1) \
+                 SELECT zz FROM inner_cte WHERE outer_col > 0)",
+                ColumnOperation {
+                    statement_kind: StatementKind::Select,
+                    reads: vec![read("t1", "id"), read("t1", "zz"), read("t1", "outer_col")],
+                    writes: vec![],
+                    lineage: vec![],
+                    diagnostics: vec![diag(ColumnLevelDiagnosticKind::WildcardSuppressed)],
+                },
+            );
+        }
+    }
+
+    // Which columns surface in `writes` for INSERT / UPDATE, and the
+    // lineage edges that pair source reads with written columns.
+    mod writes {
+        use super::*;
+
+        #[test]
+        fn insert_with_explicit_columns_writes_those_columns_on_target() {
+            assert_column_ops(
+                "INSERT INTO t1 (a, b) VALUES (1, 2)",
+                ColumnOperation {
+                    statement_kind: StatementKind::Insert,
+                    reads: vec![],
+                    writes: vec![write("t1", "a"), write("t1", "b")],
+                    lineage: vec![],
+                    diagnostics: vec![],
+                },
+            );
+        }
+
+        #[test]
+        fn insert_select_records_target_writes_and_qualified_source_reads() {
+            assert_column_ops(
+                "INSERT INTO t1 (a) SELECT t2.b FROM t2",
+                ColumnOperation {
+                    statement_kind: StatementKind::Insert,
+                    reads: vec![read("t2", "b")],
+                    writes: vec![write("t1", "a")],
+                    lineage: vec![passthrough(col("t2", "b"), relation("t1", "a"))],
+                    diagnostics: vec![],
+                },
+            );
+        }
+
+        #[test]
+        fn insert_without_explicit_columns_yields_no_writes() {
+            // Without an explicit column list AND without a catalog, the
+            // resolver can't pair source projections to target columns;
+            // writes / lineage stay empty.
+            assert_column_ops(
+                "INSERT INTO t1 SELECT t2.b FROM t2",
+                ColumnOperation {
+                    statement_kind: StatementKind::Insert,
+                    reads: vec![read("t2", "b")],
+                    writes: vec![],
+                    lineage: vec![],
+                    diagnostics: vec![],
+                },
+            );
+        }
+
+        #[test]
+        fn update_set_targets_become_writes_on_update_table() {
+            assert_column_ops(
+                "UPDATE t1 SET a = 1",
+                ColumnOperation {
+                    statement_kind: StatementKind::Update,
+                    reads: vec![],
+                    writes: vec![write("t1", "a")],
+                    lineage: vec![],
+                    diagnostics: vec![],
+                },
+            );
+        }
+
+        #[test]
+        fn update_set_qualified_target_keeps_qualifier() {
+            assert_column_ops(
+                "UPDATE t1 SET t1.a = 1",
+                ColumnOperation {
+                    statement_kind: StatementKind::Update,
+                    reads: vec![],
+                    writes: vec![write("t1", "a")],
+                    lineage: vec![],
+                    diagnostics: vec![],
+                },
+            );
+        }
+
+        #[test]
+        fn update_set_rhs_qualified_ref_is_a_read() {
+            // The SET RHS ref (`t2.b`) is value-producing, so it is the
+            // lineage source for `t1.a`. The WHERE refs surface in `reads`
+            // only. `reads` itself carries no clause tag.
+            assert_column_ops(
+                "UPDATE t1 SET a = t2.b FROM t2 WHERE t1.id = t2.id",
+                ColumnOperation {
+                    statement_kind: StatementKind::Update,
+                    reads: vec![read("t2", "b"), read("t1", "id"), read("t2", "id")],
+                    writes: vec![write("t1", "a")],
+                    lineage: vec![passthrough(col("t2", "b"), relation("t1", "a"))],
+                    diagnostics: vec![],
+                },
+            );
+        }
+    }
+
+    // DELETE: predicate columns are reads; no column writes or lineage
+    // are expected.
+    mod delete {
+        use super::*;
+
+        #[test]
+        fn delete_qualified_predicate_is_a_read() {
+            assert_column_ops(
+                "DELETE FROM t1 WHERE t1.id = 5",
+                ColumnOperation {
+                    statement_kind: StatementKind::Delete,
+                    reads: vec![read("t1", "id")],
+                    writes: vec![],
+                    lineage: vec![],
+                    diagnostics: vec![],
+                },
+            );
+        }
+    }
+
+    // Columns from every clause (projection / WHERE / GROUP BY /
+    // ORDER BY / OVER / CASE / HAVING / …) surface in `reads` as plain
+    // occurrence entries — `reads` no longer tags a syntactic clause.
+    // These tests pin down WHICH refs surface (occurrence-based, dups
+    // kept) and the lineage they produce.
+ mod reads_by_clause { + use super::*; + + #[test] + fn same_column_in_projection_and_where_is_two_reads() { + // The two textual `a` references each get their own `reads` + // entry (occurrence-based — duplicates are kept). + assert_column_ops( + "SELECT a FROM t1 WHERE a > 0", + ColumnOperation { + statement_kind: StatementKind::Select, + reads: vec![read("t1", "a"), read("t1", "a")], + writes: vec![], + lineage: vec![passthrough(col("t1", "a"), out("a", 0))], + diagnostics: vec![], + }, + ); + } + + #[test] + fn predicate_subquery_surfaces_reads_but_no_lineage() { + // The IN-subquery feeds a filter, so it emits NO lineage + // (Option B: nested subqueries resolve raw, no intermediate + // QueryOutput edge). Its refs (s.id, s.flag) still surface + // in reads. Only the outer projection `a` contributes a lineage edge. + assert_column_ops( + "SELECT a FROM t WHERE id IN (SELECT id FROM s WHERE flag = 1)", + ColumnOperation { + statement_kind: StatementKind::Select, + reads: vec![ + read("t", "a"), + read("t", "id"), + read("s", "id"), + read("s", "flag"), + ], + writes: vec![], + lineage: vec![passthrough(col("t", "a"), out("a", 0))], + diagnostics: vec![], + }, + ); + } + + #[test] + fn scalar_subquery_in_projection_feeds_only_outer() { + // `SELECT a, (SELECT max(x) FROM s) AS m FROM t`: + // - the scalar subquery does NOT emit its own QueryOutput + // edge (Option B: raw resolve). Its source `s.x` is + // captured by the enclosing projection item, which emits + // the single meaningful edge `s.x → out("m", 1)`, + // Transformation (the item is a subquery expression). + // - `a` is a plain passthrough at position 0. 
+ assert_column_ops( + "SELECT a, (SELECT max(x) FROM s) AS m FROM t", + ColumnOperation { + statement_kind: StatementKind::Select, + reads: vec![read("t", "a"), read("s", "x")], + writes: vec![], + lineage: vec![ + passthrough(col("t", "a"), out("a", 0)), + transformation(col("s", "x"), out("m", 1)), + ], + diagnostics: vec![], + }, + ); + } + + #[test] + fn is_null_predicate_ref_surfaces_as_read() { + // `WHERE x IS NULL` — x surfaces in reads like any other + // WHERE ref; it is not a lineage source (predicate-only). + assert_column_ops( + "SELECT a FROM t1 WHERE b IS NULL", + ColumnOperation { + statement_kind: StatementKind::Select, + reads: vec![read("t1", "a"), read("t1", "b")], + writes: vec![], + lineage: vec![passthrough(col("t1", "a"), out("a", 0))], + diagnostics: vec![], + }, + ); + } + + #[test] + fn is_not_null_predicate_ref_surfaces_as_read() { + assert_column_ops( + "SELECT a FROM t1 WHERE b IS NOT NULL", + ColumnOperation { + statement_kind: StatementKind::Select, + reads: vec![read("t1", "a"), read("t1", "b")], + writes: vec![], + lineage: vec![passthrough(col("t1", "a"), out("a", 0))], + diagnostics: vec![], + }, + ); + } + + #[test] + fn group_by_ref_surfaces_as_read() { + assert_column_ops( + "SELECT a, COUNT(*) FROM t1 GROUP BY a", + ColumnOperation { + statement_kind: StatementKind::Select, + reads: vec![read("t1", "a"), read("t1", "a")], + writes: vec![], + lineage: vec![passthrough(col("t1", "a"), out("a", 0))], + diagnostics: vec![], + }, + ); + } + + #[test] + fn order_by_ref_surfaces_as_read() { + assert_column_ops( + "SELECT a FROM t1 ORDER BY b", + ColumnOperation { + statement_kind: StatementKind::Select, + reads: vec![read("t1", "a"), read("t1", "b")], + writes: vec![], + lineage: vec![passthrough(col("t1", "a"), out("a", 0))], + diagnostics: vec![], + }, + ); + } + + #[test] + fn group_by_and_having_refs_both_surface() { + // `a` (projection + GROUP BY) and `b` (HAVING) all surface. 
+ // Walk order: projection → HAVING → GROUP BY (the visitor + // hits HAVING before GROUP BY), so the read order reflects + // that, not the textual SQL order. + assert_column_ops( + "SELECT a FROM t1 GROUP BY a HAVING SUM(b) > 0", + ColumnOperation { + statement_kind: StatementKind::Select, + reads: vec![read("t1", "a"), read("t1", "b"), read("t1", "a")], + writes: vec![], + lineage: vec![passthrough(col("t1", "a"), out("a", 0))], + diagnostics: vec![], + }, + ); + } + + #[test] + fn group_by_rollup_modifier_refs_surface() { + assert_column_ops( + "SELECT a, b FROM t1 GROUP BY ROLLUP(a, b)", + ColumnOperation { + statement_kind: StatementKind::Select, + reads: vec![ + read("t1", "a"), + read("t1", "b"), + read("t1", "a"), + read("t1", "b"), + ], + writes: vec![], + lineage: vec![ + passthrough(col("t1", "a"), out("a", 0)), + passthrough(col("t1", "b"), out("b", 1)), + ], + diagnostics: vec![], + }, + ); + } + + #[test] + fn group_by_cube_modifier_refs_surface() { + assert_column_ops( + "SELECT a, b FROM t1 GROUP BY CUBE(a, b)", + ColumnOperation { + statement_kind: StatementKind::Select, + reads: vec![ + read("t1", "a"), + read("t1", "b"), + read("t1", "a"), + read("t1", "b"), + ], + writes: vec![], + lineage: vec![ + passthrough(col("t1", "a"), out("a", 0)), + passthrough(col("t1", "b"), out("b", 1)), + ], + diagnostics: vec![], + }, + ); + } + + #[test] + fn group_by_grouping_sets_walks_each_set_member() { + // GROUPING SETS ((a, b), (a), ()) — every named column + // inside any set surfaces as a read. The empty set + // contributes nothing. 
+ assert_column_ops( + "SELECT a, b FROM t1 GROUP BY GROUPING SETS ((a, b), (a), ())", + ColumnOperation { + statement_kind: StatementKind::Select, + reads: vec![ + read("t1", "a"), + read("t1", "b"), + read("t1", "a"), + read("t1", "b"), + read("t1", "a"), + ], + writes: vec![], + lineage: vec![ + passthrough(col("t1", "a"), out("a", 0)), + passthrough(col("t1", "b"), out("b", 1)), + ], + diagnostics: vec![], + }, + ); + } + + #[test] + fn group_by_mixed_plain_and_rollup_collects_both() { + // `GROUP BY a, ROLLUP(b, c)` — `a` is a plain GROUP BY ref; + // `b`, `c` are inside the ROLLUP expression. All three + // surface as reads. + assert_column_ops( + "SELECT a, b, c FROM t1 GROUP BY a, ROLLUP(b, c)", + ColumnOperation { + statement_kind: StatementKind::Select, + reads: vec![ + read("t1", "a"), + read("t1", "b"), + read("t1", "c"), + read("t1", "a"), + read("t1", "b"), + read("t1", "c"), + ], + writes: vec![], + lineage: vec![ + passthrough(col("t1", "a"), out("a", 0)), + passthrough(col("t1", "b"), out("b", 1)), + passthrough(col("t1", "c"), out("c", 2)), + ], + diagnostics: vec![], + }, + ); + } + + #[test] + fn subquery_in_group_by_surfaces_reads_but_no_inner_lineage() { + // GROUP BY (SELECT z FROM s) — the subquery's `z` surfaces in + // reads, but the subquery emits no lineage (Option B: raw + // resolve, no intermediate QueryOutput). Only the outer + // projection `a` contributes a lineage edge. + assert_column_ops( + "SELECT a FROM t GROUP BY (SELECT z FROM s)", + ColumnOperation { + statement_kind: StatementKind::Select, + reads: vec![read("t", "a"), read("s", "z")], + writes: vec![], + lineage: vec![passthrough(col("t", "a"), out("a", 0))], + diagnostics: vec![], + }, + ); + } + + #[test] + fn case_in_projection_refs_surface_and_transform() { + // Condition (`a`), THEN (`b`), and ELSE (`c`) all surface as + // reads and feed into the CASE output as Transformation. 
+ assert_column_ops( + "SELECT CASE WHEN a > 0 THEN b ELSE c END FROM t1", + ColumnOperation { + statement_kind: StatementKind::Select, + reads: vec![read("t1", "a"), read("t1", "b"), read("t1", "c")], + writes: vec![], + lineage: vec![ + transformation(col("t1", "a"), out_anon(0)), + transformation(col("t1", "b"), out_anon(0)), + transformation(col("t1", "c"), out_anon(0)), + ], + diagnostics: vec![], + }, + ); + } + + #[test] + fn case_in_where_refs_surface_as_reads() { + // The CASE sits in WHERE: its condition (`x`) and results + // (`y`, `z`) surface as reads (not lineage sources — the CASE + // feeds a predicate). `b` is the outer projection. + assert_column_ops( + "SELECT b FROM t WHERE CASE WHEN x > 0 THEN y ELSE z END = 1", + ColumnOperation { + statement_kind: StatementKind::Select, + reads: vec![ + read("t", "b"), + read("t", "x"), + read("t", "y"), + read("t", "z"), + ], + writes: vec![], + lineage: vec![passthrough(col("t", "b"), out("b", 0))], + diagnostics: vec![], + }, + ); + } + + #[test] + fn scalar_subquery_in_case_condition_composes_to_outer_only() { + // A scalar subquery in a CASE condition emits no lineage of its + // own (Option B: raw resolve). The outer CASE projection + // item captures the subquery's refs (`s.x` from its + // projection, `s.y` from its WHERE) as its source refs, so + // both feed into the outer anonymous output as + // Transformation. Refs still surface in reads. 
+ assert_column_ops( + "SELECT CASE WHEN (SELECT x FROM s WHERE y > 0) IS NULL THEN 1 END FROM t", + ColumnOperation { + statement_kind: StatementKind::Select, + reads: vec![read("s", "x"), read("s", "y")], + writes: vec![], + lineage: vec![ + transformation(col("s", "x"), out_anon(0)), + transformation(col("s", "y"), out_anon(0)), + ], + diagnostics: vec![], + }, + ); + } + + #[test] + fn simple_case_operand_and_results_surface() { + // `CASE x WHEN 1 THEN a WHEN 2 THEN b END` — the operand + // `x` and the results `a` / `b` all surface as reads and + // feed into the CASE output as Transformation. + assert_column_ops( + "SELECT CASE x WHEN 1 THEN a WHEN 2 THEN b END FROM t1", + ColumnOperation { + statement_kind: StatementKind::Select, + reads: vec![read("t1", "x"), read("t1", "a"), read("t1", "b")], + writes: vec![], + lineage: vec![ + transformation(col("t1", "x"), out_anon(0)), + transformation(col("t1", "a"), out_anon(0)), + transformation(col("t1", "b"), out_anon(0)), + ], + diagnostics: vec![], + }, + ); + } + + #[test] + fn simple_case_with_column_when_pattern_all_surface() { + // `CASE x WHEN y THEN a ELSE b END` — operand `x`, + // WHEN-pattern `y`, and results `a` / `b` all surface as + // reads and feed into the CASE output as Transformation. 
+ assert_column_ops( + "SELECT CASE x WHEN y THEN a ELSE b END FROM t1", + ColumnOperation { + statement_kind: StatementKind::Select, + reads: vec![ + read("t1", "x"), + read("t1", "y"), + read("t1", "a"), + read("t1", "b"), + ], + writes: vec![], + lineage: vec![ + transformation(col("t1", "x"), out_anon(0)), + transformation(col("t1", "y"), out_anon(0)), + transformation(col("t1", "a"), out_anon(0)), + transformation(col("t1", "b"), out_anon(0)), + ], + diagnostics: vec![], + }, + ); + } + + #[test] + fn window_partition_by_refs_surface_and_transform() { + // OVER (PARTITION BY p) — both the aggregate arg `x` and + // the partition key `p` surface as reads, and both feed + // into the window output as Transformation (the whole + // SUM(...) OVER (...) expression is value-changing). + assert_column_ops( + "SELECT SUM(x) OVER (PARTITION BY p) FROM t1", + ColumnOperation { + statement_kind: StatementKind::Select, + reads: vec![read("t1", "x"), read("t1", "p")], + writes: vec![], + lineage: vec![ + transformation(col("t1", "x"), out_anon(0)), + transformation(col("t1", "p"), out_anon(0)), + ], + diagnostics: vec![], + }, + ); + } + + #[test] + fn window_order_by_refs_surface_and_transform() { + assert_column_ops( + "SELECT SUM(x) OVER (ORDER BY o) FROM t1", + ColumnOperation { + statement_kind: StatementKind::Select, + reads: vec![read("t1", "x"), read("t1", "o")], + writes: vec![], + lineage: vec![ + transformation(col("t1", "x"), out_anon(0)), + transformation(col("t1", "o"), out_anon(0)), + ], + diagnostics: vec![], + }, + ); + } + + #[test] + fn window_partition_and_order_refs_all_surface_and_transform() { + assert_column_ops( + "SELECT SUM(x) OVER (PARTITION BY p ORDER BY o) FROM t1", + ColumnOperation { + statement_kind: StatementKind::Select, + reads: vec![read("t1", "x"), read("t1", "p"), read("t1", "o")], + writes: vec![], + lineage: vec![ + transformation(col("t1", "x"), out_anon(0)), + transformation(col("t1", "p"), out_anon(0)), + transformation(col("t1", 
"o"), out_anon(0)), + ], + diagnostics: vec![], + }, + ); + } + + #[test] + fn window_with_literal_frame_bounds_does_not_add_refs() { + // Frame bounds with literal integers (`3 PRECEDING`, + // `CURRENT ROW`) walk via visit_expr but produce no + // column refs — same shape as the no-frame version. + assert_column_ops( + "SELECT SUM(x) OVER (PARTITION BY p ORDER BY o \ + ROWS BETWEEN 3 PRECEDING AND CURRENT ROW) FROM t1", + ColumnOperation { + statement_kind: StatementKind::Select, + reads: vec![read("t1", "x"), read("t1", "p"), read("t1", "o")], + writes: vec![], + lineage: vec![ + transformation(col("t1", "x"), out_anon(0)), + transformation(col("t1", "p"), out_anon(0)), + transformation(col("t1", "o"), out_anon(0)), + ], + diagnostics: vec![], + }, + ); + } + + #[test] + fn window_with_unbounded_frame_bounds_does_not_add_refs() { + // UNBOUNDED PRECEDING / UNBOUNDED FOLLOWING are bound + // variants without an associated expr — visit_window_frame_bound + // returns Ok without walking anything. 
+ assert_column_ops( + "SELECT SUM(x) OVER (ORDER BY o \ + ROWS BETWEEN UNBOUNDED PRECEDING AND UNBOUNDED FOLLOWING) \ + FROM t1", + ColumnOperation { + statement_kind: StatementKind::Select, + reads: vec![read("t1", "x"), read("t1", "o")], + writes: vec![], + lineage: vec![ + transformation(col("t1", "x"), out_anon(0)), + transformation(col("t1", "o"), out_anon(0)), + ], + diagnostics: vec![], + }, + ); + } + + #[test] + fn merge_on_clause_refs_surface_as_reads_not_lineage() { + assert_column_ops( + "MERGE INTO t USING s ON t.id = s.id WHEN MATCHED THEN UPDATE SET t.a = s.a", + ColumnOperation { + statement_kind: StatementKind::Merge, + reads: vec![read("t", "id"), read("s", "id"), read("s", "a")], + writes: vec![write("t", "a")], + lineage: vec![passthrough(col("s", "a"), relation("t", "a"))], + diagnostics: vec![], + }, + ); + } + + #[test] + fn create_table_definitions_are_not_writes() { + assert_column_ops( + "CREATE TABLE t1 (a INT, b INT)", + ColumnOperation { + statement_kind: StatementKind::CreateTable, + reads: vec![], + writes: vec![], + lineage: vec![], + diagnostics: vec![], + }, + ); + } + } + + mod diagnostics { + use super::*; + + #[test] + fn unsupported_statement_reports_diagnostic() { + assert_column_ops( + "CREATE INDEX idx ON t1 (a)", + ColumnOperation { + statement_kind: StatementKind::Unsupported, + reads: vec![], + writes: vec![], + lineage: vec![], + diagnostics: vec![diag(ColumnLevelDiagnosticKind::UnsupportedStatement)], + }, + ); + } + + #[test] + fn wildcard_in_projection_reports_diagnostic() { + // Whole-value pin-down on the structural shape; assert_column_ops + // compares diagnostics by kind only. The message text and span + // coordinates are verified separately below since this test's + // *purpose* is to confirm both are populated. 
+ let ops = extract("SELECT * FROM t1"); + assert_column_ops( + "SELECT * FROM t1", + ColumnOperation { + statement_kind: StatementKind::Select, + reads: vec![], + writes: vec![], + lineage: vec![], + diagnostics: vec![diag(ColumnLevelDiagnosticKind::WildcardSuppressed)], + }, + ); + // Span info ("at L1:C8") is duplicated in message and surfaced + // as structured data for programmatic consumers. + assert!( + ops.diagnostics[0].message.contains("at L1:C8"), + "expected span suffix in message, got: {}", + ops.diagnostics[0].message + ); + let span = ops.diagnostics[0] + .span + .expect("wildcard token carries a span"); + assert_eq!(span.start.line, 1); + assert_eq!(span.start.column, 8); + } + + #[test] + fn qualified_wildcard_in_projection_reports_diagnostic() { + assert_column_ops( + "SELECT t1.* FROM t1", + ColumnOperation { + statement_kind: StatementKind::Select, + reads: vec![], + writes: vec![], + lineage: vec![], + diagnostics: vec![diag(ColumnLevelDiagnosticKind::WildcardSuppressed)], + }, + ); + } + + #[test] + fn multiple_statements_produce_multiple_results() { + let sql = "SELECT t1.a FROM t1; SELECT t2.b FROM t2"; + assert_nth_column_ops( + sql, + 0, + ColumnOperation { + statement_kind: StatementKind::Select, + reads: vec![read("t1", "a")], + writes: vec![], + lineage: vec![passthrough(col("t1", "a"), out("a", 0))], + diagnostics: vec![], + }, + ); + assert_nth_column_ops( + sql, + 1, + ColumnOperation { + statement_kind: StatementKind::Select, + reads: vec![read("t2", "b")], + writes: vec![], + lineage: vec![passthrough(col("t2", "b"), out("b", 0))], + diagnostics: vec![], + }, + ); + } + + #[test] + fn wildcard_select_yields_no_column_ops() { + assert_column_ops( + "SELECT * FROM t1", + ColumnOperation { + statement_kind: StatementKind::Select, + reads: vec![], + writes: vec![], + lineage: vec![], + diagnostics: vec![diag(ColumnLevelDiagnosticKind::WildcardSuppressed)], + }, + ); + } + } + + mod lineage { + use super::*; + + #[test] + fn 
select_bare_column_emits_passthrough_edge_to_query_output() { + assert_column_ops( + "SELECT a FROM t1", + ColumnOperation { + statement_kind: StatementKind::Select, + reads: vec![read("t1", "a")], + writes: vec![], + lineage: vec![passthrough(col("t1", "a"), out("a", 0))], + diagnostics: vec![], + }, + ); + } + + #[test] + fn select_aliased_column_uses_alias_as_output_name() { + assert_column_ops( + "SELECT a AS x FROM t1", + ColumnOperation { + statement_kind: StatementKind::Select, + reads: vec![read("t1", "a")], + writes: vec![], + lineage: vec![passthrough(col("t1", "a"), out("x", 0))], + diagnostics: vec![], + }, + ); + } + + #[test] + fn select_arithmetic_emits_one_transformation_edge_per_source() { + assert_column_ops( + "SELECT a + b FROM t1", + ColumnOperation { + statement_kind: StatementKind::Select, + reads: vec![read("t1", "a"), read("t1", "b")], + writes: vec![], + lineage: vec![ + transformation(col("t1", "a"), out_anon(0)), + transformation(col("t1", "b"), out_anon(0)), + ], + diagnostics: vec![], + }, + ); + } + + #[test] + fn select_mixed_projection_separates_targets_by_position() { + assert_column_ops( + "SELECT a, a + b FROM t1", + ColumnOperation { + statement_kind: StatementKind::Select, + reads: vec![read("t1", "a"), read("t1", "a"), read("t1", "b")], + writes: vec![], + lineage: vec![ + passthrough(col("t1", "a"), out("a", 0)), + transformation(col("t1", "a"), out_anon(1)), + transformation(col("t1", "b"), out_anon(1)), + ], + diagnostics: vec![], + }, + ); + } + + #[test] + fn select_qualified_ref_in_expression_resolves_directly() { + assert_column_ops( + "SELECT t1.a + t1.b AS sum FROM t1", + ColumnOperation { + statement_kind: StatementKind::Select, + reads: vec![read("t1", "a"), read("t1", "b")], + writes: vec![], + lineage: vec![ + transformation(col("t1", "a"), out("sum", 0)), + transformation(col("t1", "b"), out("sum", 0)), + ], + diagnostics: vec![], + }, + ); + } + + #[test] + fn insert_select_pairs_target_cols_positionally() { + 
assert_column_ops( + "INSERT INTO t1 (a, b) SELECT x, y FROM t2", + ColumnOperation { + statement_kind: StatementKind::Insert, + reads: vec![read("t2", "x"), read("t2", "y")], + writes: vec![write("t1", "a"), write("t1", "b")], + lineage: vec![ + passthrough(col("t2", "x"), relation("t1", "a")), + passthrough(col("t2", "y"), relation("t1", "b")), + ], + diagnostics: vec![], + }, + ); + } + + #[test] + fn insert_select_transformation_marks_kind_per_source() { + assert_column_ops( + "INSERT INTO t1 (a) SELECT x + y FROM t2", + ColumnOperation { + statement_kind: StatementKind::Insert, + reads: vec![read("t2", "x"), read("t2", "y")], + writes: vec![write("t1", "a")], + lineage: vec![ + transformation(col("t2", "x"), relation("t1", "a")), + transformation(col("t2", "y"), relation("t1", "a")), + ], + diagnostics: vec![], + }, + ); + } + + #[test] + fn insert_select_union_pairs_both_branches_with_target_cols() { + // Both UNION branches feed the same INSERT target positions, + // so each branch's projection should pair `position N → t.col_N`. + assert_column_ops( + "INSERT INTO t1 (a, b) \ + SELECT x, y FROM t2 \ + UNION ALL \ + SELECT p, q FROM t3", + ColumnOperation { + statement_kind: StatementKind::Insert, + reads: vec![ + read("t2", "x"), + read("t2", "y"), + read("t3", "p"), + read("t3", "q"), + ], + writes: vec![write("t1", "a"), write("t1", "b")], + lineage: vec![ + passthrough(col("t2", "x"), relation("t1", "a")), + passthrough(col("t2", "y"), relation("t1", "b")), + passthrough(col("t3", "p"), relation("t1", "a")), + passthrough(col("t3", "q"), relation("t1", "b")), + ], + diagnostics: vec![], + }, + ); + } + + #[test] + fn insert_without_explicit_cols_emits_no_lineage() { + // Target column names would need catalog-driven positional + // mapping; without catalog the resolver emits nothing. 
+ assert_column_ops( + "INSERT INTO t1 SELECT x FROM t2", + ColumnOperation { + statement_kind: StatementKind::Insert, + reads: vec![read("t2", "x")], + writes: vec![], + lineage: vec![], + diagnostics: vec![], + }, + ); + } + + #[test] + fn insert_values_with_literals_emits_no_lineage() { + assert_column_ops( + "INSERT INTO t1 (a, b) VALUES (1, 2)", + ColumnOperation { + statement_kind: StatementKind::Insert, + reads: vec![], + writes: vec![write("t1", "a"), write("t1", "b")], + lineage: vec![], + diagnostics: vec![], + }, + ); + } + + #[test] + fn update_set_literal_emits_no_lineage() { + assert_column_ops( + "UPDATE t1 SET a = 1", + ColumnOperation { + statement_kind: StatementKind::Update, + reads: vec![], + writes: vec![write("t1", "a")], + lineage: vec![], + diagnostics: vec![], + }, + ); + } + + #[test] + fn delete_emits_no_lineage() { + assert_column_ops( + "DELETE FROM t1 WHERE id = 5", + ColumnOperation { + statement_kind: StatementKind::Delete, + reads: vec![read("t1", "id")], + writes: vec![], + lineage: vec![], + diagnostics: vec![], + }, + ); + } + + #[test] + fn wildcard_select_emits_no_lineage() { + assert_column_ops( + "SELECT * FROM t1", + ColumnOperation { + statement_kind: StatementKind::Select, + reads: vec![], + writes: vec![], + lineage: vec![], + diagnostics: vec![diag(ColumnLevelDiagnosticKind::WildcardSuppressed)], + }, + ); + } + + #[test] + fn update_set_passthrough_lineage() { + assert_column_ops( + "UPDATE t1 SET a = b", + ColumnOperation { + statement_kind: StatementKind::Update, + reads: vec![read("t1", "b")], + writes: vec![write("t1", "a")], + lineage: vec![passthrough(col("t1", "b"), relation("t1", "a"))], + diagnostics: vec![], + }, + ); + } + + #[test] + fn update_set_transformation_lineage() { + assert_column_ops( + "UPDATE t1 SET a = b + 1", + ColumnOperation { + statement_kind: StatementKind::Update, + reads: vec![read("t1", "b")], + writes: vec![write("t1", "a")], + lineage: vec![transformation(col("t1", "b"), relation("t1", 
"a"))], + diagnostics: vec![], + }, + ); + } + + #[test] + fn update_set_with_qualified_rhs_resolves_to_other_table() { + assert_column_ops( + "UPDATE t1 SET a = t2.b FROM t2 WHERE t1.id = t2.id", + ColumnOperation { + statement_kind: StatementKind::Update, + reads: vec![read("t2", "b"), read("t1", "id"), read("t2", "id")], + writes: vec![write("t1", "a")], + lineage: vec![passthrough(col("t2", "b"), relation("t1", "a"))], + diagnostics: vec![], + }, + ); + } + + #[test] + fn aggregate_call_in_projection_emits_transformation_edge() { + assert_column_ops( + "SELECT SUM(a) FROM t1", + ColumnOperation { + statement_kind: StatementKind::Select, + reads: vec![read("t1", "a")], + writes: vec![], + lineage: vec![transformation(col("t1", "a"), out_anon(0))], + diagnostics: vec![], + }, + ); + } + + #[test] + fn aggregate_with_alias_carries_aliased_name() { + assert_column_ops( + "SELECT COUNT(b) AS n FROM t1", + ColumnOperation { + statement_kind: StatementKind::Select, + reads: vec![read("t1", "b")], + writes: vec![], + lineage: vec![transformation(col("t1", "b"), out("n", 0))], + diagnostics: vec![], + }, + ); + } + + #[test] + fn aggregate_wrapped_in_expression_is_transformation() { + // `SUM(a) + 1` is a value-changing expression, so the lineage edge + // is Transformation — same kind a bare aggregate call would + // produce, since the model no longer sub-classifies them. 
+ assert_column_ops( + "SELECT SUM(a) + 1 FROM t1", + ColumnOperation { + statement_kind: StatementKind::Select, + reads: vec![read("t1", "a")], + writes: vec![], + lineage: vec![transformation(col("t1", "a"), out_anon(0))], + diagnostics: vec![], + }, + ); + } + + #[test] + fn aggregate_in_insert_select_propagates_transformation() { + assert_column_ops( + "INSERT INTO t2 (n) SELECT COUNT(a) FROM t1", + ColumnOperation { + statement_kind: StatementKind::Insert, + reads: vec![read("t1", "a")], + writes: vec![write("t2", "n")], + lineage: vec![transformation(col("t1", "a"), relation("t2", "n"))], + diagnostics: vec![], + }, + ); + } + + #[test] + fn cte_aggregate_composes_to_outer_as_transformation() { + // CTE body's `s` is Transformation (SUM(a)); outer's bare `s` + // would be Passthrough, but composition keeps the chain a + // Transformation (any transforming step dominates). + assert_column_ops( + "WITH cte AS (SELECT SUM(a) AS s FROM t1) SELECT s FROM cte", + ColumnOperation { + statement_kind: StatementKind::Select, + reads: vec![read("t1", "a")], + writes: vec![], + lineage: vec![transformation(col("t1", "a"), out("s", 0))], + diagnostics: vec![], + }, + ); + } + } + + mod cte_derived_rename { + use super::*; + + #[test] + fn cte_column_rename_composes_through_renamed_name() { + // Outer `a` refers to cte's renamed column at position 0, + // which body-positionally is `x` from t. Composition follows + // the renamed name back to the body item, then to t.x. + // Reads surface only the real-table ref (CTE binding is + // synthetic, dropped). 
+ assert_column_ops( + "WITH cte (a) AS (SELECT x FROM t) SELECT a FROM cte", + ColumnOperation { + statement_kind: StatementKind::Select, + reads: vec![read("t", "x")], + writes: vec![], + lineage: vec![passthrough(col("t", "x"), out("a", 0))], + diagnostics: vec![], + }, + ); + } + + #[test] + fn cte_column_alias_matched_case_insensitively() { + // The CTE projects `x AS Foo`; the outer query references it + // as unquoted `foo`. Composition's name-match folds both + // sides to the same key, so `foo` composes back to the real + // source `t1.x`. + assert_column_ops( + "WITH cte AS (SELECT x AS Foo FROM t1) SELECT foo FROM cte", + ColumnOperation { + statement_kind: StatementKind::Select, + reads: vec![read("t1", "x")], + writes: vec![], + lineage: vec![passthrough(col("t1", "x"), out("foo", 0))], + diagnostics: vec![], + }, + ); + } + + #[test] + fn cte_column_rename_partial_keeps_remaining_body_names() { + // Rename `(p)` covers position 0 only. Position 1's body name + // `y` survives; outer can reference `p` or `y`. + assert_column_ops( + "WITH cte (p) AS (SELECT x, y FROM t) SELECT p, y FROM cte", + ColumnOperation { + statement_kind: StatementKind::Select, + reads: vec![read("t", "x"), read("t", "y")], + writes: vec![], + lineage: vec![ + passthrough(col("t", "x"), out("p", 0)), + passthrough(col("t", "y"), out("y", 1)), + ], + diagnostics: vec![], + }, + ); + } + + #[test] + fn derived_table_column_rename_composes() { + // `(SELECT x FROM t) AS d(a)` — outer `a` resolves via d's + // renamed column at position 0 → body item x → t.x. 
+ assert_column_ops( + "SELECT a FROM (SELECT x FROM t) d(a)", + ColumnOperation { + statement_kind: StatementKind::Select, + reads: vec![read("t", "x")], + writes: vec![], + lineage: vec![passthrough(col("t", "x"), out("a", 0))], + diagnostics: vec![], + }, + ); + } + + #[test] + fn cte_column_rename_into_insert() { + // `INSERT INTO t2 (col) WITH cte(a) AS (SELECT x FROM t1) + // SELECT a FROM cte` composes through both the CTE rename + // and the INSERT pairing: t1.x → t2.col. + assert_column_ops( + "INSERT INTO t2 (col) WITH cte (a) AS (SELECT x FROM t1) \ + SELECT a FROM cte", + ColumnOperation { + statement_kind: StatementKind::Insert, + reads: vec![read("t1", "x")], + writes: vec![write("t2", "col")], + lineage: vec![passthrough(col("t1", "x"), relation("t2", "col"))], + diagnostics: vec![], + }, + ); + } + } + + mod with_in_dml { + //! `WITH cte AS (...) ` — Postgres / Sqlite / standard + //! SQL syntax for binding CTEs visible to a DML statement. + //! sqlparser typically parses these as Query-with-WITH at the + //! source level for INSERT, and wraps Update / Delete in + //! various ways. These tests pin down what actually surfaces + //! through the resolver. + use super::*; + + #[test] + fn with_in_insert_select_composes_cte_to_target() { + assert_column_ops( + "WITH cte AS (SELECT x FROM s) INSERT INTO t (a) SELECT x FROM cte", + ColumnOperation { + statement_kind: StatementKind::Insert, + reads: vec![read("s", "x")], + writes: vec![write("t", "a")], + lineage: vec![passthrough(col("s", "x"), relation("t", "a"))], + diagnostics: vec![], + }, + ); + } + + #[test] + fn with_in_update_via_scalar_subquery_composes() { + // CTE referenced from the SET RHS scalar subquery. The + // subquery emits no QueryOutput edge of its own (Option B); + // the UPDATE SET assignment captures its source (composed + // through cte to s.x) and emits the single Relation edge. + // Transformation (the value is derived through max + the + // subquery wrapping). 
+ assert_column_ops( + "WITH cte AS (SELECT max(x) AS m FROM s) \ + UPDATE t SET a = (SELECT m FROM cte) WHERE id = 1", + ColumnOperation { + statement_kind: StatementKind::Update, + reads: vec![read("s", "x"), read("t", "id")], + writes: vec![write("t", "a")], + lineage: vec![transformation(col("s", "x"), relation("t", "a"))], + diagnostics: vec![], + }, + ); + } + + #[test] + fn with_in_delete_via_predicate_subquery_keeps_cte_source_as_read() { + // The DELETE target `t` lives in its own scope (the SetExpr + // DML scope), so the outer predicate `id` resolves + // unambiguously to `t`. The predicate subquery feeds a + // filter, so it emits no lineage (Option B); its refs (s.id + // via the cte) still surface in reads. DELETE has no column + // lineage of its own — so lineage is empty. + assert_column_ops( + "WITH cte AS (SELECT id FROM s WHERE flag) \ + DELETE FROM t WHERE id IN (SELECT id FROM cte)", + ColumnOperation { + statement_kind: StatementKind::Delete, + reads: vec![read("s", "id"), read("s", "flag"), read("t", "id")], + writes: vec![], + lineage: vec![], + diagnostics: vec![], + }, + ); + } + + #[test] + fn with_multiple_ctes_chained_into_insert() { + // Two CTEs where `b` references `a`. INSERT then pulls + // from `b`. Composition walks back through both layers + // to the base table. 
+ assert_column_ops( + "WITH a AS (SELECT id FROM t1), \ + b AS (SELECT id + 1 AS x FROM a) \ + INSERT INTO t2 (col) SELECT x FROM b", + ColumnOperation { + statement_kind: StatementKind::Insert, + reads: vec![read("t1", "id")], + writes: vec![write("t2", "col")], + lineage: vec![transformation(col("t1", "id"), relation("t2", "col"))], + diagnostics: vec![], + }, + ); + } + } + + mod merge { + use super::*; + + #[test] + fn merge_when_matched_update_emits_lineage_and_write() { + assert_column_ops( + "MERGE INTO t USING s ON t.id = s.id WHEN MATCHED THEN UPDATE SET t.a = s.a", + ColumnOperation { + statement_kind: StatementKind::Merge, + reads: vec![read("t", "id"), read("s", "id"), read("s", "a")], + writes: vec![write("t", "a")], + lineage: vec![passthrough(col("s", "a"), relation("t", "a"))], + diagnostics: vec![], + }, + ); + } + + #[test] + fn merge_when_not_matched_insert_emits_lineage_and_write() { + assert_column_ops( + "MERGE INTO t USING s ON t.id = s.id \ + WHEN NOT MATCHED THEN INSERT (id, a) VALUES (s.id, s.a)", + ColumnOperation { + statement_kind: StatementKind::Merge, + reads: vec![ + read("t", "id"), + read("s", "id"), + read("s", "id"), + read("s", "a"), + ], + writes: vec![write("t", "id"), write("t", "a")], + lineage: vec![ + passthrough(col("s", "id"), relation("t", "id")), + passthrough(col("s", "a"), relation("t", "a")), + ], + diagnostics: vec![], + }, + ); + } + + #[test] + fn merge_delete_action_emits_no_lineage_no_write() { + assert_column_ops( + "MERGE INTO t USING s ON t.id = s.id WHEN MATCHED THEN DELETE", + ColumnOperation { + statement_kind: StatementKind::Merge, + reads: vec![read("t", "id"), read("s", "id")], + writes: vec![], + lineage: vec![], + diagnostics: vec![], + }, + ); + } + + #[test] + fn merge_combined_clauses_emit_per_clause_lineage_and_writes() { + assert_column_ops( + "MERGE INTO t USING s ON t.id = s.id \ + WHEN MATCHED THEN UPDATE SET t.a = s.a \ + WHEN NOT MATCHED THEN INSERT (id, a) VALUES (s.id, s.a)", + 
ColumnOperation { + statement_kind: StatementKind::Merge, + reads: vec![ + read("t", "id"), + read("s", "id"), + read("s", "a"), + read("s", "id"), + read("s", "a"), + ], + writes: vec![write("t", "a"), write("t", "id"), write("t", "a")], + lineage: vec![ + passthrough(col("s", "a"), relation("t", "a")), + passthrough(col("s", "id"), relation("t", "id")), + passthrough(col("s", "a"), relation("t", "a")), + ], + diagnostics: vec![], + }, + ); + } + + #[test] + fn merge_update_transformation_kind_propagates() { + assert_column_ops( + "MERGE INTO t USING s ON t.id = s.id \ + WHEN MATCHED THEN UPDATE SET t.a = s.a + 1", + ColumnOperation { + statement_kind: StatementKind::Merge, + reads: vec![read("t", "id"), read("s", "id"), read("s", "a")], + writes: vec![write("t", "a")], + lineage: vec![transformation(col("s", "a"), relation("t", "a"))], + diagnostics: vec![], + }, + ); + } + } + + mod ctas_view { + use super::*; + + #[test] + fn ctas_pairs_source_projection_with_inferred_column_names() { + // CREATE TABLE AS SELECT — no explicit column list, so target + // columns follow the source projection's inferred names + // (alias > bare ident). + assert_column_ops( + "CREATE TABLE t AS SELECT x AS a, y FROM s", + ColumnOperation { + statement_kind: StatementKind::CreateTable, + reads: vec![read("s", "x"), read("s", "y")], + writes: vec![write("t", "a"), write("t", "y")], + lineage: vec![ + passthrough(col("s", "x"), relation("t", "a")), + passthrough(col("s", "y"), relation("t", "y")), + ], + diagnostics: vec![], + }, + ); + } + + #[test] + fn ctas_with_explicit_columns_overrides_projection_names() { + // Explicit column list wins over inferred names. 
+ assert_column_ops( + "CREATE TABLE t (p INT, q INT) AS SELECT x, y FROM s", + ColumnOperation { + statement_kind: StatementKind::CreateTable, + reads: vec![read("s", "x"), read("s", "y")], + writes: vec![write("t", "p"), write("t", "q")], + lineage: vec![ + passthrough(col("s", "x"), relation("t", "p")), + passthrough(col("s", "y"), relation("t", "q")), + ], + diagnostics: vec![], + }, + ); + } + + #[test] + fn ctas_propagates_transformation_kind() { + assert_column_ops( + "CREATE TABLE t AS SELECT SUM(x) AS total FROM s", + ColumnOperation { + statement_kind: StatementKind::CreateTable, + reads: vec![read("s", "x")], + writes: vec![write("t", "total")], + lineage: vec![transformation(col("s", "x"), relation("t", "total"))], + diagnostics: vec![], + }, + ); + } + + #[test] + fn create_view_pairs_source_projection() { + assert_column_ops( + "CREATE VIEW v AS SELECT x AS a, y FROM s", + ColumnOperation { + statement_kind: StatementKind::CreateView, + reads: vec![read("s", "x"), read("s", "y")], + writes: vec![write("v", "a"), write("v", "y")], + lineage: vec![ + passthrough(col("s", "x"), relation("v", "a")), + passthrough(col("s", "y"), relation("v", "y")), + ], + diagnostics: vec![], + }, + ); + } + + #[test] + fn create_view_with_explicit_columns_uses_list() { + assert_column_ops( + "CREATE VIEW v (a, b) AS SELECT x, y FROM s", + ColumnOperation { + statement_kind: StatementKind::CreateView, + reads: vec![read("s", "x"), read("s", "y")], + writes: vec![write("v", "a"), write("v", "b")], + lineage: vec![ + passthrough(col("s", "x"), relation("v", "a")), + passthrough(col("s", "y"), relation("v", "b")), + ], + diagnostics: vec![], + }, + ); + } + + #[test] + fn alter_view_pairs_replacement_query_projection() { + assert_column_ops( + "ALTER VIEW v AS SELECT x AS a FROM s", + ColumnOperation { + statement_kind: StatementKind::AlterView, + reads: vec![read("s", "x")], + writes: vec![write("v", "a")], + lineage: vec![passthrough(col("s", "x"), relation("v", "a"))], + 
diagnostics: vec![], + }, + ); + } + + #[test] + fn ctas_unnamed_projection_yields_no_paired_lineage() { + // `SELECT 1` has no column ref and no inferable name, so the + // CTAS source produces no lineage / no write for that slot. + assert_column_ops( + "CREATE TABLE t AS SELECT 1 FROM s", + ColumnOperation { + statement_kind: StatementKind::CreateTable, + reads: vec![], + writes: vec![], + lineage: vec![], + diagnostics: vec![], + }, + ); + } + + #[test] + fn aggregate_with_distinct_args_marker() { + // COUNT(DISTINCT user_id) — an aggregate call, so the source + // feeds into the output as a Transformation. + assert_column_ops( + "SELECT COUNT(DISTINCT user_id) FROM t1", + ColumnOperation { + statement_kind: StatementKind::Select, + reads: vec![read("t1", "user_id")], + writes: vec![], + lineage: vec![transformation(col("t1", "user_id"), out_anon(0))], + diagnostics: vec![], + }, + ); + } + + #[test] + fn aggregate_with_filter_clause_marker() { + // SUM(x) FILTER (WHERE y > 0) — both `x` and `y` surface as + // reads, and both feed into the aggregate's output as + // Transformation. Anything mentioned inside the aggregate's + // syntactic boundary (args + FILTER predicate) is a lineage + // source, not just the bare argument. + assert_column_ops( + "SELECT SUM(x) FILTER (WHERE y > 0) FROM t1", + ColumnOperation { + statement_kind: StatementKind::Select, + reads: vec![read("t1", "x"), read("t1", "y")], + writes: vec![], + lineage: vec![ + transformation(col("t1", "x"), out_anon(0)), + transformation(col("t1", "y"), out_anon(0)), + ], + diagnostics: vec![], + }, + ); + } + + #[test] + fn cte_aggregate_then_outer_expression_still_transformation() { + // Outer wraps the CTE column in an expression (s + 1) — + // composition: outer Transformation × inner Transformation = + // Transformation. 
+ assert_column_ops( + "WITH cte AS (SELECT SUM(a) AS s FROM t1) SELECT s + 1 FROM cte", + ColumnOperation { + statement_kind: StatementKind::Select, + reads: vec![read("t1", "a")], + writes: vec![], + lineage: vec![transformation(col("t1", "a"), out_anon(0))], + diagnostics: vec![], + }, + ); + } + } + + mod composition { + use super::*; + + #[test] + fn cte_passthrough_composes_to_base_table() { + // The outer edge's source `id` resolves to cte, then composes + // through the CTE body's projection back to t1.id. No + // intermediate cte.id → out edge survives. + assert_column_ops( + "WITH cte AS (SELECT id FROM t1) SELECT id FROM cte", + ColumnOperation { + statement_kind: StatementKind::Select, + reads: vec![read("t1", "id")], + writes: vec![], + lineage: vec![passthrough(col("t1", "id"), out("id", 0))], + diagnostics: vec![], + }, + ); + } + + #[test] + fn cte_transformation_propagates_kind_after_composition() { + // CTE body's `sum` is a transformation of a, b. Outer's bare + // `sum` composes back into two edges, each Transformation + // because the body item is (outer.bare && item.bare = false). + assert_column_ops( + "WITH cte AS (SELECT a + b AS sum FROM t1) SELECT sum FROM cte", + ColumnOperation { + statement_kind: StatementKind::Select, + reads: vec![read("t1", "a"), read("t1", "b")], + writes: vec![], + lineage: vec![ + transformation(col("t1", "a"), out("sum", 0)), + transformation(col("t1", "b"), out("sum", 0)), + ], + diagnostics: vec![], + }, + ); + } + + #[test] + fn cte_to_insert_composes_end_to_end() { + // Composition reaches past the CTE boundary into the INSERT + // target — t1.id → t2.x directly, no cte.id step. 
+ assert_column_ops( + "INSERT INTO t2 (x) WITH cte AS (SELECT id FROM t1) SELECT id FROM cte", + ColumnOperation { + statement_kind: StatementKind::Insert, + reads: vec![read("t1", "id")], + writes: vec![write("t2", "x")], + lineage: vec![passthrough(col("t1", "id"), relation("t2", "x"))], + diagnostics: vec![], + }, + ); + } + + #[test] + fn cte_chain_composes_through_all_levels() { + // a → b → outer: outer's `b.id` composes via b's body back to + // a, then via a's body back to t1. Outer is qualified because + // having both `a` and `b` in scope with the same column name + // makes the unqualified form ambiguous under our scope model + // (outer SELECT sees both CTE bindings, not just b). + assert_column_ops( + "WITH a AS (SELECT id FROM t1), b AS (SELECT id FROM a) SELECT b.id FROM b", + ColumnOperation { + statement_kind: StatementKind::Select, + reads: vec![read("t1", "id")], + writes: vec![], + lineage: vec![passthrough(col("t1", "id"), out("id", 0))], + diagnostics: vec![], + }, + ); + } + + #[test] + fn derived_table_composes_to_base_table() { + // The outer projection's `col` composes through derived `d`'s + // body (a + b AS col) into two Transformation edges on t1. + assert_column_ops( + "SELECT col FROM (SELECT a + b AS col FROM t1) d", + ColumnOperation { + statement_kind: StatementKind::Select, + reads: vec![read("t1", "a"), read("t1", "b")], + writes: vec![], + lineage: vec![ + transformation(col("t1", "a"), out("col", 0)), + transformation(col("t1", "b"), out("col", 0)), + ], + diagnostics: vec![], + }, + ); + } + + #[test] + fn cte_referenced_twice_composes_each_use() { + // Each cte reference in the projection composes independently + // back to t1.id. 
+ assert_column_ops( + "WITH cte AS (SELECT id FROM t1) SELECT cte.id AS a, cte.id AS b FROM cte", + ColumnOperation { + statement_kind: StatementKind::Select, + reads: vec![read("t1", "id")], + writes: vec![], + lineage: vec![ + passthrough(col("t1", "id"), out("a", 0)), + passthrough(col("t1", "id"), out("b", 1)), + ], + diagnostics: vec![], + }, + ); + } + + #[test] + fn recursive_cte_does_not_panic_and_skips_composition() { + // Recursive CTEs don't carry body_projections (fixpoint is + // deferred), so composition falls back to leaving the lineage edge + // source pointing at the CTE binding (`r.id`) rather than + // tracing into a base table. Reads still get the synthetic + // filter, so only `t1.id` from the non-recursive branch + // surfaces in reads. No infinite recursion either. + assert_column_ops( + "WITH RECURSIVE r AS (SELECT id FROM t1 UNION SELECT id FROM r) SELECT id FROM r", + ColumnOperation { + statement_kind: StatementKind::Select, + reads: vec![read("t1", "id")], + writes: vec![], + lineage: vec![ColumnLineageEdge { + source: ColumnReference { + table: Some(TableReference { + catalog: None, + schema: None, + name: "r".into(), + }), + name: "id".into(), + }, + target: out("id", 0), + kind: ColumnLineageKind::Passthrough, + }], + diagnostics: vec![], + }, + ); + } + } + + mod set_operations { + use super::*; + + #[test] + fn union_two_branches_emit_query_output_per_branch() { + // Each branch contributes its own ProjectionGroup, so both + // branches' projections fan out independently into + // QueryOutput edges. Position is per-group, so both land at + // position 0; name follows each branch's own projection. 
+ assert_column_ops( + "SELECT a FROM t1 UNION SELECT b FROM t2", + ColumnOperation { + statement_kind: StatementKind::Select, + reads: vec![read("t1", "a"), read("t2", "b")], + writes: vec![], + lineage: vec![ + passthrough(col("t1", "a"), out("a", 0)), + passthrough(col("t2", "b"), out("b", 0)), + ], + diagnostics: vec![], + }, + ); + } + + #[test] + fn union_all_behaves_same_as_union() { + // UNION ALL only differs from UNION at runtime (dedup vs + // not); structurally the resolver should treat them identically. + assert_column_ops( + "SELECT a FROM t1 UNION ALL SELECT b FROM t2", + ColumnOperation { + statement_kind: StatementKind::Select, + reads: vec![read("t1", "a"), read("t2", "b")], + writes: vec![], + lineage: vec![ + passthrough(col("t1", "a"), out("a", 0)), + passthrough(col("t2", "b"), out("b", 0)), + ], + diagnostics: vec![], + }, + ); + } + + #[test] + fn intersect_behaves_same_as_union() { + assert_column_ops( + "SELECT a FROM t1 INTERSECT SELECT b FROM t2", + ColumnOperation { + statement_kind: StatementKind::Select, + reads: vec![read("t1", "a"), read("t2", "b")], + writes: vec![], + lineage: vec![ + passthrough(col("t1", "a"), out("a", 0)), + passthrough(col("t2", "b"), out("b", 0)), + ], + diagnostics: vec![], + }, + ); + } + + #[test] + fn except_behaves_same_as_union() { + assert_column_ops( + "SELECT a FROM t1 EXCEPT SELECT b FROM t2", + ColumnOperation { + statement_kind: StatementKind::Select, + reads: vec![read("t1", "a"), read("t2", "b")], + writes: vec![], + lineage: vec![ + passthrough(col("t1", "a"), out("a", 0)), + passthrough(col("t2", "b"), out("b", 0)), + ], + diagnostics: vec![], + }, + ); + } + + #[test] + fn three_way_union_emits_one_lineage_edge_per_branch() { + // Chained UNION parses left-associatively as + // `(t1 UNION t2) UNION t3`, so the resolver recursively + // visits each base SELECT and each contributes its own group. 
+ assert_column_ops( + "SELECT a FROM t1 UNION SELECT b FROM t2 UNION SELECT c FROM t3", + ColumnOperation { + statement_kind: StatementKind::Select, + reads: vec![read("t1", "a"), read("t2", "b"), read("t3", "c")], + writes: vec![], + lineage: vec![ + passthrough(col("t1", "a"), out("a", 0)), + passthrough(col("t2", "b"), out("b", 0)), + passthrough(col("t3", "c"), out("c", 0)), + ], + diagnostics: vec![], + }, + ); + } + + #[test] + fn union_with_where_classifies_per_branch_kind() { + // Each branch's WHERE is its own filter scope, so each + // branch produces a Projection read plus a Filter read for + // its own column. + assert_column_ops( + "SELECT a FROM t1 WHERE a > 0 UNION SELECT b FROM t2 WHERE b < 10", + ColumnOperation { + statement_kind: StatementKind::Select, + reads: vec![ + read("t1", "a"), + read("t1", "a"), + read("t2", "b"), + read("t2", "b"), + ], + writes: vec![], + lineage: vec![ + passthrough(col("t1", "a"), out("a", 0)), + passthrough(col("t2", "b"), out("b", 0)), + ], + diagnostics: vec![], + }, + ); + } + + #[test] + fn union_mixed_passthrough_and_transformation_kinds() { + // Branch lineage kinds are independent. Left passthrough, right + // transformation; both contribute to the same output position. 
+ assert_column_ops( + "SELECT a FROM t1 UNION SELECT b + 1 AS a FROM t2", + ColumnOperation { + statement_kind: StatementKind::Select, + reads: vec![read("t1", "a"), read("t2", "b")], + writes: vec![], + lineage: vec![ + passthrough(col("t1", "a"), out("a", 0)), + transformation(col("t2", "b"), out("a", 0)), + ], + diagnostics: vec![], + }, + ); + } + + #[test] + fn union_with_aggregate_branch_emits_transformation_edge() { + assert_column_ops( + "SELECT id FROM t1 UNION SELECT COUNT(id) AS id FROM t2", + ColumnOperation { + statement_kind: StatementKind::Select, + reads: vec![read("t1", "id"), read("t2", "id")], + writes: vec![], + lineage: vec![ + passthrough(col("t1", "id"), out("id", 0)), + transformation(col("t2", "id"), out("id", 0)), + ], + diagnostics: vec![], + }, + ); + } + + #[test] + fn union_in_subquery_composes_both_branches_to_outer() { + // The inner UNION lives in a derived subquery; the outer + // SELECT projects from it and composes back to the base + // tables of both branches — no intermediate QueryOutput + // edge for the subquery survives. + assert_column_ops( + "SELECT x FROM (SELECT a AS x FROM t1 UNION SELECT b AS x FROM t2) sub", + ColumnOperation { + statement_kind: StatementKind::Select, + reads: vec![read("t1", "a"), read("t2", "b")], + writes: vec![], + lineage: vec![ + passthrough(col("t1", "a"), out("x", 0)), + passthrough(col("t2", "b"), out("x", 0)), + ], + diagnostics: vec![], + }, + ); + } + + #[test] + fn union_in_cte_composes_to_outer_use() { + // CTE body is a UNION. Outer SELECT pulls `x` from the cte. + // Composition should walk back through both branches to t1/t2. 
+ assert_column_ops( + "WITH cte AS (SELECT a AS x FROM t1 UNION SELECT b AS x FROM t2) \ + SELECT x FROM cte", + ColumnOperation { + statement_kind: StatementKind::Select, + reads: vec![read("t1", "a"), read("t2", "b")], + writes: vec![], + lineage: vec![ + passthrough(col("t1", "a"), out("x", 0)), + passthrough(col("t2", "b"), out("x", 0)), + ], + diagnostics: vec![], + }, + ); + } + + #[test] + fn ctas_with_union_body_pairs_left_branch_names_for_all_branches() { + // CTAS schema follows the LEFT branch's projection names + // (SQL standard). The inferred-name path uses the first + // ProjectionGroup's item names for every branch's + // positional pairing — same as INSERT-SELECT-UNION. So: + // - writes: only `dst.a` (left branch's name) + // - lineage: BOTH branches feed `Relation(dst.a)` + assert_column_ops( + "CREATE TABLE dst AS SELECT a FROM t1 UNION SELECT b FROM t2", + ColumnOperation { + statement_kind: StatementKind::CreateTable, + reads: vec![read("t1", "a"), read("t2", "b")], + writes: vec![write("dst", "a")], + lineage: vec![ + passthrough(col("t1", "a"), relation("dst", "a")), + passthrough(col("t2", "b"), relation("dst", "a")), + ], + diagnostics: vec![], + }, + ); + } + + #[test] + fn ctas_with_explicit_columns_and_union_body_pairs_left_target_for_all_branches() { + // When CTAS specifies its own column list, both branches + // pair positionally against the same target columns — same + // pattern as INSERT-SELECT-UNION. 
+ assert_column_ops( + "CREATE TABLE dst (x INT) AS SELECT a FROM t1 UNION SELECT b FROM t2", + ColumnOperation { + statement_kind: StatementKind::CreateTable, + reads: vec![read("t1", "a"), read("t2", "b")], + writes: vec![write("dst", "x")], + lineage: vec![ + passthrough(col("t1", "a"), relation("dst", "x")), + passthrough(col("t2", "b"), relation("dst", "x")), + ], + diagnostics: vec![], + }, + ); + } + + #[test] + fn union_with_trailing_order_by_ref_is_unresolved() { + // ORDER BY on the whole UNION is visited in the outer query + // scope, AFTER both branch scopes have been popped. The + // ORDER BY column refers to a UNION output column, not a + // base table — so `a` resolves to None (no in-scope + // binding). + assert_column_ops( + "SELECT a FROM t1 UNION SELECT b FROM t2 ORDER BY a", + ColumnOperation { + statement_kind: StatementKind::Select, + reads: vec![read("t1", "a"), read("t2", "b"), unresolved("a")], + writes: vec![], + lineage: vec![ + passthrough(col("t1", "a"), out("a", 0)), + passthrough(col("t2", "b"), out("b", 0)), + ], + diagnostics: vec![], + }, + ); + } + + #[test] + fn union_with_trailing_limit_literal_adds_nothing() { + // LIMIT 10 is a literal — no column refs, no extra lineage. + assert_column_ops( + "SELECT a FROM t1 UNION SELECT b FROM t2 LIMIT 10", + ColumnOperation { + statement_kind: StatementKind::Select, + reads: vec![read("t1", "a"), read("t2", "b")], + writes: vec![], + lineage: vec![ + passthrough(col("t1", "a"), out("a", 0)), + passthrough(col("t2", "b"), out("b", 0)), + ], + diagnostics: vec![], + }, + ); + } + } + + mod join_using_and_natural { + //! USING / NATURAL JOIN merge expansion is documented as + //! future work (see the module-level note in + //! column_operation_extractor). These tests pin down the + //! *current* shape so when USING / NATURAL JOIN expansion lands + //! (merged refs splitting into both source tables), the diff + //! will surface here. 
+ use super::*; + + #[test] + fn join_using_id_in_projection_is_unresolved_due_to_ambiguity() { + // `id` in the projection is unqualified with two candidate + // tables (t1, t2) — the resolver leaves it unresolved + // (`table: None`) because no catalog disambiguates and + // USING is not yet expanded into a merged-column binding. + assert_column_ops( + "SELECT id FROM t1 JOIN t2 USING (id)", + ColumnOperation { + statement_kind: StatementKind::Select, + reads: vec![unresolved("id")], + writes: vec![], + lineage: vec![ColumnLineageEdge { + source: ColumnReference { + table: None, + name: "id".into(), + }, + target: out("id", 0), + kind: ColumnLineageKind::Passthrough, + }], + diagnostics: vec![], + }, + ); + } + + #[test] + fn join_using_id_in_projection_and_where_yields_two_independent_unresolved_refs() { + // The same `id` ref in projection vs. WHERE produces two + // SEPARATE RawColumnRefs, each with a single-kind `kinds` + // vec. There is no merge into one ref-with-multi-kinds + // here — that would require resolver-level tracking of + // ref identity across clauses, which we don't do. + assert_column_ops( + "SELECT id FROM t1 JOIN t2 USING (id) WHERE id > 0", + ColumnOperation { + statement_kind: StatementKind::Select, + reads: vec![unresolved("id"), unresolved("id")], + writes: vec![], + lineage: vec![ColumnLineageEdge { + source: ColumnReference { + table: None, + name: "id".into(), + }, + target: out("id", 0), + kind: ColumnLineageKind::Passthrough, + }], + diagnostics: vec![], + }, + ); + } + + #[test] + fn join_using_qualified_id_resolves_to_named_table() { + // Qualifying the ref sidesteps the USING ambiguity: `t1.id` + // resolves to t1 unambiguously. Use this in real-world + // queries until USING expansion is available. 
+ assert_column_ops( + "SELECT t1.id FROM t1 JOIN t2 USING (id)", + ColumnOperation { + statement_kind: StatementKind::Select, + reads: vec![read("t1", "id")], + writes: vec![], + lineage: vec![passthrough(col("t1", "id"), out("id", 0))], + diagnostics: vec![], + }, + ); + } + + #[test] + fn natural_join_no_catalog_leaves_unqualified_refs_unresolved() { + // NATURAL JOIN's merge set comes from the intersection of + // both tables' column lists — only knowable with a + // catalog. Without one, the resolver doesn't expand, and + // unqualified `id` is multi-candidate-unresolved (same + // shape as plain JOIN ON without USING). + assert_column_ops( + "SELECT id FROM t1 NATURAL JOIN t2", + ColumnOperation { + statement_kind: StatementKind::Select, + reads: vec![unresolved("id")], + writes: vec![], + lineage: vec![ColumnLineageEdge { + source: ColumnReference { + table: None, + name: "id".into(), + }, + target: out("id", 0), + kind: ColumnLineageKind::Passthrough, + }], + diagnostics: vec![], + }, + ); + } + } + + mod lateral_and_correlation { + use super::*; + + #[test] + fn lateral_subquery_resolves_inner_ref_to_inner_table() { + // The existing-style LATERAL: the inner subquery only + // references its own tables. The outer FROM joins it as + // a derived source. The inner `id` resolves to t1 from + // the LATERAL subquery's own scope. + assert_column_ops( + "SELECT d.id FROM LATERAL (SELECT id FROM t1) AS d JOIN t2 ON d.id = t2.id", + ColumnOperation { + statement_kind: StatementKind::Select, + reads: vec![read("t1", "id"), read("t2", "id")], + writes: vec![], + lineage: vec![passthrough(col("t1", "id"), out("id", 0))], + diagnostics: vec![], + }, + ); + } + + #[test] + fn lateral_with_outer_scope_reference_resolves_via_scope_chain() { + // The interesting LATERAL case: the inner subquery references + // `t1.x` from the OUTER FROM. Without LATERAL this is invalid + // SQL, but the resolver doesn't enforce LATERAL semantics — + // it walks the scope chain regardless. 
+ assert_column_ops( + "SELECT sub.x FROM t1, LATERAL (SELECT t1.a + t2.b AS x FROM t2) sub", + ColumnOperation { + statement_kind: StatementKind::Select, + reads: vec![read("t1", "a"), read("t2", "b")], + writes: vec![], + lineage: vec![ + transformation(col("t1", "a"), out("x", 0)), + transformation(col("t2", "b"), out("x", 0)), + ], + diagnostics: vec![], + }, + ); + } + + #[test] + fn non_lateral_derived_also_resolves_outer_ref_permissively() { + // The resolver doesn't distinguish LATERAL from non-LATERAL + // — both walk the scope chain identically. This is more + // lenient than strict SQL semantics (where this should be + // an error), but reasonable for lineage purposes: a + // best-effort resolution is more useful than silently + // dropping the reference. + assert_column_ops( + "SELECT sub.x FROM t1, (SELECT t1.a + t2.b AS x FROM t2) sub", + ColumnOperation { + statement_kind: StatementKind::Select, + reads: vec![read("t1", "a"), read("t2", "b")], + writes: vec![], + lineage: vec![ + transformation(col("t1", "a"), out("x", 0)), + transformation(col("t2", "b"), out("x", 0)), + ], + diagnostics: vec![], + }, + ); + } + + #[test] + fn correlated_where_subquery_resolves_outer_ref() { + // Classic correlated subquery in WHERE: the inner SELECT + // references the outer t1.id. The resolver walks the + // scope chain to find t1.id in the outer scope. + assert_column_ops( + "SELECT a FROM t1 WHERE EXISTS (SELECT 1 FROM t2 WHERE t2.fk = t1.id)", + ColumnOperation { + statement_kind: StatementKind::Select, + reads: vec![read("t1", "a"), read("t2", "fk"), read("t1", "id")], + writes: vec![], + lineage: vec![passthrough(col("t1", "a"), out("a", 0))], + diagnostics: vec![], + }, + ); + } + } + + mod on_conflict { + //! ON CONFLICT (Postgres / SQLite) and ON DUPLICATE KEY UPDATE + //! (MySQL) both sit in `Insert.on: Option<OnInsert>`. The + //! resolver walks both, with subtle differences: + //! + //! - Postgres: `EXCLUDED.<col>` is a pseudo-table for the + //! 
would-be-inserted row. Bound as synthetic so refs + //! through it filter out of `reads` but still emit valid + //! Relation lineage edges into the target. The synthetic + //! binding's columns mirror the INSERT target's columns. + //! - MySQL: `VALUES(<col>)` is a function-call form for the + //! same concept. No EXCLUDED binding (it would make + //! unqualified refs ambiguous against the INSERT target); + //! the inner ref resolves to the INSERT target like a + //! regular self-reference. + //! + //! DO UPDATE SET targets become writes on the INSERT target + //! table — same role as a standalone UPDATE SET. The optional + //! DO UPDATE WHERE clause walks in filter context. + use super::*; + use sqlparser::dialect::{MySqlDialect, PostgreSqlDialect}; + + fn assert_column_ops_with_dialect( + sql: &str, + dialect: &dyn sqlparser::dialect::Dialect, + expected: ColumnOperation, + ) { + let actual = extract_column_operations(dialect, sql, None) + .unwrap() + .into_iter() + .next() + .unwrap_or_else(|| panic!("no statements in result for SQL: {sql}")) + .unwrap(); + assert_column_ops_inner(sql, 0, actual, expected); + } + + /// Construct a `ColumnReference` for the synthetic EXCLUDED + /// pseudo-table — used only as a Source in lineage edges, not + /// as a real table. + fn excluded(name: &str) -> ColumnReference { + ColumnReference { + table: Some(TableReference { + catalog: None, + schema: None, + name: "EXCLUDED".into(), + }), + name: name.into(), + } + } + + #[test] + fn pg_on_conflict_do_update_set_excluded_emits_lineage_and_write() { + // DO UPDATE SET b = EXCLUDED.b + // - writes: t.a, t.b from INSERT columns plus another + // t.b for the SET target. + // - reads: empty (EXCLUDED is synthetic-filtered; + // VALUES (1, 2) are literals). + // - lineage: EXCLUDED.b → Relation(t.b), Passthrough. 
+ assert_column_ops_with_dialect( + "INSERT INTO t (a, b) VALUES (1, 2) ON CONFLICT (a) DO UPDATE SET b = EXCLUDED.b", + &PostgreSqlDialect {}, + ColumnOperation { + statement_kind: StatementKind::Insert, + reads: vec![], + writes: vec![write("t", "a"), write("t", "b"), write("t", "b")], + lineage: vec![passthrough(excluded("b"), relation("t", "b"))], + diagnostics: vec![], + }, + ); + } + + #[test] + fn pg_on_conflict_do_nothing_is_indistinguishable_from_plain_insert() { + assert_column_ops_with_dialect( + "INSERT INTO t (a, b) VALUES (1, 2) ON CONFLICT (a) DO NOTHING", + &PostgreSqlDialect {}, + ColumnOperation { + statement_kind: StatementKind::Insert, + reads: vec![], + writes: vec![write("t", "a"), write("t", "b")], + lineage: vec![], + diagnostics: vec![], + }, + ); + } + + #[test] + fn pg_insert_select_with_on_conflict_composes_excluded_to_source() { + // EXCLUDED's body_projections come from the INSERT source + // renamed to the target columns positionally. So + // `EXCLUDED.b` composes through to the source's position-1 + // projection (`y` from s) — the conflict-action lineage edge + // bottoms out at the same base table as the + // source-projection lineage edge. + assert_column_ops_with_dialect( + "INSERT INTO t (a, b) SELECT x, y FROM s \ + ON CONFLICT (a) DO UPDATE SET b = EXCLUDED.b", + &PostgreSqlDialect {}, + ColumnOperation { + statement_kind: StatementKind::Insert, + reads: vec![read("s", "x"), read("s", "y")], + writes: vec![write("t", "a"), write("t", "b"), write("t", "b")], + lineage: vec![ + passthrough(col("s", "x"), relation("t", "a")), + passthrough(col("s", "y"), relation("t", "b")), + passthrough(col("s", "y"), relation("t", "b")), + ], + diagnostics: vec![], + }, + ); + } + + #[test] + fn mysql_on_duplicate_key_update_values_func_self_references_target() { + // MySQL `VALUES()` is the implicit-row form. Without + // an EXCLUDED binding, the inner `b` ref resolves to t.b + // (the INSERT target). 
Result: t.b shows up as a read + // (the VALUES function call is a value-changing wrapper) and + // the SET clause adds a Relation-target lineage edge t.b → t.b. + assert_column_ops_with_dialect( + "INSERT INTO t (a, b) VALUES (1, 2) \ + ON DUPLICATE KEY UPDATE b = VALUES(b)", + &MySqlDialect {}, + ColumnOperation { + statement_kind: StatementKind::Insert, + reads: vec![read("t", "b")], + writes: vec![write("t", "a"), write("t", "b"), write("t", "b")], + lineage: vec![transformation(col("t", "b"), relation("t", "b"))], + diagnostics: vec![], + }, + ); + } + + #[test] + fn pg_insert_union_with_on_conflict_excluded_fans_out_to_each_branch() { + // The source has TWO ProjectionGroups (one per UNION + // branch), so EXCLUDED's body_projections also have two + // groups — each with a position-0 item named after the + // INSERT target column. `EXCLUDED.a` then composes to + // BOTH branches' position-0 source refs. + assert_column_ops_with_dialect( + "INSERT INTO t (a) SELECT x FROM s1 UNION SELECT y FROM s2 \ + ON CONFLICT (a) DO UPDATE SET a = EXCLUDED.a", + &PostgreSqlDialect {}, + ColumnOperation { + statement_kind: StatementKind::Insert, + reads: vec![read("s1", "x"), read("s2", "y")], + writes: vec![write("t", "a"), write("t", "a")], + lineage: vec![ + passthrough(col("s1", "x"), relation("t", "a")), + passthrough(col("s2", "y"), relation("t", "a")), + passthrough(col("s1", "x"), relation("t", "a")), + passthrough(col("s2", "y"), relation("t", "a")), + ], + diagnostics: vec![], + }, + ); + } + + #[test] + fn pg_insert_aggregate_with_on_conflict_excluded_keeps_transformation_kind() { + // SUM(x) makes the source projection a Transformation. When + // EXCLUDED.total composes back, compose_lineage_kinds keeps the + // transforming step → lineage kind stays Transformation even on + // the conflict-action path. 
+ assert_column_ops_with_dialect( + "INSERT INTO t (total) SELECT SUM(x) FROM s \ + ON CONFLICT (id) DO UPDATE SET total = EXCLUDED.total", + &PostgreSqlDialect {}, + ColumnOperation { + statement_kind: StatementKind::Insert, + reads: vec![read("s", "x")], + writes: vec![write("t", "total"), write("t", "total")], + lineage: vec![ + transformation(col("s", "x"), relation("t", "total")), + transformation(col("s", "x"), relation("t", "total")), + ], + diagnostics: vec![], + }, + ); + } + + #[test] + fn pg_on_conflict_do_update_with_where_clause_emits_read() { + // DO UPDATE ... WHERE walks in filter context: `t.a` in the + // WHERE expression surfaces as a read but not a lineage source. + assert_column_ops_with_dialect( + "INSERT INTO t (a, b) VALUES (1, 2) \ + ON CONFLICT (a) DO UPDATE SET b = EXCLUDED.b WHERE t.a > 0", + &PostgreSqlDialect {}, + ColumnOperation { + statement_kind: StatementKind::Insert, + reads: vec![read("t", "a")], + writes: vec![write("t", "a"), write("t", "b"), write("t", "b")], + lineage: vec![passthrough(excluded("b"), relation("t", "b"))], + diagnostics: vec![], + }, + ); + } + } + + mod values_as_relation { + //! `VALUES` can stand in for a row-source in three positions: + //! - INSERT … VALUES (already covered in `lineage` / `on_conflict`) + //! - SELECT … FROM (VALUES …) AS t(x, y) — derived table + //! - WITH cte(x, y) AS (VALUES …) SELECT … — CTE body + //! + //! VALUES doesn't carry projection items the resolver can + //! capture (literals have no source refs), so lineage from these + //! variants bottom out at the synthetic binding — no + //! composition to a base table is possible. + use super::*; + + #[test] + fn values_as_derived_table_with_aliases_emits_synthetic_refs_only() { + // The derived table `t` carries schema [x, y] from the + // alias rename, but its body_projections are empty (VALUES + // contributes no ProjectionItems). 
So `t.x` is recorded as + // a synthetic ref pointing at the derived binding; reads + // filter it out, and lineage keeps `t.x` as the source + // (composition can't substitute further). + assert_column_ops( + "SELECT x, y FROM (VALUES (1, 'a'), (2, 'b')) AS t(x, y)", + ColumnOperation { + statement_kind: StatementKind::Select, + reads: vec![], + writes: vec![], + lineage: vec![ + ColumnLineageEdge { + source: ColumnReference { + table: Some(TableReference { + catalog: None, + schema: None, + name: "t".into(), + }), + name: "x".into(), + }, + target: out("x", 0), + kind: ColumnLineageKind::Passthrough, + }, + ColumnLineageEdge { + source: ColumnReference { + table: Some(TableReference { + catalog: None, + schema: None, + name: "t".into(), + }), + name: "y".into(), + }, + target: out("y", 1), + kind: ColumnLineageKind::Passthrough, + }, + ], + diagnostics: vec![], + }, + ); + } + + #[test] + fn values_as_cte_body_with_aliases_emits_synthetic_refs_only() { + assert_column_ops( + "WITH cte(id, val) AS (VALUES (1, 'a'), (2, 'b')) SELECT id FROM cte", + ColumnOperation { + statement_kind: StatementKind::Select, + reads: vec![], + writes: vec![], + lineage: vec![ColumnLineageEdge { + source: ColumnReference { + table: Some(TableReference { + catalog: None, + schema: None, + name: "cte".into(), + }), + name: "id".into(), + }, + target: out("id", 0), + kind: ColumnLineageKind::Passthrough, + }], + diagnostics: vec![], + }, + ); + } + + #[test] + fn values_with_column_ref_in_row_picks_up_outer_ref() { + // A column ref inside a VALUES row (rare in practice but + // syntactically valid) does get walked and surfaces in + // reads — the outer table `t1` is in scope of the derived + // table per the resolver's permissive scope-chain rule. 
+ assert_column_ops( + "SELECT v.x FROM t1, (VALUES (t1.a)) AS v(x)", + ColumnOperation { + statement_kind: StatementKind::Select, + reads: vec![read("t1", "a")], + writes: vec![], + lineage: vec![ColumnLineageEdge { + source: ColumnReference { + table: Some(TableReference { + catalog: None, + schema: None, + name: "v".into(), + }), + name: "x".into(), + }, + target: out("x", 0), + kind: ColumnLineageKind::Passthrough, + }], + diagnostics: vec![], + }, + ); + } + } + + mod alter_table { + //! ALTER TABLE produces column-level writes for column-naming + //! operations: ADD COLUMN, DROP COLUMN, RENAME COLUMN, CHANGE + //! COLUMN, MODIFY COLUMN, ALTER COLUMN. RENAME / CHANGE surface + //! BOTH the old and new names — both ends of the rename are + //! useful for downstream lineage consumers tracking column + //! history. Schema-level operations (constraints, partitions, + //! RENAME TABLE) contribute no column writes. + use super::*; + + #[test] + fn alter_table_add_column_emits_write() { + assert_column_ops( + "ALTER TABLE t ADD COLUMN c INT", + ColumnOperation { + statement_kind: StatementKind::AlterTable, + reads: vec![], + writes: vec![write("t", "c")], + lineage: vec![], + diagnostics: vec![], + }, + ); + } + + #[test] + fn alter_table_drop_column_emits_write() { + assert_column_ops( + "ALTER TABLE t DROP COLUMN c", + ColumnOperation { + statement_kind: StatementKind::AlterTable, + reads: vec![], + writes: vec![write("t", "c")], + lineage: vec![], + diagnostics: vec![], + }, + ); + } + + #[test] + fn alter_table_rename_column_emits_both_old_and_new() { + // RENAME moves data from old to new; surface both for + // downstream consumers tracking column history. 
+ assert_column_ops( + "ALTER TABLE t RENAME COLUMN a TO b", + ColumnOperation { + statement_kind: StatementKind::AlterTable, + reads: vec![], + writes: vec![write("t", "a"), write("t", "b")], + lineage: vec![], + diagnostics: vec![], + }, + ); + } + + #[test] + fn alter_table_alter_column_emits_write_for_target_column() { + assert_column_ops( + "ALTER TABLE t ALTER COLUMN a SET NOT NULL", + ColumnOperation { + statement_kind: StatementKind::AlterTable, + reads: vec![], + writes: vec![write("t", "a")], + lineage: vec![], + diagnostics: vec![], + }, + ); + } + + #[test] + fn alter_table_multiple_ops_collects_all_target_columns() { + // sqlparser parses multi-op ALTER as a single statement + // with `operations: Vec<AlterTableOperation>`. + assert_column_ops( + "ALTER TABLE t ADD COLUMN c INT, DROP COLUMN d", + ColumnOperation { + statement_kind: StatementKind::AlterTable, + reads: vec![], + writes: vec![write("t", "c"), write("t", "d")], + lineage: vec![], + diagnostics: vec![], + }, + ); + } + + #[test] + fn alter_table_add_constraint_emits_no_column_writes() { + // AddConstraint is schema-level — no column-level writes + // surface (the table itself stays in table_op writes). + assert_column_ops( + "ALTER TABLE t ADD CONSTRAINT uq UNIQUE (a)", + ColumnOperation { + statement_kind: StatementKind::AlterTable, + reads: vec![], + writes: vec![], + lineage: vec![], + diagnostics: vec![], + }, + ); + } + } + + mod returning { + //! `RETURNING <select-items>` on INSERT / UPDATE / DELETE + //! (Postgres / SQLite extension) projects from the affected + //! rows of the target table — treated like a top-level SELECT + //! projection: each item contributes refs to `reads` and a + //! `QueryOutput` lineage edge. Walked BEFORE the ON-clause for + //! INSERT so any EXCLUDED binding doesn't introduce ambiguity for + //! unqualified refs that collide with INSERT column names. 
+ use super::*; + + #[test] + fn insert_values_with_returning_emits_target_reads_and_query_output() { + assert_column_ops( + "INSERT INTO t (a, b) VALUES (1, 2) RETURNING id", + ColumnOperation { + statement_kind: StatementKind::Insert, + reads: vec![read("t", "id")], + writes: vec![write("t", "a"), write("t", "b")], + lineage: vec![passthrough(col("t", "id"), out("id", 0))], + diagnostics: vec![], + }, + ); + } + + #[test] + fn returning_aliased_uses_alias_as_output_name() { + assert_column_ops( + "INSERT INTO t (a) VALUES (1) RETURNING id AS pk", + ColumnOperation { + statement_kind: StatementKind::Insert, + reads: vec![read("t", "id")], + writes: vec![write("t", "a")], + lineage: vec![passthrough(col("t", "id"), out("pk", 0))], + diagnostics: vec![], + }, + ); + } + + #[test] + fn returning_with_expression_marks_kind_transformation() { + assert_column_ops( + "INSERT INTO t (a) VALUES (1) RETURNING id + 1 AS bumped", + ColumnOperation { + statement_kind: StatementKind::Insert, + reads: vec![read("t", "id")], + writes: vec![write("t", "a")], + lineage: vec![transformation(col("t", "id"), out("bumped", 0))], + diagnostics: vec![], + }, + ); + } + + #[test] + fn returning_wildcard_records_wildcard_suppressed_diagnostic() { + assert_column_ops( + "INSERT INTO t (a) VALUES (1) RETURNING *", + ColumnOperation { + statement_kind: StatementKind::Insert, + reads: vec![], + writes: vec![write("t", "a")], + lineage: vec![], + diagnostics: vec![diag(ColumnLevelDiagnosticKind::WildcardSuppressed)], + }, + ); + } + + #[test] + fn update_returning_walks_target_columns() { + assert_column_ops( + "UPDATE t SET a = b + 1 WHERE id = 5 RETURNING id, a", + ColumnOperation { + statement_kind: StatementKind::Update, + reads: vec![ + read("t", "b"), + read("t", "id"), + read("t", "id"), + read("t", "a"), + ], + writes: vec![write("t", "a")], + lineage: vec![ + transformation(col("t", "b"), relation("t", "a")), + passthrough(col("t", "id"), out("id", 0)), + passthrough(col("t", "a"), 
out("a", 1)), + ], + diagnostics: vec![], + }, + ); + } + + #[test] + fn delete_returning_walks_target_columns() { + assert_column_ops( + "DELETE FROM t WHERE id = 5 RETURNING id, val", + ColumnOperation { + statement_kind: StatementKind::Delete, + reads: vec![read("t", "id"), read("t", "id"), read("t", "val")], + writes: vec![], + lineage: vec![ + passthrough(col("t", "id"), out("id", 0)), + passthrough(col("t", "val"), out("val", 1)), + ], + diagnostics: vec![], + }, + ); + } + + #[test] + fn insert_select_with_returning_keeps_source_lineage_and_target_returning() { + // Source SELECT's tables are out of scope by the time + // RETURNING walks (their nested scope was popped after + // resolve_query). So RETURNING refs resolve to the target + // table alone, even when the bare name `id` exists in the + // source too. + assert_column_ops( + "INSERT INTO t (a) SELECT x FROM s RETURNING id", + ColumnOperation { + statement_kind: StatementKind::Insert, + reads: vec![read("s", "x"), read("t", "id")], + writes: vec![write("t", "a")], + lineage: vec![ + passthrough(col("s", "x"), relation("t", "a")), + passthrough(col("t", "id"), out("id", 0)), + ], + diagnostics: vec![], + }, + ); + } + } + + mod catalog_strict { + use super::*; + use crate::catalog::{Catalog, ColumnSchema}; + use sqlparser::ast::Ident; + use std::collections::HashMap; + + #[derive(Debug, Default)] + struct TestCatalog { + tables: HashMap<String, Vec<&'static str>>, + } + + impl TestCatalog { + fn with(mut self, name: &str, cols: Vec<&'static str>) -> Self { + self.tables.insert(name.to_string(), cols); + self + } + } + + impl Catalog for TestCatalog { + fn columns(&self, table: &TableReference) -> Option<Vec<ColumnSchema>> { + self.tables.get(table.name.value.as_str()).map(|cols| { + cols.iter() + .map(|c| ColumnSchema { + name: c.to_string(), + }) + .collect() + }) + } + } + + fn assert_column_ops_with_catalog( + sql: &str, + catalog: &dyn Catalog, + expected: ColumnOperation, + ) { + let actual = extract_column_operations(&GenericDialect {}, 
sql, Some(catalog)) + .unwrap() + .into_iter() + .next() + .unwrap() + .unwrap(); + assert_column_ops_inner(sql, 0, actual, expected); + } + + #[test] + fn catalog_known_schema_rejects_columns_not_in_table() { + // Without catalog `SELECT a FROM t1` resolves a → t1.a + // unconditionally (single Unknown binding heuristic). With + // a catalog that says t1's columns are [x, y], `a` cannot + // come from t1 — it surfaces as unresolved and fires + // UnresolvedColumn. + let catalog = TestCatalog::default().with("t1", vec!["x", "y"]); + assert_column_ops_with_catalog( + "SELECT a FROM t1", + &catalog, + ColumnOperation { + statement_kind: StatementKind::Select, + reads: vec![unresolved("a")], + writes: vec![], + lineage: vec![ColumnLineageEdge { + source: ColumnReference { + table: None, + name: "a".into(), + }, + target: out("a", 0), + kind: ColumnLineageKind::Passthrough, + }], + diagnostics: vec![diag(ColumnLevelDiagnosticKind::UnresolvedColumn)], + }, + ); + } + + #[test] + fn catalog_known_schema_resolves_columns_present_in_table() { + let catalog = TestCatalog::default().with("t1", vec!["a", "b"]); + assert_column_ops_with_catalog( + "SELECT a FROM t1", + &catalog, + ColumnOperation { + statement_kind: StatementKind::Select, + reads: vec![read("t1", "a")], + writes: vec![], + lineage: vec![passthrough(col("t1", "a"), out("a", 0))], + diagnostics: vec![], + }, + ); + } + + #[test] + fn catalog_resolves_unquoted_ref_case_insensitively() { + // The catalog declares `id` (lowercase); an unquoted `ID` + // folds to the same key, so it resolves to t1. The column + // name surfaces as written (`ID`) — folding governs matching, + // not the surfaced identity. 
+ let catalog = TestCatalog::default().with("t1", vec!["id"]); + assert_column_ops_with_catalog( + "SELECT ID FROM t1", + &catalog, + ColumnOperation { + statement_kind: StatementKind::Select, + reads: vec![read("t1", "ID")], + writes: vec![], + lineage: vec![passthrough(col("t1", "ID"), out("ID", 0))], + diagnostics: vec![], + }, + ); + } + + #[test] + fn catalog_does_not_match_quoted_ref_against_unquoted_column() { + // A quoted `"ID"` matches exactly (case-sensitive), so it does + // not match the catalog's `id`; it stays unresolved and fires + // UnresolvedColumn. Placed in WHERE so it is a read but not a + // lineage source. + let catalog = TestCatalog::default().with("t1", vec!["a", "id"]); + assert_column_ops_with_catalog( + r#"SELECT a FROM t1 WHERE "ID" > 0"#, + &catalog, + ColumnOperation { + statement_kind: StatementKind::Select, + reads: vec![ + read("t1", "a"), + ColumnReference { + table: None, + name: Ident::with_quote('"', "ID"), + }, + ], + writes: vec![], + lineage: vec![passthrough(col("t1", "a"), out("a", 0))], + diagnostics: vec![diag(ColumnLevelDiagnosticKind::UnresolvedColumn)], + }, + ); + } + + #[test] + fn catalog_insert_without_explicit_columns_pairs_via_catalog_schema() { + // INSERT INTO t SELECT a, b FROM s — no explicit column + // list. With t = [x, y, z] in catalog, the resolver pairs + // source projections positionally (s.a → t.x, s.b → t.y). + // Unpaired catalog cols (z) get no lineage / no write. 
+ let catalog = TestCatalog::default().with("t", vec!["x", "y", "z"]); + assert_column_ops_with_catalog( + "INSERT INTO t SELECT a, b FROM s", + &catalog, + ColumnOperation { + statement_kind: StatementKind::Insert, + reads: vec![read("s", "a"), read("s", "b")], + writes: vec![write("t", "x"), write("t", "y")], + lineage: vec![ + passthrough(col("s", "a"), relation("t", "x")), + passthrough(col("s", "b"), relation("t", "y")), + ], + diagnostics: vec![], + }, + ); + } + + #[test] + fn catalog_insert_without_explicit_columns_source_longer_than_target() { + // 3 source projections vs t = [x, y] — pair what fits, + // surplus source column gets no lineage. + let catalog = TestCatalog::default().with("t", vec!["x", "y"]); + assert_column_ops_with_catalog( + "INSERT INTO t SELECT a, b, c FROM s", + &catalog, + ColumnOperation { + statement_kind: StatementKind::Insert, + reads: vec![read("s", "a"), read("s", "b"), read("s", "c")], + writes: vec![write("t", "x"), write("t", "y")], + lineage: vec![ + passthrough(col("s", "a"), relation("t", "x")), + passthrough(col("s", "b"), relation("t", "y")), + ], + diagnostics: vec![], + }, + ); + } + + #[test] + fn catalog_insert_explicit_columns_override_catalog_schema() { + // Explicit (q) wins over catalog [x, y, z]. + let catalog = TestCatalog::default().with("t", vec!["x", "y", "z"]); + assert_column_ops_with_catalog( + "INSERT INTO t (q) SELECT a FROM s", + &catalog, + ColumnOperation { + statement_kind: StatementKind::Insert, + reads: vec![read("s", "a")], + writes: vec![write("t", "q")], + lineage: vec![passthrough(col("s", "a"), relation("t", "q"))], + diagnostics: vec![], + }, + ); + } + + #[test] + fn catalog_merge_not_matched_insert_no_cols_pairs_via_catalog() { + // Same catalog fallback applies to MERGE's INSERT clause: + // lineage is paired via catalog. 
Surprise surfaced by whole- + // value compare: writes stay empty for catalog-paired MERGE + // INSERT — only `INSERT (cols) VALUES (...)` with an + // explicit column list populates writes. + let catalog = TestCatalog::default().with("t", vec!["id", "a"]); + assert_column_ops_with_catalog( + "MERGE INTO t USING s ON t.id = s.id \ + WHEN NOT MATCHED THEN INSERT VALUES (s.id, s.a)", + &catalog, + ColumnOperation { + statement_kind: StatementKind::Merge, + reads: vec![ + read("t", "id"), + read("s", "id"), + read("s", "id"), + read("s", "a"), + ], + writes: vec![], + lineage: vec![ + passthrough(col("s", "id"), relation("t", "id")), + passthrough(col("s", "a"), relation("t", "a")), + ], + diagnostics: vec![], + }, + ); + } + + #[test] + fn catalog_disambiguates_join_unqualified_ref() { + // Both tables are Known via catalog; only t2 has `a`, so + // unqualified `a` in `t1 JOIN t2` resolves to t2 (no + // catalog: same SQL would be ambiguous). + let catalog = TestCatalog::default() + .with("t1", vec!["id"]) + .with("t2", vec!["id", "a"]); + assert_column_ops_with_catalog( + "SELECT a FROM t1 JOIN t2 ON t1.id = t2.id", + &catalog, + ColumnOperation { + statement_kind: StatementKind::Select, + reads: vec![read("t1", "id"), read("t2", "id"), read("t2", "a")], + writes: vec![], + lineage: vec![passthrough(col("t2", "a"), out("a", 0))], + diagnostics: vec![], + }, + ); + } + + #[test] + fn catalog_confirmed_ambiguity_reports_diagnostic() { + // Both tables Known and both declare `a`. ColumnLevelDiagnostic must + // fire — without catalog the same query is silently + // ambiguous (no diagnostic) since Unknown schemas could + // contain anything. assert_column_ops compares diagnostics + // by kind only; the message-content checks are kept inline + // since they're this test's specific purpose. 
+ let catalog = TestCatalog::default() + .with("t1", vec!["a"]) + .with("t2", vec!["a"]); + assert_column_ops_with_catalog( + "SELECT a FROM t1 JOIN t2 ON t1.a = t2.a", + &catalog, + ColumnOperation { + statement_kind: StatementKind::Select, + reads: vec![read("t1", "a"), read("t2", "a"), unresolved("a")], + writes: vec![], + lineage: vec![ColumnLineageEdge { + source: ColumnReference { + table: None, + name: "a".into(), + }, + target: out("a", 0), + kind: ColumnLineageKind::Passthrough, + }], + diagnostics: vec![diag(ColumnLevelDiagnosticKind::AmbiguousColumn)], + }, + ); + // Specific message-content checks for this test's purpose. + let ops = extract_column_operations( + &GenericDialect {}, + "SELECT a FROM t1 JOIN t2 ON t1.a = t2.a", + Some(&catalog), + ) + .unwrap(); + let ops = ops.into_iter().next().unwrap().unwrap(); + let amb = ops + .diagnostics + .iter() + .find(|d| matches!(d.kind, ColumnLevelDiagnosticKind::AmbiguousColumn)) + .expect("AmbiguousColumn must fire"); + assert!(amb.message.contains("ambiguous column `a`")); + assert!(amb.message.contains("t1")); + assert!(amb.message.contains("t2")); + } + + #[test] + fn catalog_unresolved_unqualified_reports_diagnostic() { + // Catalog says t1 has [x, y]; unqualified `z` belongs to + // nothing in scope — UnresolvedColumn fires. + let catalog = TestCatalog::default().with("t1", vec!["x", "y"]); + assert_column_ops_with_catalog( + "SELECT z FROM t1", + &catalog, + ColumnOperation { + statement_kind: StatementKind::Select, + reads: vec![unresolved("z")], + writes: vec![], + lineage: vec![ColumnLineageEdge { + source: ColumnReference { + table: None, + name: "z".into(), + }, + target: out("z", 0), + kind: ColumnLineageKind::Passthrough, + }], + diagnostics: vec![diag(ColumnLevelDiagnosticKind::UnresolvedColumn)], + }, + ); + // Message-content check for this test's purpose. 
+ let ops = + extract_column_operations(&GenericDialect {}, "SELECT z FROM t1", Some(&catalog)) + .unwrap(); + let ops = ops.into_iter().next().unwrap().unwrap(); + let unr = ops + .diagnostics + .iter() + .find(|d| matches!(d.kind, ColumnLevelDiagnosticKind::UnresolvedColumn)) + .expect("UnresolvedColumn must fire"); + assert!(unr.message.contains("unresolved column `z`")); + } + + #[test] + fn no_catalog_unqualified_is_silent_even_when_ambiguous_shape() { + // No catalog → all schemas are Unknown → resolver can't + // tell whether `a` is genuinely in both t1 and t2, only one, + // or neither. Two diagnostic kinds are intentionally + // suppressed in this mode: AmbiguousColumn (no confirmed + // matches) and UnresolvedColumn (no Known schemas in scope). + // The resolution itself still returns None for the column, + // and the lineage source is also unresolved. + assert_column_ops( + "SELECT a FROM t1 JOIN t2 ON t1.id = t2.id", + ColumnOperation { + statement_kind: StatementKind::Select, + reads: vec![read("t1", "id"), read("t2", "id"), unresolved("a")], + writes: vec![], + lineage: vec![ColumnLineageEdge { + source: ColumnReference { + table: None, + name: "a".into(), + }, + target: out("a", 0), + kind: ColumnLineageKind::Passthrough, + }], + diagnostics: vec![], + }, + ); + } + } +} diff --git a/sql-insight/src/extractor/crud_table_extractor.rs b/sql-insight/src/extractor/crud_table_extractor.rs index b962a9e..b05e5f0 100644 --- a/sql-insight/src/extractor/crud_table_extractor.rs +++ b/sql-insight/src/extractor/crud_table_extractor.rs @@ -3,12 +3,12 @@ //! See [`extract_crud_tables`](crate::extract_crud_tables()) as the entry point for extracting CRUD tables from SQL. 
use std::fmt; -use std::ops::ControlFlow; +use crate::diagnostic::TableLevelDiagnostic; use crate::error::Error; -use crate::extractor::table_extractor::TableReference; -use crate::{helper, TableExtractor}; -use sqlparser::ast::{Delete, MergeAction, Statement, Visit, Visitor}; +use crate::reference::TableReference; +use crate::{StatementKind, TableOperationExtractor}; +use sqlparser::ast::{MergeAction, Statement}; use sqlparser::dialect::Dialect; use sqlparser::parser::Parser; @@ -39,6 +39,10 @@ pub struct CrudTables { pub read_tables: Vec, pub update_tables: Vec, pub delete_tables: Vec, + /// Non-fatal diagnostics, forwarded from the underlying table-level + /// extraction (only [`UnsupportedStatement`](crate::TableLevelDiagnosticKind::UnsupportedStatement) + /// arises at this granularity). + pub diagnostics: Vec, } impl fmt::Display for CrudTables { @@ -65,106 +69,12 @@ impl CrudTables { } } -/// A visitor to extract CRUD tables from SQL. +/// Extracts CRUD tables from SQL. A thin shim over +/// [`TableOperationExtractor`] that buckets `reads`/`writes` into the +/// CRUD positions and consults the AST only for MERGE clauses (whose +/// target placement depends on WHEN actions). 
#[derive(Default, Debug)] -pub struct CrudTableExtractor { - create_tables: Vec, - read_tables: Vec, - update_tables: Vec, - delete_tables: Vec, - possibly_aliased_delete_tables: Vec, -} - -impl Visitor for CrudTableExtractor { - type Break = Error; - - fn pre_visit_statement(&mut self, statement: &Statement) -> ControlFlow { - match statement { - Statement::Insert(insert) => { - match TableReference::try_from(insert) { - Ok(table) => self.create_tables.push(table), - Err(e) => return ControlFlow::Break(e), - } - self.read_tables = helper::calc_difference_of_tables( - self.read_tables.clone(), - self.create_tables.clone(), - ); - } - Statement::Update(update) => { - match TableExtractor::extract_from_table_node(&update.table) { - Ok(tables) => tables - .0 - .into_iter() - .for_each(|table| self.update_tables.push(table)), - Err(e) => return ControlFlow::Break(e), - } - self.read_tables = helper::calc_difference_of_tables( - self.read_tables.clone(), - self.update_tables.clone(), - ); - } - Statement::Delete(Delete { tables, from, .. }) => { - // When tables are present, deletion sqls are these tables, - // and from clause is used as a data source. 
- if !tables.is_empty() { - for table in tables { - match TableReference::try_from(table) { - Ok(table) => self.possibly_aliased_delete_tables.push(table), - Err(e) => return ControlFlow::Break(e), - } - } - } else { - let from = match from { - sqlparser::ast::FromTable::WithFromKeyword(items) => items, - sqlparser::ast::FromTable::WithoutKeyword(items) => items, - }; - for table_with_join in from { - match TableExtractor::extract_from_table_node(table_with_join) { - Ok(tables) => tables - .0 - .into_iter() - .for_each(|table| self.possibly_aliased_delete_tables.push(table)), - Err(e) => return ControlFlow::Break(e), - } - } - } - self.delete_tables = helper::resolve_aliased_tables( - self.possibly_aliased_delete_tables.clone(), - self.read_tables.clone(), - ); - self.read_tables = helper::calc_difference_of_tables( - self.read_tables.clone(), - self.delete_tables.clone(), - ); - } - Statement::Merge(merge) => { - let target_table = match TableReference::try_from(&merge.table) { - Ok(table) => table, - Err(e) => return ControlFlow::Break(e), - }; - let (mut inserted, mut updated, mut deleted) = (false, false, false); - merge.clauses.iter().for_each(|clause| match clause.action { - MergeAction::Update { .. } => updated = true, - MergeAction::Delete { .. } => deleted = true, - MergeAction::Insert(_) => inserted = true, - }); - if inserted { - self.create_tables.push(target_table.clone()); - } - if updated { - self.update_tables.push(target_table.clone()); - } - if deleted { - self.delete_tables.push(target_table.clone()); - } - self.read_tables = - helper::calc_difference_of_tables(self.read_tables.clone(), vec![target_table]); - } - _ => {} - } - ControlFlow::Continue(()) - } -} +pub struct CrudTableExtractor; impl CrudTableExtractor { /// Extract CRUD tables from SQL. 
@@ -173,27 +83,72 @@ impl CrudTableExtractor { sql: &str, ) -> Result>, Error> { let statements = Parser::parse_sql(dialect, sql)?; - let results = statements + Ok(statements .iter() .map(Self::extract_from_statement) - .collect::>>(); - Ok(results) + .collect()) } fn extract_from_statement(statement: &Statement) -> Result { - let mut visitor = CrudTableExtractor { - read_tables: TableExtractor::extract_from_statement(statement)?.0, + let ops = TableOperationExtractor::extract_from_statement(statement, None)?; + let reads = ops.reads; + let writes = ops.writes; + let diagnostics = ops.diagnostics; + + let mut crud = CrudTables { + diagnostics, ..Default::default() }; - match statement.visit(&mut visitor) { - ControlFlow::Break(e) => Err(e), - ControlFlow::Continue(()) => Ok(CrudTables { - create_tables: visitor.create_tables, - read_tables: visitor.read_tables, - update_tables: visitor.update_tables, - delete_tables: visitor.delete_tables, - }), + match ops.statement_kind { + StatementKind::Insert => { + crud.create_tables = writes; + crud.read_tables = reads; + } + StatementKind::Update => { + crud.update_tables = writes; + crud.read_tables = reads; + } + StatementKind::Delete => { + crud.delete_tables = writes; + crud.read_tables = reads; + } + StatementKind::Merge => { + // MERGE target placement depends on which WHEN actions + // appear; reach into the AST for that one detail. The + // source comes from `reads` directly. + if let Statement::Merge(merge) = statement { + let (mut inserted, mut updated, mut deleted) = (false, false, false); + for clause in &merge.clauses { + match &clause.action { + MergeAction::Insert(_) => inserted = true, + MergeAction::Update { .. } => updated = true, + MergeAction::Delete { .. 
} => deleted = true, + } + } + for target in &writes { + if inserted { + crud.create_tables.push(target.clone()); + } + if updated { + crud.update_tables.push(target.clone()); + } + if deleted { + crud.delete_tables.push(target.clone()); + } + } + } + crud.read_tables = reads; + } + // SELECT, CreateTable, CreateView, AlterTable, AlterView, + // Drop, Truncate, Unsupported — every touched table goes to + // read_tables, matching the legacy catch-all behavior. + _ => { + crud.read_tables = reads; + crud.read_tables.extend(writes); + } } + + Ok(crud) } } @@ -215,111 +170,120 @@ mod tests { } } - #[test] - fn test_single_statement() { - let sql = "SELECT a FROM t1"; - let expected = vec![Ok(CrudTables { - create_tables: vec![], - read_tables: vec![TableReference { - catalog: None, - schema: None, - name: "t1".into(), - alias: None, - }], - update_tables: vec![], - delete_tables: vec![], - })]; - assert_crud_table_extraction(sql, expected, all_dialects()); + fn table(name: &str) -> TableReference { + TableReference { + catalog: None, + schema: None, + name: name.into(), + } } - #[test] - fn test_multiple_statements() { - let sql = "SELECT a FROM t1; SELECT b FROM t2"; - let expected = vec![ - Ok(CrudTables { + fn catalog_schema_table(catalog: &str, schema: &str, name: &str) -> TableReference { + TableReference { + catalog: Some(catalog.into()), + schema: Some(schema.into()), + name: name.into(), + } + } + + mod basic { + use super::*; + + #[test] + fn test_single_statement() { + let sql = "SELECT a FROM t1"; + let expected = vec![Ok(CrudTables { create_tables: vec![], - read_tables: vec![TableReference { - catalog: None, - schema: None, - name: "t1".into(), - alias: None, - }], + read_tables: vec![table("t1")], update_tables: vec![], delete_tables: vec![], - }), - Ok(CrudTables { + diagnostics: vec![], + })]; + assert_crud_table_extraction(sql, expected, all_dialects()); + } + + #[test] + fn test_multiple_statements() { + let sql = "SELECT a FROM t1; SELECT b FROM t2"; 
+ let expected = vec![ + Ok(CrudTables { + create_tables: vec![], + read_tables: vec![table("t1")], + update_tables: vec![], + delete_tables: vec![], + diagnostics: vec![], + }), + Ok(CrudTables { + create_tables: vec![], + read_tables: vec![table("t2")], + update_tables: vec![], + delete_tables: vec![], + diagnostics: vec![], + }), + ]; + assert_crud_table_extraction(sql, expected, all_dialects()); + } + + #[test] + fn test_statement_with_alias() { + let sql = "SELECT a FROM t1 AS t1_alias"; + let expected = vec![Ok(CrudTables { create_tables: vec![], - read_tables: vec![TableReference { - catalog: None, - schema: None, - name: "t2".into(), - alias: None, - }], + read_tables: vec![table("t1")], update_tables: vec![], delete_tables: vec![], - }), - ]; - assert_crud_table_extraction(sql, expected, all_dialects()); - } + diagnostics: vec![], + })]; + assert_crud_table_extraction(sql, expected, all_dialects()); + } - #[test] - fn test_statement_with_alias() { - let sql = "SELECT a FROM t1 AS t1_alias"; - let expected = vec![Ok(CrudTables { - create_tables: vec![], - read_tables: vec![TableReference { - catalog: None, - schema: None, - name: "t1".into(), - alias: Some("t1_alias".into()), - }], - update_tables: vec![], - delete_tables: vec![], - })]; - assert_crud_table_extraction(sql, expected, all_dialects()); - } + #[test] + fn test_statement_with_table_identifier() { + let sql = "SELECT a FROM catalog.schema.table"; + let expected = vec![Ok(CrudTables { + create_tables: vec![], + read_tables: vec![catalog_schema_table("catalog", "schema", "table")], + update_tables: vec![], + delete_tables: vec![], + diagnostics: vec![], + })]; + assert_crud_table_extraction(sql, expected, all_dialects()); + } - #[test] - fn test_statement_with_table_identifier() { - let sql = "SELECT a FROM catalog.schema.table"; - let expected = vec![Ok(CrudTables { - create_tables: vec![], - read_tables: vec![TableReference { - catalog: Some("catalog".into()), - schema: Some("schema".into()), - 
name: "table".into(), - alias: None, - }], - update_tables: vec![], - delete_tables: vec![], - })]; - assert_crud_table_extraction(sql, expected, all_dialects()); - } + #[test] + fn test_statement_with_table_identifier_and_alias() { + let sql = "SELECT a FROM catalog.schema.table AS table_alias"; + let expected = vec![Ok(CrudTables { + create_tables: vec![], + read_tables: vec![catalog_schema_table("catalog", "schema", "table")], + update_tables: vec![], + delete_tables: vec![], + diagnostics: vec![], + })]; + assert_crud_table_extraction(sql, expected, all_dialects()); + } - #[test] - fn test_statement_with_table_identifier_and_alias() { - let sql = "SELECT a FROM catalog.schema.table AS table_alias"; - let expected = vec![Ok(CrudTables { - create_tables: vec![], - read_tables: vec![TableReference { - catalog: Some("catalog".into()), - schema: Some("schema".into()), - name: "table".into(), - alias: Some("table_alias".into()), - }], - update_tables: vec![], - delete_tables: vec![], - })]; - assert_crud_table_extraction(sql, expected, all_dialects()); - } + #[test] + fn test_statement_with_cte() { + let sql = "WITH t2 AS (SELECT id FROM t1) SELECT * FROM t2"; + let expected = vec![Ok(CrudTables { + create_tables: vec![], + read_tables: vec![table("t1")], + update_tables: vec![], + delete_tables: vec![], + diagnostics: vec![], + })]; + assert_crud_table_extraction(sql, expected, all_dialects()); + } - #[test] - fn test_statement_error_with_too_many_identifiers() { - let sql = "INSERT INTO catalog.schema.table.extra (a) VALUES (1)"; - let expected = vec![Err(Error::AnalysisError( - "Too many identifiers provided".to_string(), - ))]; - assert_crud_table_extraction(sql, expected, all_dialects()); + #[test] + fn test_statement_error_with_too_many_identifiers() { + let sql = "INSERT INTO catalog.schema.table.extra (a) VALUES (1)"; + let expected = vec![Err(Error::AnalysisError( + "Too many identifiers provided".to_string(), + ))]; + assert_crud_table_extraction(sql, 
expected, all_dialects()); + } } mod delete_statement { @@ -334,12 +298,8 @@ mod tests { create_tables: vec![], read_tables: vec![], update_tables: vec![], - delete_tables: vec![TableReference { - catalog: None, - schema: None, - name: "t1".into(), - alias: None, - }], + delete_tables: vec![table("t1")], + diagnostics: vec![], })]; assert_crud_table_extraction(sql, expected, all_dialects()); } @@ -351,12 +311,8 @@ mod tests { create_tables: vec![], read_tables: vec![], update_tables: vec![], - delete_tables: vec![TableReference { - catalog: Some("catalog".into()), - schema: Some("schema".into()), - name: "t1".into(), - alias: None, - }], + delete_tables: vec![catalog_schema_table("catalog", "schema", "t1")], + diagnostics: vec![], })]; assert_crud_table_extraction(sql, expected, all_dialects()); } @@ -368,12 +324,8 @@ mod tests { create_tables: vec![], read_tables: vec![], update_tables: vec![], - delete_tables: vec![TableReference { - catalog: None, - schema: None, - name: "t1".into(), - alias: Some("t1_alias".into()), - }], + delete_tables: vec![table("t1")], + diagnostics: vec![], })]; assert_crud_table_extraction(sql, expected, all_dialects()); } @@ -383,41 +335,10 @@ mod tests { let sql = "DELETE t1, t2 FROM t1 INNER JOIN t2 INNER JOIN t3"; let expected = vec![Ok(CrudTables { create_tables: vec![], - read_tables: vec![ - TableReference { - catalog: None, - schema: None, - name: "t1".into(), - alias: None, - }, - TableReference { - catalog: None, - schema: None, - name: "t2".into(), - alias: None, - }, - TableReference { - catalog: None, - schema: None, - name: "t3".into(), - alias: None, - }, - ], + read_tables: vec![table("t1"), table("t2"), table("t3")], update_tables: vec![], - delete_tables: vec![ - TableReference { - catalog: None, - schema: None, - name: "t1".into(), - alias: None, - }, - TableReference { - catalog: None, - schema: None, - name: "t2".into(), - alias: None, - }, - ], + delete_tables: vec![table("t1"), table("t2")], + diagnostics: vec![], 
})]; // BigQuery and Generic do not support DELETE ... FROM assert_crud_table_extraction( @@ -433,41 +354,10 @@ mod tests { "DELETE t1_alias, t2_alias FROM t1 AS t1_alias INNER JOIN t2 AS t2_alias INNER JOIN t3"; let expected = vec![Ok(CrudTables { create_tables: vec![], - read_tables: vec![ - TableReference { - catalog: None, - schema: None, - name: "t1".into(), - alias: Some("t1_alias".into()), - }, - TableReference { - catalog: None, - schema: None, - name: "t2".into(), - alias: Some("t2_alias".into()), - }, - TableReference { - catalog: None, - schema: None, - name: "t3".into(), - alias: None, - }, - ], + read_tables: vec![table("t1"), table("t2"), table("t3")], update_tables: vec![], - delete_tables: vec![ - TableReference { - catalog: None, - schema: None, - name: "t1".into(), - alias: Some("t1_alias".into()), - }, - TableReference { - catalog: None, - schema: None, - name: "t2".into(), - alias: Some("t2_alias".into()), - }, - ], + delete_tables: vec![table("t1"), table("t2")], + diagnostics: vec![], })]; // BigQuery and Generic do not support DELETE ... 
FROM assert_crud_table_extraction( @@ -482,41 +372,10 @@ mod tests { let sql = "DELETE FROM t1, t2 USING t1 INNER JOIN t2 INNER JOIN t3"; let expected = vec![Ok(CrudTables { create_tables: vec![], - read_tables: vec![ - TableReference { - catalog: None, - schema: None, - name: "t1".into(), - alias: None, - }, - TableReference { - catalog: None, - schema: None, - name: "t2".into(), - alias: None, - }, - TableReference { - catalog: None, - schema: None, - name: "t3".into(), - alias: None, - }, - ], + read_tables: vec![table("t1"), table("t2"), table("t3")], update_tables: vec![], - delete_tables: vec![ - TableReference { - catalog: None, - schema: None, - name: "t1".into(), - alias: None, - }, - TableReference { - catalog: None, - schema: None, - name: "t2".into(), - alias: None, - }, - ], + delete_tables: vec![table("t1"), table("t2")], + diagnostics: vec![], })]; assert_crud_table_extraction(sql, expected, all_dialects()); } @@ -526,41 +385,10 @@ mod tests { let sql = "DELETE FROM t1_alias, t2_alias USING t1 AS t1_alias INNER JOIN t2 AS t2_alias INNER JOIN t3"; let expected = vec![Ok(CrudTables { create_tables: vec![], - read_tables: vec![ - TableReference { - catalog: None, - schema: None, - name: "t1".into(), - alias: Some("t1_alias".into()), - }, - TableReference { - catalog: None, - schema: None, - name: "t2".into(), - alias: Some("t2_alias".into()), - }, - TableReference { - catalog: None, - schema: None, - name: "t3".into(), - alias: None, - }, - ], + read_tables: vec![table("t1"), table("t2"), table("t3")], update_tables: vec![], - delete_tables: vec![ - TableReference { - catalog: None, - schema: None, - name: "t1".into(), - alias: Some("t1_alias".into()), - }, - TableReference { - catalog: None, - schema: None, - name: "t2".into(), - alias: Some("t2_alias".into()), - }, - ], + delete_tables: vec![table("t1"), table("t2")], + diagnostics: vec![], })]; assert_crud_table_extraction(sql, expected, all_dialects()); } @@ -573,15 +401,11 @@ mod tests { fn 
test_insert_statement() { let sql = "INSERT INTO t1 (a) VALUES (1)"; let expected = vec![Ok(CrudTables { - create_tables: vec![TableReference { - catalog: None, - schema: None, - name: "t1".into(), - alias: None, - }], + create_tables: vec![table("t1")], read_tables: vec![], update_tables: vec![], delete_tables: vec![], + diagnostics: vec![], })]; assert_crud_table_extraction(sql, expected, all_dialects()); } @@ -590,34 +414,17 @@ mod tests { fn test_insert_select_statement() { let sql = "INSERT INTO t1 (a) SELECT a FROM t2 AS t2_alias INNER JOIN t3 USING (id)"; let expected = vec![Ok(CrudTables { - create_tables: vec![TableReference { - catalog: None, - schema: None, - name: "t1".into(), - alias: None, - }], - read_tables: vec![ - TableReference { - catalog: None, - schema: None, - name: "t2".into(), - alias: Some("t2_alias".into()), - }, - TableReference { - catalog: None, - schema: None, - name: "t3".into(), - alias: None, - }, - ], + create_tables: vec![table("t1")], + read_tables: vec![table("t2"), table("t3")], update_tables: vec![], delete_tables: vec![], + diagnostics: vec![], })]; assert_crud_table_extraction(sql, expected, all_dialects()); } } - mod update_statemnet { + mod update_statement { use super::*; #[test] @@ -629,114 +436,78 @@ mod tests { vec![Ok(CrudTables { create_tables: vec![], read_tables: vec![], - update_tables: vec![TableReference { - catalog: None, - schema: None, - name: "t1".into(), - alias: None, - }], + update_tables: vec![table("t1")], delete_tables: vec![], + diagnostics: vec![], }),] ) } #[test] fn test_update_statement_with_alias() { + // Behavior change vs the legacy implementation: joined tables + // (`t2` here) are now classified as `read_tables` rather than + // bundled into `update_tables`. This matches the SQL semantics + // — only `t1` is being updated; `t2` is a join partner. 
let sql = "UPDATE t1 AS t1_alias INNER JOIN t2 ON t1_alias.a = t2.a SET t1_alias.b = t2.b WHERE t2.c = (SELECT c FROM t3)"; let expected = vec![Ok(CrudTables { create_tables: vec![], - read_tables: vec![TableReference { - catalog: None, - schema: None, - name: "t3".into(), - alias: None, - }], - update_tables: vec![ - TableReference { - catalog: None, - schema: None, - name: "t1".into(), - alias: Some("t1_alias".into()), - }, - TableReference { - catalog: None, - schema: None, - name: "t2".into(), - alias: None, - }, - ], + read_tables: vec![table("t2"), table("t3")], + update_tables: vec![table("t1")], delete_tables: vec![], + diagnostics: vec![], })]; assert_crud_table_extraction(sql, expected, all_dialects()); } } - #[test] - fn test_merge_statement() { - let sql = "MERGE INTO t1 AS t1_alias USING t2 AS t2_alias ON t1_alias.a = t2_alias.a \ + mod merge { + use super::*; + + #[test] + fn test_merge_statement() { + let sql = "MERGE INTO t1 AS t1_alias USING t2 AS t2_alias ON t1_alias.a = t2_alias.a \ WHEN MATCHED AND t2_alias.b = 1 THEN DELETE \ WHEN MATCHED AND t2_alias.b = 2 THEN UPDATE SET t1_alias.b = t2_alias.b \ WHEN NOT MATCHED THEN INSERT (a, b) VALUES (t2_alias.a, t2_alias.b)"; - let expected = vec![Ok(CrudTables { - create_tables: vec![TableReference { - catalog: None, - schema: None, - name: "t1".into(), - alias: Some("t1_alias".into()), - }], - read_tables: vec![TableReference { - catalog: None, - schema: None, - name: "t2".into(), - alias: Some("t2_alias".into()), - }], - update_tables: vec![TableReference { - catalog: None, - schema: None, - name: "t1".into(), - alias: Some("t1_alias".into()), - }], - delete_tables: vec![TableReference { - catalog: None, - schema: None, - name: "t1".into(), - alias: Some("t1_alias".into()), - }], - })]; - assert_crud_table_extraction(sql, expected, all_dialects()); + let expected = vec![Ok(CrudTables { + create_tables: vec![table("t1")], + read_tables: vec![table("t2")], + update_tables: vec![table("t1")], + 
delete_tables: vec![table("t1")], + diagnostics: vec![], + })]; + assert_crud_table_extraction(sql, expected, all_dialects()); + } } - #[test] - fn test_create_table_statement() { - let sql = "CREATE TABLE t1 (a INT)"; - let expected = vec![Ok(CrudTables { - create_tables: vec![], - read_tables: vec![TableReference { - catalog: None, - schema: None, - name: "t1".into(), - alias: None, - }], - update_tables: vec![], - delete_tables: vec![], - })]; - assert_crud_table_extraction(sql, expected, all_dialects()); - } + mod ddl { + use super::*; - #[test] - fn test_alters_table_statement() { - let sql = "ALTER TABLE t1 ADD COLUMN a INT"; - let expected = vec![Ok(CrudTables { - create_tables: vec![], - read_tables: vec![TableReference { - catalog: None, - schema: None, - name: "t1".into(), - alias: None, - }], - update_tables: vec![], - delete_tables: vec![], - })]; - assert_crud_table_extraction(sql, expected, all_dialects()); + #[test] + fn test_create_table_statement() { + let sql = "CREATE TABLE t1 (a INT)"; + let expected = vec![Ok(CrudTables { + create_tables: vec![], + read_tables: vec![table("t1")], + update_tables: vec![], + delete_tables: vec![], + diagnostics: vec![], + })]; + assert_crud_table_extraction(sql, expected, all_dialects()); + } + + #[test] + fn test_alters_table_statement() { + let sql = "ALTER TABLE t1 ADD COLUMN a INT"; + let expected = vec![Ok(CrudTables { + create_tables: vec![], + read_tables: vec![table("t1")], + update_tables: vec![], + delete_tables: vec![], + diagnostics: vec![], + })]; + assert_crud_table_extraction(sql, expected, all_dialects()); + } } } diff --git a/sql-insight/src/extractor/helper.rs b/sql-insight/src/extractor/helper.rs deleted file mode 100644 index c912cdc..0000000 --- a/sql-insight/src/extractor/helper.rs +++ /dev/null @@ -1,334 +0,0 @@ -use crate::TableReference; -use std::collections::HashMap; - -pub(crate) fn resolve_aliased_tables( - possibly_aliased_tables: Vec, - original_tables: Vec, -) -> Vec { - 
possibly_aliased_tables - .iter() - .map(|possibly_aliased_table| { - if possibly_aliased_table.has_qualifiers() || possibly_aliased_table.has_alias() { - return possibly_aliased_table.clone(); - } - if let Some(resolved_table) = original_tables.iter().find_map(|original_table| { - original_table.alias.as_ref().and_then(|alias| { - if *alias == possibly_aliased_table.name { - Some(original_table.clone()) - } else { - None - } - }) - }) { - return resolved_table; - } - possibly_aliased_table.clone() - }) - .collect() -} - -pub(crate) fn calc_difference_of_tables( - base_tables: Vec, - exclude_tables: Vec, -) -> Vec { - let mut exclude_tables_count = HashMap::new(); - for exclude_table in exclude_tables.iter() { - *exclude_tables_count.entry(exclude_table).or_insert(0) += 1; - } - base_tables - .into_iter() - .filter(|base_table| { - if let Some(count) = exclude_tables_count.get_mut(base_table) { - if *count > 0 { - *count -= 1; - return false; - } - } - true - }) - .collect() -} - -#[cfg(test)] -mod tests { - use super::*; - use sqlparser::ast::Ident; - - mod resolve_aliased_tables { - use super::*; - - #[test] - fn test_single_aliased_table() { - let possibly_aliased_tables = vec![TableReference { - catalog: None, - schema: None, - name: Ident::new("t1_alias"), - alias: None, - }]; - let original_tables = vec![TableReference { - catalog: None, - schema: None, - name: Ident::new("t1"), - alias: Some(Ident::new("t1_alias")), - }]; - let expected_resolved_tables = vec![TableReference { - catalog: None, - schema: None, - name: Ident::new("t1"), - alias: Some(Ident::new("t1_alias")), - }]; - let result = resolve_aliased_tables(possibly_aliased_tables, original_tables); - assert_eq!(result, expected_resolved_tables); - } - - #[test] - fn test_multiple_aliased_tables() { - let possibly_aliased_tables = vec![ - TableReference { - catalog: None, - schema: None, - name: Ident::new("t1_alias"), - alias: None, - }, - TableReference { - catalog: None, - schema: None, - name: 
Ident::new("t2_alias"), - alias: None, - }, - ]; - let original_tables = vec![ - TableReference { - catalog: None, - schema: None, - name: Ident::new("t1"), - alias: Some(Ident::new("t1_alias")), - }, - TableReference { - catalog: None, - schema: None, - name: Ident::new("t2"), - alias: Some(Ident::new("t2_alias")), - }, - ]; - let expected_resolved_tables = vec![ - TableReference { - catalog: None, - schema: None, - name: Ident::new("t1"), - alias: Some(Ident::new("t1_alias")), - }, - TableReference { - catalog: None, - schema: None, - name: Ident::new("t2"), - alias: Some(Ident::new("t2_alias")), - }, - ]; - let result = resolve_aliased_tables(possibly_aliased_tables, original_tables); - assert_eq!(result, expected_resolved_tables); - } - - #[test] - fn test_catalog_and_schema_qualified_table_in_original_tables() { - let possibly_aliased_tables = vec![ - TableReference { - catalog: None, - schema: None, - name: Ident::new("t1_alias"), - alias: None, - }, - TableReference { - catalog: None, - schema: None, - name: Ident::new("t2_alias"), - alias: None, - }, - ]; - let original_tables = vec![ - TableReference { - catalog: Some(Ident::new("c1")), - schema: Some(Ident::new("s1")), - name: Ident::new("t1"), - alias: Some(Ident::new("t1_alias")), - }, - TableReference { - catalog: None, - schema: Some(Ident::new("s2")), - name: Ident::new("t2"), - alias: Some(Ident::new("t2_alias")), - }, - ]; - let expected_resolved_tables = vec![ - TableReference { - catalog: Some(Ident::new("c1")), - schema: Some(Ident::new("s1")), - name: Ident::new("t1"), - alias: Some(Ident::new("t1_alias")), - }, - TableReference { - catalog: None, - schema: Some(Ident::new("s2")), - name: Ident::new("t2"), - alias: Some(Ident::new("t2_alias")), - }, - ]; - let result = resolve_aliased_tables(possibly_aliased_tables, original_tables); - assert_eq!(result, expected_resolved_tables); - } - - #[test] - fn test_catalog_and_schema_qualified_table_in_possible_aliased_tables() { - // qualified alias is 
not valid syntax in standard SQL, - // so qualified tables are not regarded as aliased tables, hence they are not resolved. - let possibly_aliased_tables = vec![ - TableReference { - catalog: Some(Ident::new("c1")), - schema: Some(Ident::new("s1")), - name: Ident::new("t1_alias"), - alias: None, - }, - TableReference { - catalog: None, - schema: Some(Ident::new("s2")), - name: Ident::new("t2_alias"), - alias: None, - }, - ]; - let original_tables = vec![ - TableReference { - catalog: Some(Ident::new("c1")), - schema: Some(Ident::new("s1")), - name: Ident::new("t1"), - alias: Some(Ident::new("t1_alias")), - }, - TableReference { - catalog: None, - schema: Some(Ident::new("s2")), - name: Ident::new("t2"), - alias: Some(Ident::new("t2_alias")), - }, - ]; - let expected_resolved_tables = vec![ - TableReference { - catalog: Some(Ident::new("c1")), - schema: Some(Ident::new("s1")), - name: Ident::new("t1_alias"), - alias: None, - }, - TableReference { - catalog: None, - schema: Some(Ident::new("s2")), - name: Ident::new("t2_alias"), - alias: None, - }, - ]; - let result = resolve_aliased_tables(possibly_aliased_tables, original_tables); - assert_eq!(result, expected_resolved_tables); - } - } - - mod calc_difference_of_tables { - use super::*; - - #[test] - fn test_single_table() { - let base_tables = vec![TableReference { - catalog: None, - schema: None, - name: Ident::new("t1"), - alias: None, - }]; - let exclude_tables = vec![TableReference { - catalog: None, - schema: None, - name: Ident::new("t1"), - alias: None, - }]; - let expected_result = vec![]; - let result = calc_difference_of_tables(base_tables, exclude_tables); - assert_eq!(result, expected_result); - } - - #[test] - fn test_multiple_unique_tables() { - let base_tables = vec![ - TableReference { - catalog: None, - schema: None, - name: Ident::new("t1"), - alias: None, - }, - TableReference { - catalog: None, - schema: None, - name: Ident::new("t2"), - alias: None, - }, - ]; - let exclude_tables = vec![ - 
TableReference { - catalog: None, - schema: None, - name: Ident::new("t1"), - alias: None, - }, - TableReference { - catalog: None, - schema: None, - name: Ident::new("t2"), - alias: None, - }, - ]; - let expected_result = vec![]; - let result = calc_difference_of_tables(base_tables, exclude_tables); - assert_eq!(result, expected_result); - } - - #[test] - fn test_multiple_tables_with_duplicates() { - let base_tables = vec![ - TableReference { - catalog: None, - schema: None, - name: Ident::new("t1"), - alias: None, - }, - TableReference { - catalog: None, - schema: None, - name: Ident::new("t1"), - alias: None, - }, - TableReference { - catalog: None, - schema: None, - name: Ident::new("t2"), - alias: None, - }, - ]; - let exclude_tables = vec![ - TableReference { - catalog: None, - schema: None, - name: Ident::new("t1"), - alias: None, - }, - TableReference { - catalog: None, - schema: None, - name: Ident::new("t2"), - alias: None, - }, - ]; - let expected_result = vec![TableReference { - catalog: None, - schema: None, - name: Ident::new("t1"), - alias: None, - }]; - let result = calc_difference_of_tables(base_tables, exclude_tables); - assert_eq!(result, expected_result); - } - } -} diff --git a/sql-insight/src/extractor/table_extractor.rs b/sql-insight/src/extractor/table_extractor.rs index 38b718c..3a497f0 100644 --- a/sql-insight/src/extractor/table_extractor.rs +++ b/sql-insight/src/extractor/table_extractor.rs @@ -3,19 +3,19 @@ //! See [`extract_tables`](crate::extract_tables()) as the entry point for extracting tables from SQL. 
use core::fmt; -use std::ops::ControlFlow; +use crate::diagnostic::TableLevelDiagnostic; use crate::error::Error; -use crate::helper; -use sqlparser::ast::{ - Delete, Ident, Insert, ObjectName, Statement, TableFactor, TableObject, TableWithJoins, Visit, - Visitor, -}; +pub use crate::reference::TableReference; +use crate::resolver::Resolver; +use sqlparser::ast::Statement; use sqlparser::dialect::Dialect; use sqlparser::parser::Parser; /// Convenience function to extract tables from SQL. /// +/// Each statement returns extracted table references plus non-fatal diagnostics. +/// /// ## Example /// /// ```rust @@ -30,114 +30,10 @@ use sqlparser::parser::Parser; pub fn extract_tables( dialect: &dyn Dialect, sql: &str, -) -> Result>, Error> { +) -> Result>, Error> { TableExtractor::extract(dialect, sql) } -/// [`TableReference`] represents a qualified table with alias. -/// In this crate, this is the canonical representation of a table. -/// Tables found during analyzing an AST are stored as `TableReference`. 
-#[derive(Clone, Debug, PartialEq, Eq, Hash)] -pub struct TableReference { - pub catalog: Option, - pub schema: Option, - pub name: Ident, - pub alias: Option, -} - -impl TableReference { - pub fn has_alias(&self) -> bool { - self.alias.is_some() - } - pub fn has_qualifiers(&self) -> bool { - self.catalog.is_some() || self.schema.is_some() - } - pub fn try_from_name_and_alias( - name: &ObjectName, - alias: &Option, - ) -> Result { - match name.0.len() { - 0 => unreachable!("Parser should not allow empty identifiers"), - 1 => Ok(TableReference { - catalog: None, - schema: None, - name: name.0[0].as_ident().unwrap().clone(), - alias: alias.clone(), - }), - 2 => Ok(TableReference { - catalog: None, - schema: Some(name.0[0].as_ident().unwrap().clone()), - name: name.0[1].as_ident().unwrap().clone(), - alias: alias.clone(), - }), - 3 => Ok(TableReference { - catalog: Some(name.0[0].as_ident().unwrap().clone()), - schema: Some(name.0[1].as_ident().unwrap().clone()), - name: name.0[2].as_ident().unwrap().clone(), - alias: alias.clone(), - }), - _ => Err(Error::AnalysisError( - "Too many identifiers provided".to_string(), - )), - } - } - pub fn try_from_name(name: &ObjectName) -> Result { - Self::try_from_name_and_alias(name, &None) - } -} - -impl fmt::Display for TableReference { - fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { - let mut parts = Vec::new(); - if let Some(catalog) = &self.catalog { - parts.push(catalog.to_string()); - } - if let Some(schema) = &self.schema { - parts.push(schema.to_string()); - } - parts.push(self.name.to_string()); - let table = parts.join("."); - if let Some(alias) = &self.alias { - write!(f, "{} AS {}", table, alias) - } else { - write!(f, "{}", table) - } - } -} - -impl TryFrom<&Insert> for TableReference { - type Error = Error; - - fn try_from(value: &Insert) -> Result { - let name = match &value.table { - TableObject::TableName(object_name) => object_name, - TableObject::TableFunction(function) => &function.name, - }; - 
Self::try_from_name_and_alias(name, &value.table_alias) - } -} - -impl TryFrom<&TableFactor> for TableReference { - type Error = Error; - - fn try_from(table: &TableFactor) -> Result { - match table { - TableFactor::Table { name, alias, .. } => { - Self::try_from_name_and_alias(name, &alias.as_ref().map(|a| a.name.clone())) - } - _ => unreachable!("TableFactor::Table expected"), - } - } -} - -impl TryFrom<&ObjectName> for TableReference { - type Error = Error; - - fn try_from(obj_name: &ObjectName) -> Result { - Self::try_from_name(obj_name) - } -} - /// [`Tables`] represents a list of [`TableReference`] that found in SQL. #[derive(Debug, PartialEq)] pub struct Tables(pub Vec); @@ -154,97 +50,65 @@ impl fmt::Display for Tables { } } -/// A visitor to extract tables from SQL. -#[derive(Default, Debug)] -pub struct TableExtractor { - // All tables found in the SQL including aliases, must be resolved to original tables. - all_tables: Vec, - // Original tables found in the SQL, used to resolve aliases. - original_tables: Vec, - // Flag to indicate if the current relation is part of a `TableFactor::Table` - relation_of_table: bool, +/// [`TableExtraction`] represents extracted tables and non-fatal diagnostics. +#[derive(Debug, PartialEq)] +pub struct TableExtraction { + pub tables: Vec, + pub diagnostics: Vec, } -impl Visitor for TableExtractor { - type Break = Error; - - fn pre_visit_relation(&mut self, relation: &ObjectName) -> ControlFlow { - // Skip if relation is part of a TableFactor::Table - if self.relation_of_table { - self.relation_of_table = false; - return ControlFlow::Continue(()); - } - match TableReference::try_from(relation) { - Ok(table) => { - self.all_tables.push(table.clone()); - self.original_tables.push(table) - } - Err(e) => return ControlFlow::Break(e), - } - ControlFlow::Continue(()) - } - - fn pre_visit_table_factor(&mut self, table_factor: &TableFactor) -> ControlFlow { - if let TableFactor::Table { .. 
} = table_factor { - self.relation_of_table = true; - match TableReference::try_from(table_factor) { - Ok(table) => { - self.all_tables.push(table.clone()); - self.original_tables.push(table) - } - Err(e) => return ControlFlow::Break(e), - } - } - ControlFlow::Continue(()) +impl TableExtraction { + pub fn into_tables(self) -> Tables { + Tables(self.tables) } +} - fn pre_visit_statement(&mut self, statement: &Statement) -> ControlFlow { - if let Statement::Delete(Delete { tables, .. }) = statement { - // tables of delete statement are not visited by `pre_visit_table_factor` nor `pre_visit_relation`. - for table in tables { - match TableReference::try_from(table) { - Ok(table) => self.all_tables.push(table), - Err(e) => return ControlFlow::Break(e), - } - } - } - ControlFlow::Continue(()) +impl fmt::Display for TableExtraction { + fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { + let tables = self + .tables + .iter() + .map(|t| t.to_string()) + .collect::>() + .join(", "); + write!(f, "{}", tables) } } +/// Extracts tables from SQL. +#[derive(Default, Debug)] +pub struct TableExtractor; + impl TableExtractor { /// Extract tables from SQL. - pub fn extract(dialect: &dyn Dialect, sql: &str) -> Result>, Error> { + /// + /// Each statement returns extracted table references plus non-fatal diagnostics. + pub fn extract( + dialect: &dyn Dialect, + sql: &str, + ) -> Result>, Error> { let statements = Parser::parse_sql(dialect, sql)?; let results = statements .iter() .map(Self::extract_from_statement) - .collect::>>(); + .collect::>>(); Ok(results) } - pub fn extract_from_statement(statement: &Statement) -> Result { - let mut visitor = TableExtractor::default(); - match statement.visit(&mut visitor) { - ControlFlow::Break(e) => Err(e), - ControlFlow::Continue(()) => Ok(Tables(helper::resolve_aliased_tables( - visitor.all_tables, - visitor.original_tables, - ))), - } - } - - // `Visit` trait object cannot be used since method `visit` has generic type parameters. 
- // Concrete type `TableWithJoins` is used instead. - pub fn extract_from_table_node(table: &TableWithJoins) -> Result { - let mut visitor = TableExtractor::default(); - match table.visit(&mut visitor) { - ControlFlow::Break(e) => Err(e), - ControlFlow::Continue(()) => Ok(Tables(helper::resolve_aliased_tables( - visitor.all_tables, - visitor.original_tables, - ))), - } + pub fn extract_from_statement(statement: &Statement) -> Result { + // The legacy table-extraction API does not surface columns, so a + // catalog would not influence its output; pass `None`. + let resolution = Resolver::resolve_statement(None, statement)?; + Ok(TableExtraction { + tables: resolution.tables(), + // Project resolver diagnostics to table granularity; column + // resolution / wildcard gaps don't affect the table list. + diagnostics: resolution + .diagnostics + .iter() + .filter_map(|d| d.to_table_level()) + .collect(), + }) } } @@ -252,6 +116,43 @@ impl TableExtractor { mod tests { use super::*; use crate::test_utils::all_dialects; + use sqlparser::dialect::GenericDialect; + + fn table(name: &str) -> TableReference { + TableReference { + catalog: None, + schema: None, + name: name.into(), + } + } + + fn schema_table(schema: &str, name: &str) -> TableReference { + TableReference { + catalog: None, + schema: Some(schema.into()), + name: name.into(), + } + } + + fn catalog_schema_table(catalog: &str, schema: &str, name: &str) -> TableReference { + TableReference { + catalog: Some(catalog.into()), + schema: Some(schema.into()), + name: name.into(), + } + } + + fn ok_tables(tables: Vec) -> Result { + Ok(Tables(tables)) + } + + fn generic_dialect() -> Vec> { + vec![Box::new(GenericDialect {})] + } + + fn one_dialect(dialect: impl Dialect + 'static) -> Vec> { + vec![Box::new(dialect)] + } fn assert_table_extraction( sql: &str, @@ -259,148 +160,532 @@ mod tests { dialects: Vec>, ) { for dialect in dialects { - let result = TableExtractor::extract(dialect.as_ref(), sql) - .unwrap_or_else(|_| 
panic!("parse failed for dialect: {dialect:?}")); + let result = TableExtractor::extract(dialect.as_ref(), sql).unwrap_or_else(|e| { + panic!("parse failed for dialect: {dialect:?}, sql: {sql}, error: {e}") + }); + let result = result + .into_iter() + .map(|result| result.map(TableExtraction::into_tables)) + .collect::>>(); assert_eq!(result, expected, "Failed for dialect: {dialect:?}") } } - #[test] - fn test_single_statement() { - let sql = "SELECT a FROM t1"; - let expected = vec![Ok(Tables(vec![TableReference { - catalog: None, - schema: None, - name: "t1".into(), - alias: None, - }]))]; - assert_table_extraction(sql, expected, all_dialects()); - } + mod basic { + use super::*; - #[test] - fn test_multiple_statements() { - let sql = "SELECT a FROM t1; SELECT b FROM t2"; - let expected = vec![ - Ok(Tables(vec![TableReference { - catalog: None, - schema: None, - name: "t1".into(), - alias: None, - }])), - Ok(Tables(vec![TableReference { - catalog: None, - schema: None, - name: "t2".into(), - alias: None, - }])), - ]; - assert_table_extraction(sql, expected, all_dialects()); - } + #[test] + fn test_single_statement() { + let sql = "SELECT a FROM t1"; + let expected = vec![ok_tables(vec![table("t1")])]; + assert_table_extraction(sql, expected, all_dialects()); + } - #[test] - fn test_statement_with_alias() { - let sql = "SELECT a FROM t1 AS t1_alias"; - let expected = vec![Ok(Tables(vec![TableReference { - catalog: None, - schema: None, - name: "t1".into(), - alias: Some("t1_alias".into()), - }]))]; - assert_table_extraction(sql, expected, all_dialects()); + #[test] + fn test_multiple_statements() { + let sql = "SELECT a FROM t1; SELECT b FROM t2"; + let expected = vec![ok_tables(vec![table("t1")]), ok_tables(vec![table("t2")])]; + assert_table_extraction(sql, expected, all_dialects()); + } + + #[test] + fn test_tables_display() { + let tables = Tables(vec![catalog_schema_table("c1", "s1", "t1"), table("t2")]); + + assert_eq!(tables.to_string(), "c1.s1.t1, t2"); + 
} + + #[test] + fn test_table_extraction_display() { + let extraction = TableExtraction { + tables: vec![schema_table("s1", "t1"), table("t2")], + diagnostics: Vec::new(), + }; + + assert_eq!(extraction.to_string(), "s1.t1, t2"); + } + + fn assert_unsupported_statement(sql: &str) { + let result = TableExtractor::extract(&GenericDialect {}, sql).unwrap(); + let extraction = result.into_iter().next().unwrap().unwrap(); + assert_eq!(extraction.tables, vec![]); + assert_eq!(extraction.diagnostics.len(), 1); + assert_eq!( + extraction.diagnostics[0].kind, + crate::TableLevelDiagnosticKind::UnsupportedStatement + ); + assert!(extraction.diagnostics[0] + .message + .contains("Unsupported statement while inspecting SQL")); + } + + #[test] + fn test_unsupported_statements_are_reported_as_diagnostics() { + for sql in [ + "SET x = 1", + "ANALYZE TABLE t1", + "SHOW TABLES", + "SHOW COLUMNS FROM t1", + "SHOW DATABASES", + "SHOW SCHEMAS", + "USE mydb", + "START TRANSACTION", + "COMMIT", + "ROLLBACK", + "EXPLAIN SELECT * FROM t1", + "CREATE INDEX idx ON t1 (a)", + "CREATE SCHEMA s", + "CREATE DATABASE db", + "DEALLOCATE PREPARE stmt", + "PREPARE stmt AS SELECT 1", + "SAVEPOINT sp", + "RELEASE SAVEPOINT sp", + "RESET ALL", + ] { + assert_unsupported_statement(sql); + } + } } - #[test] - fn test_statement_with_schema_identifier() { - let sql = "SELECT a FROM schema.table; INSERT INTO schema.table (a) VALUES (1)"; - let expected = vec![ - Ok(Tables(vec![TableReference { - catalog: None, - schema: Some("schema".into()), - name: "table".into(), - alias: None, - }])), - Ok(Tables(vec![TableReference { - catalog: None, - schema: Some("schema".into()), - name: "table".into(), - alias: None, - }])), - ]; - assert_table_extraction(sql, expected, all_dialects()); + mod resolver_traversal { + use super::*; + + #[test] + fn test_subqueries_inside_predicate_expressions() { + for (sql, expected_tables) in [ + ( + "SELECT * FROM t1 WHERE EXISTS (SELECT 1 FROM t2)", + vec![table("t1"), 
table("t2")], + ), + ( + "SELECT * FROM t1 WHERE a IN (SELECT a FROM t2)", + vec![table("t1"), table("t2")], + ), + ( + "SELECT * FROM t1 WHERE a BETWEEN (SELECT b FROM t2) AND (SELECT c FROM t3)", + vec![table("t1"), table("t2"), table("t3")], + ), + ( + "SELECT * FROM t1 WHERE a LIKE (SELECT pattern FROM t2)", + vec![table("t1"), table("t2")], + ), + ] { + assert_table_extraction(sql, vec![ok_tables(expected_tables)], generic_dialect()); + } + } + + #[test] + fn test_subqueries_inside_projection_expressions() { + for (sql, expected_tables) in [ + ( + "SELECT CASE WHEN a > 0 THEN (SELECT b FROM t2) ELSE (SELECT c FROM t3) END FROM t1", + vec![table("t1"), table("t2"), table("t3")], + ), + ( + "SELECT CAST((SELECT b FROM t2) AS INT) FROM t1", + vec![table("t1"), table("t2")], + ), + ( + "SELECT ((SELECT b FROM t2)) FROM t1", + vec![table("t1"), table("t2")], + ), + ( + "SELECT ARRAY[(SELECT b FROM t2)] FROM t1", + vec![table("t1"), table("t2")], + ), + ( + "SELECT STRUCT((SELECT b FROM t2) AS b) FROM t1", + vec![table("t1"), table("t2")], + ), + ] { + assert_table_extraction(sql, vec![ok_tables(expected_tables)], generic_dialect()); + } + } + + #[test] + fn test_subqueries_inside_query_clauses() { + for (sql, expected_tables) in [ + ( + "SELECT a FROM t1 GROUP BY (SELECT b FROM t2)", + vec![table("t1"), table("t2")], + ), + ( + "SELECT a FROM t1 HAVING (SELECT b FROM t2) > 0", + vec![table("t1"), table("t2")], + ), + ( + "SELECT a FROM t1 ORDER BY (SELECT b FROM t2)", + vec![table("t1"), table("t2")], + ), + ] { + assert_table_extraction(sql, vec![ok_tables(expected_tables)], generic_dialect()); + } + } + + #[test] + fn test_subqueries_inside_function_clauses() { + for (sql, expected_tables) in [ + ( + "SELECT COUNT(*) FILTER (WHERE EXISTS (SELECT 1 FROM t2)) FROM t1", + vec![table("t1"), table("t2")], + ), + ( + "SELECT ARRAY_AGG(a ORDER BY (SELECT b FROM t2)) FROM t1", + vec![table("t1"), table("t2")], + ), + ( + "SELECT SUM(a) OVER (PARTITION BY (SELECT b FROM 
t2) ORDER BY (SELECT c FROM t3)) FROM t1", + vec![table("t1"), table("t2"), table("t3")], + ), + ] { + assert_table_extraction(sql, vec![ok_tables(expected_tables)], generic_dialect()); + } + } + + #[test] + fn test_nested_join_and_join_constraints() { + let sql = "SELECT * FROM (t1 JOIN t2 ON t1.id = t2.id) AS t12 JOIN t3 USING (id)"; + let expected = vec![ok_tables(vec![table("t1"), table("t2"), table("t3")])]; + assert_table_extraction(sql, expected, generic_dialect()); + } + + #[test] + fn test_derived_table_and_lateral_sources() { + // Outer scope's tables (t2 via JOIN) come before nested + // scopes (LATERAL subquery's t1). + let sql = "SELECT * FROM LATERAL (SELECT id FROM t1) AS d JOIN t2 ON d.id = t2.id"; + let expected = vec![ok_tables(vec![table("t2"), table("t1")])]; + assert_table_extraction(sql, expected, generic_dialect()); + } + + #[test] + fn test_table_function_sources() { + for (sql, expected_tables) in [ + ( + "SELECT * FROM UNNEST(ARRAY[(SELECT id FROM t1)]) AS u", + vec![table("t1")], + ), + ( + "SELECT * FROM generate_series((SELECT min_id FROM t1), 10) AS g", + vec![table("generate_series"), table("t1")], + ), + ] { + assert_table_extraction(sql, vec![ok_tables(expected_tables)], generic_dialect()); + } + } + + #[test] + fn test_query_set_expr_forms() { + for (sql, expected_tables) in [ + ( + "SELECT * FROM t1 UNION SELECT * FROM t2", + vec![table("t1"), table("t2")], + ), + ("VALUES ((SELECT id FROM t1))", vec![table("t1")]), + ( + "CREATE TABLE t2 AS TABLE t1", + vec![table("t2"), table("t1")], + ), + ] { + assert_table_extraction(sql, vec![ok_tables(expected_tables)], generic_dialect()); + } + } + + #[test] + fn test_query_clauses_with_subqueries() { + for (sql, expected_tables) in [ + ( + "SELECT * FROM t1 LIMIT (SELECT n FROM t2)", + vec![table("t1"), table("t2")], + ), + ( + "SELECT * FROM t1 FETCH FIRST 10 ROWS ONLY", + vec![table("t1")], + ), + ( + "SELECT SUM(a) OVER w FROM t1 WINDOW w AS (PARTITION BY (SELECT b FROM t2))", + 
vec![table("t1"), table("t2")], + ), + ] { + assert_table_extraction(sql, vec![ok_tables(expected_tables)], generic_dialect()); + } + } + + #[test] + fn test_dialect_specific_query_clauses_with_subqueries() { + // DISTINCT ON / TOP exprs are walked before FROM, but the outer + // scope's tables (t1) still come before the nested + // subquery's (t2) under scope-order traversal. + assert_table_extraction( + "SELECT DISTINCT ON ((SELECT id FROM t2)) id FROM t1", + vec![ok_tables(vec![table("t1"), table("t2")])], + one_dialect(sqlparser::dialect::PostgreSqlDialect {}), + ); + assert_table_extraction( + "SELECT TOP ((SELECT n FROM t2)) id FROM t1", + vec![ok_tables(vec![table("t1"), table("t2")])], + one_dialect(sqlparser::dialect::MsSqlDialect {}), + ); + assert_table_extraction( + "SELECT * INTO t2 FROM t1", + vec![ok_tables(vec![table("t1"), table("t2")])], + one_dialect(sqlparser::dialect::MsSqlDialect {}), + ); + assert_table_extraction( + "SELECT * FROM t1 SETTINGS max_threads = (SELECT n FROM t2)", + vec![ok_tables(vec![table("t1"), table("t2")])], + one_dialect(sqlparser::dialect::ClickHouseDialect {}), + ); + } + + #[test] + fn test_join_variants() { + for sql in [ + "SELECT * FROM t1 LEFT JOIN t2 ON t1.id = t2.id", + "SELECT * FROM t1 RIGHT JOIN t2 ON t1.id = t2.id", + "SELECT * FROM t1 FULL OUTER JOIN t2 ON t1.id = t2.id", + "SELECT * FROM t1 CROSS JOIN t2", + ] { + assert_table_extraction( + sql, + vec![ok_tables(vec![table("t1"), table("t2")])], + generic_dialect(), + ); + } + } + + #[test] + fn test_table_factor_extensions() { + assert_table_extraction( + "SELECT * FROM t1 TABLESAMPLE (10)", + vec![ok_tables(vec![table("t1")])], + generic_dialect(), + ); + assert_table_extraction( + "SELECT * FROM monthly_sales PIVOT(SUM(amount) FOR month IN ('JAN')) AS p", + vec![ok_tables(vec![table("monthly_sales")])], + generic_dialect(), + ); + } + + #[test] + fn test_pipe_operator_sources() { + // Outer scope's tables (t1 from FROM, t3 from |> JOIN) come + // before 
the WHERE subquery's nested scope (t2). + let sql = + "SELECT * FROM t1 |> WHERE id IN (SELECT id FROM t2) |> JOIN t3 ON id = t3.id"; + let expected = vec![ok_tables(vec![table("t1"), table("t3"), table("t2")])]; + assert_table_extraction( + sql, + expected, + one_dialect(sqlparser::dialect::BigQueryDialect {}), + ); + } } - #[test] - fn test_statement_with_full_identifier() { - let sql = + mod query_shapes { + use super::*; + + #[test] + fn test_statement_with_alias() { + let sql = "SELECT a FROM t1 AS t1_alias"; + let expected = vec![ok_tables(vec![table("t1")])]; + assert_table_extraction(sql, expected, all_dialects()); + } + + #[test] + fn test_statement_with_schema_identifier() { + let sql = "SELECT a FROM schema.table; INSERT INTO schema.table (a) VALUES (1)"; + let expected = vec![ + ok_tables(vec![schema_table("schema", "table")]), + ok_tables(vec![schema_table("schema", "table")]), + ]; + assert_table_extraction(sql, expected, all_dialects()); + } + + #[test] + fn test_statement_with_full_identifier() { + let sql = "SELECT a FROM catalog.schema.table; INSERT INTO catalog.schema.table (a) VALUES (1)"; - let expected = vec![ - Ok(Tables(vec![TableReference { - catalog: Some("catalog".into()), - schema: Some("schema".into()), - name: "table".into(), - alias: None, - }])), - Ok(Tables(vec![TableReference { - catalog: Some("catalog".into()), - schema: Some("schema".into()), - name: "table".into(), - alias: None, - }])), - ]; - assert_table_extraction(sql, expected, all_dialects()); - } + let expected = vec![ + ok_tables(vec![catalog_schema_table("catalog", "schema", "table")]), + ok_tables(vec![catalog_schema_table("catalog", "schema", "table")]), + ]; + assert_table_extraction(sql, expected, all_dialects()); + } - #[test] - fn test_statement_with_table_identifier_and_alias() { - let sql = "SELECT a FROM catalog.schema.table AS table_alias"; - let expected = vec![Ok(Tables(vec![TableReference { - catalog: Some("catalog".into()), - schema: Some("schema".into()), 
- name: "table".into(), - alias: Some("table_alias".into()), - }]))]; - assert_table_extraction(sql, expected, all_dialects()); - } + #[test] + fn test_statement_with_table_identifier_and_alias() { + let sql = "SELECT a FROM catalog.schema.table AS table_alias"; + let expected = vec![ok_tables(vec![catalog_schema_table( + "catalog", "schema", "table", + )])]; + assert_table_extraction(sql, expected, all_dialects()); + } - #[test] - fn test_statement_where_same_tables_appear_multiple_times() { - let sql = "SELECT a FROM t1 INNER JOIN t2 ON t1.id = t2.id WHERE b = ( SELECT c FROM t3 INNER JOIN t1 ON t3.id = t1.id )"; - let expected = vec![Ok(Tables(vec![ - TableReference { - catalog: None, - schema: None, - name: "t1".into(), - alias: None, - }, - TableReference { - catalog: None, - schema: None, - name: "t2".into(), - alias: None, - }, - TableReference { - catalog: None, - schema: None, - name: "t3".into(), - alias: None, - }, - TableReference { - catalog: None, - schema: None, - name: "t1".into(), - alias: None, - }, - ]))]; - assert_table_extraction(sql, expected, all_dialects()); + #[test] + fn test_statement_where_same_tables_appear_multiple_times() { + let sql = "SELECT a FROM t1 INNER JOIN t2 ON t1.id = t2.id WHERE b = ( SELECT c FROM t3 INNER JOIN t1 ON t3.id = t1.id )"; + let expected = vec![ok_tables(vec![ + table("t1"), + table("t2"), + table("t3"), + table("t1"), + ])]; + assert_table_extraction(sql, expected, all_dialects()); + } + + #[test] + fn test_statement_with_subquery_inside_function_expression() { + let sql = "SELECT COALESCE((SELECT b FROM t2), a) FROM t1"; + let expected = vec![ok_tables(vec![table("t1"), table("t2")])]; + assert_table_extraction(sql, expected, all_dialects()); + } + + #[test] + fn test_statement_with_subquery_in_order_by() { + let sql = "SELECT a FROM t1 ORDER BY (SELECT b FROM t2)"; + let expected = vec![ok_tables(vec![table("t1"), table("t2")])]; + assert_table_extraction(sql, expected, all_dialects()); + } } - #[test] - fn 
test_statement_error_with_too_many_identifiers() { - let sql = "SELECT a FROM catalog.schema.table.extra"; - let expected = vec![Err(Error::AnalysisError( - "Too many identifiers provided".to_string(), - ))]; - assert_table_extraction(sql, expected, all_dialects()); + mod cte { + use super::*; + + #[test] + fn test_statement_with_cte() { + let sql = "WITH t2 AS (SELECT id FROM t1) SELECT * FROM t2"; + let expected = vec![ok_tables(vec![table("t1")])]; + assert_table_extraction(sql, expected, all_dialects()); + } + + #[test] + fn test_statement_with_case_insensitive_cte_reference() { + let sql = "WITH T2 AS (SELECT id FROM t1) SELECT * FROM t2"; + let expected = vec![ok_tables(vec![table("t1")])]; + assert_table_extraction(sql, expected, all_dialects()); + } + + #[test] + fn test_statement_with_quoted_cte_does_not_match_unquoted_reference() { + let sql = r#"WITH "T2" AS (SELECT id FROM t1) SELECT * FROM t2"#; + // Outer scope's t2 (CTE didn't match the unquoted reference) + // precedes the nested CTE body's t1. 
+ let expected = vec![ok_tables(vec![table("t2"), table("t1")])]; + assert_table_extraction( + sql, + expected, + vec![Box::new(sqlparser::dialect::GenericDialect {})], + ); + } + + #[test] + fn test_statement_with_quoted_cte_exact_reference() { + let sql = r#"WITH "T2" AS (SELECT id FROM t1) SELECT * FROM "T2""#; + let expected = vec![ok_tables(vec![table("t1")])]; + assert_table_extraction( + sql, + expected, + vec![Box::new(sqlparser::dialect::GenericDialect {})], + ); + } + + #[test] + fn test_statement_with_cte_referencing_previous_cte() { + let sql = "WITH t2 AS (SELECT id FROM t1), t3 AS (SELECT id FROM t2) SELECT * FROM t3"; + let expected = vec![ok_tables(vec![table("t1")])]; + assert_table_extraction(sql, expected, all_dialects()); + } + + #[test] + fn test_statement_with_cte_does_not_resolve_forward_reference() { + let sql = "WITH t2 AS (SELECT id FROM t3), t3 AS (SELECT id FROM t1) SELECT * FROM t2"; + let expected = vec![ok_tables(vec![table("t3"), table("t1")])]; + assert_table_extraction(sql, expected, all_dialects()); + } + + #[test] + fn test_statement_with_cte_shadows_base_table_after_definition() { + let sql = "WITH t2 AS (SELECT id FROM t3), t3 AS (SELECT id FROM t1) SELECT * FROM t3"; + let expected = vec![ok_tables(vec![table("t3"), table("t1")])]; + assert_table_extraction(sql, expected, all_dialects()); + } + + #[test] + fn test_statement_with_qualified_table_not_shadowed_by_cte() { + let sql = + "WITH t2 AS (SELECT id FROM t4), t3 AS (SELECT id FROM t1) SELECT * FROM s.t3"; + // Outer scope's s.t3 comes first; CTE bodies (t4, t1) follow in + // creation order. 
+ let expected = vec![ok_tables(vec![ + schema_table("s", "t3"), + table("t4"), + table("t1"), + ])]; + assert_table_extraction(sql, expected, all_dialects()); + } + + #[test] + fn test_statement_with_qualified_table_not_shadowed_by_previous_cte_inside_cte_body() { + let sql = + "WITH t2 AS (SELECT id FROM t1), t3 AS (SELECT id FROM s.t2) SELECT * FROM t3"; + let expected = vec![ok_tables(vec![table("t1"), schema_table("s", "t2")])]; + assert_table_extraction(sql, expected, all_dialects()); + } + + #[test] + fn test_statement_with_recursive_cte_self_reference() { + let sql = "WITH RECURSIVE t2 AS (SELECT id FROM t2) SELECT * FROM t2"; + let expected = vec![ok_tables(vec![])]; + assert_table_extraction( + sql, + expected, + vec![Box::new(sqlparser::dialect::GenericDialect {})], + ); + } + + #[test] + fn test_statement_with_cte_shadowing_base_table() { + let sql = + "WITH t1 AS (SELECT id FROM t2) SELECT * FROM t1 JOIN s1.t1 AS t3 ON t1.id = t3.id"; + // Outer scope's s1.t1 AS t3 (from JOIN) is recorded before the CTE + // body's t2 in the nested scope. + let expected = vec![ok_tables(vec![schema_table("s1", "t1"), table("t2")])]; + assert_table_extraction(sql, expected, all_dialects()); + } + + #[test] + fn test_nested_statement_with_cte_scope() { + let sql = "WITH t1 AS (SELECT id FROM t2) SELECT * FROM (WITH t1 AS (SELECT id FROM t3) SELECT * FROM t1) AS t4 JOIN t1 ON t4.id = t1.id"; + let expected = vec![ok_tables(vec![table("t2"), table("t3")])]; + assert_table_extraction(sql, expected, all_dialects()); + } + + #[test] + fn test_nested_cte_does_not_leak_to_outer_query() { + let sql = "SELECT * FROM (WITH t2 AS (SELECT id FROM t1) SELECT * FROM t2) AS t3 JOIN t2 ON t3.id = t2.id"; + // Outer scope's t2 (from JOIN, base table) comes before the nested + // CTE body's t1. 
+ let expected = vec![ok_tables(vec![table("t2"), table("t1")])]; + assert_table_extraction(sql, expected, all_dialects()); + } + + #[test] + fn test_insert_select_with_cte_source() { + let sql = "INSERT INTO t1 WITH t3 AS (SELECT id FROM t2) SELECT * FROM t3"; + let expected = vec![ok_tables(vec![table("t1"), table("t2")])]; + assert_table_extraction(sql, expected, all_dialects()); + } + + #[test] + fn test_statement_error_with_too_many_identifiers() { + let sql = "SELECT a FROM catalog.schema.table.extra"; + let expected = vec![Err(Error::AnalysisError( + "Too many identifiers provided".to_string(), + ))]; + assert_table_extraction(sql, expected, all_dialects()); + } } mod delete_statement { @@ -410,21 +695,10 @@ mod tests { #[test] fn test_delete_statement() { + // Targets used to be spliced into the output; now only scope-bound + // sources appear, so the target reference no longer duplicates. let sql = "DELETE t1 FROM t1"; - let expected = vec![Ok(Tables(vec![ - TableReference { - catalog: None, - schema: None, - name: "t1".into(), - alias: None, - }, - TableReference { - catalog: None, - schema: None, - name: "t1".into(), - alias: None, - }, - ]))]; + let expected = vec![ok_tables(vec![table("t1")])]; // BigQuery and Generic do not support DELETE ... 
FROM assert_table_extraction( sql, @@ -436,26 +710,19 @@ mod tests { #[test] fn test_delete_statement_with_aliases() { let sql = "DELETE t1_alias FROM t1 AS t1_alias JOIN t2 AS t2_alias ON t1_alias.a = t2_alias.a WHERE t2_alias.b = 1"; - let expected = vec![Ok(Tables(vec![ - TableReference { - catalog: None, - schema: None, - name: "t1".into(), - alias: Some("t1_alias".into()), - }, - TableReference { - catalog: None, - schema: None, - name: "t1".into(), - alias: Some("t1_alias".into()), - }, - TableReference { - catalog: None, - schema: None, - name: "t2".into(), - alias: Some("t2_alias".into()), - }, - ]))]; + let expected = vec![ok_tables(vec![table("t1"), table("t2")])]; + // BigQuery and Generic do not support DELETE ... FROM + assert_table_extraction( + sql, + expected, + all_dialects_except(&vec!["GenericDialect", "BigQueryDialect"]), + ); + } + + #[test] + fn test_delete_statement_with_case_insensitive_alias_target() { + let sql = "DELETE T1_ALIAS FROM t1 AS t1_alias JOIN t2 ON t1_alias.a = t2.a"; + let expected = vec![ok_tables(vec![table("t1"), table("t2")])]; // BigQuery and Generic do not support DELETE ... FROM assert_table_extraction( sql, @@ -468,38 +735,7 @@ mod tests { fn test_delete_multiple_tables_with_join() { let sql = "DELETE t1, t2 FROM t1 INNER JOIN t2 INNER JOIN t3 WHERE t1.a = t2.a AND t2.a = t3.a"; - let expected = vec![Ok(Tables(vec![ - TableReference { - catalog: None, - schema: None, - name: "t1".into(), - alias: None, - }, - TableReference { - catalog: None, - schema: None, - name: "t2".into(), - alias: None, - }, - TableReference { - catalog: None, - schema: None, - name: "t1".into(), - alias: None, - }, - TableReference { - catalog: None, - schema: None, - name: "t2".into(), - alias: None, - }, - TableReference { - catalog: None, - schema: None, - name: "t3".into(), - alias: None, - }, - ]))]; + let expected = vec![ok_tables(vec![table("t1"), table("t2"), table("t3")])]; // BigQuery and Generic do not support DELETE ... 
FROM assert_table_extraction( sql, @@ -511,50 +747,21 @@ mod tests { #[test] fn test_delete_from_statement() { let sql = "DELETE FROM t1"; - let expected = vec![Ok(Tables(vec![TableReference { - catalog: None, - schema: None, - name: "t1".into(), - alias: None, - }]))]; + let expected = vec![ok_tables(vec![table("t1")])]; + assert_table_extraction(sql, expected, all_dialects()); + } + + #[test] + fn test_delete_from_statement_with_selection() { + let sql = "DELETE FROM t1 WHERE id IN (SELECT id FROM t2)"; + let expected = vec![ok_tables(vec![table("t1"), table("t2")])]; assert_table_extraction(sql, expected, all_dialects()); } #[test] fn test_delete_from_statement_with_alias() { let sql = "DELETE FROM t1_alias, t2_alias USING t1 AS t1_alias INNER JOIN t2 AS t2_alias INNER JOIN t3"; - let expected = vec![Ok(Tables(vec![ - TableReference { - catalog: None, - schema: None, - name: "t1".into(), - alias: Some("t1_alias".into()), - }, - TableReference { - catalog: None, - schema: None, - name: "t2".into(), - alias: Some("t2_alias".into()), - }, - TableReference { - catalog: None, - schema: None, - name: "t1".into(), - alias: Some("t1_alias".into()), - }, - TableReference { - catalog: None, - schema: None, - name: "t2".into(), - alias: Some("t2_alias".into()), - }, - TableReference { - catalog: None, - schema: None, - name: "t3".into(), - alias: None, - }, - ]))]; + let expected = vec![ok_tables(vec![table("t1"), table("t2"), table("t3")])]; assert_table_extraction(sql, expected, all_dialects()); } } @@ -565,34 +772,38 @@ mod tests { #[test] fn test_insert_statement() { let sql = "INSERT INTO t1 (a, b) VALUES (1, 2)"; - let expected = vec![Ok(Tables(vec![TableReference { - catalog: None, - schema: None, - name: "t1".into(), - alias: None, - }]))]; + let expected = vec![ok_tables(vec![table("t1")])]; assert_table_extraction(sql, expected, all_dialects()); } #[test] fn test_insert_select_statement() { let sql = "INSERT INTO t1 SELECT * FROM t2"; - let expected = 
vec![Ok(Tables(vec![ - TableReference { - catalog: None, - schema: None, - name: "t1".into(), - alias: None, - }, - TableReference { - catalog: None, - schema: None, - name: "t2".into(), - alias: None, - }, - ]))]; + let expected = vec![ok_tables(vec![table("t1"), table("t2")])]; assert_table_extraction(sql, expected, all_dialects()); } + + #[test] + fn test_insert_set_statement() { + let sql = "INSERT INTO t1 SET a = (SELECT b FROM t2)"; + let expected = vec![ok_tables(vec![table("t1"), table("t2")])]; + assert_table_extraction( + sql, + expected, + one_dialect(sqlparser::dialect::MySqlDialect {}), + ); + } + + #[test] + fn test_insert_table_function_statement() { + let sql = "INSERT INTO FUNCTION remote('localhost', default.t1) SELECT * FROM t2"; + let expected = vec![ok_tables(vec![table("remote"), table("t2")])]; + assert_table_extraction( + sql, + expected, + one_dialect(sqlparser::dialect::ClickHouseDialect {}), + ); + } } mod update_statement { @@ -601,107 +812,126 @@ mod tests { #[test] fn test_update_statement() { let sql = "UPDATE t1 SET a = 1"; - let expected = vec![Ok(Tables(vec![TableReference { - catalog: None, - schema: None, - name: "t1".into(), - alias: None, - }]))]; + let expected = vec![ok_tables(vec![table("t1")])]; assert_table_extraction(sql, expected, all_dialects()); } #[test] fn test_update_statement_with_alias() { let sql = "UPDATE t1 AS t1_alias INNER JOIN t2 ON t1_alias.a = t2.a SET t1_alias.b = t2.b WHERE t2.c = (SELECT c FROM t3)"; - let expected = vec![Ok(Tables(vec![ - TableReference { - catalog: None, - schema: None, - name: "t1".into(), - alias: Some("t1_alias".into()), - }, - TableReference { - catalog: None, - schema: None, - name: "t2".into(), - alias: None, - }, - TableReference { - catalog: None, - schema: None, - name: "t3".into(), - alias: None, - }, - ]))]; + let expected = vec![ok_tables(vec![table("t1"), table("t2"), table("t3")])]; assert_table_extraction(sql, expected, all_dialects()); } + + #[test] + fn 
test_update_statement_with_from_and_subqueries() { + let sql = + "UPDATE t1 SET a = (SELECT b FROM t3) FROM t2 WHERE t1.id IN (SELECT id FROM t4)"; + let expected = vec![ok_tables(vec![ + table("t1"), + table("t2"), + table("t3"), + table("t4"), + ])]; + assert_table_extraction( + sql, + expected, + one_dialect(sqlparser::dialect::PostgreSqlDialect {}), + ); + } } - #[test] - fn test_merge_statement() { - let sql = "MERGE INTO t1 USING t2 ON t1.a = t2.a \ + mod merge { + use super::*; + + #[test] + fn test_merge_statement() { + let sql = "MERGE INTO t1 USING t2 ON t1.a = t2.a \ WHEN MATCHED THEN UPDATE SET t1.b = t2.b \ WHEN NOT MATCHED THEN INSERT (a, b) VALUES (t2.a, t2.b)"; - let expected = vec![Ok(Tables(vec![ - TableReference { - catalog: None, - schema: None, - name: "t1".into(), - alias: None, - }, - TableReference { - catalog: None, - schema: None, - name: "t2".into(), - alias: None, - }, - ]))]; - assert_table_extraction(sql, expected, all_dialects()); - } + let expected = vec![ok_tables(vec![table("t1"), table("t2")])]; + assert_table_extraction(sql, expected, all_dialects()); + } - #[test] - fn test_merge_statement_with_alias() { - let sql = "MERGE INTO t1 AS t1_alias USING (SELECT a, b FROM t2) AS t2_alias(a, b) ON t1_alias.a = t2_alias.a \ + #[test] + fn test_merge_statement_with_alias() { + let sql = "MERGE INTO t1 AS t1_alias USING (SELECT a, b FROM t2) AS t2_alias(a, b) ON t1_alias.a = t2_alias.a \ WHEN MATCHED THEN UPDATE SET t1_alias.b = t2_alias.b \ WHEN NOT MATCHED THEN INSERT (a, b) VALUES (t2_alias.a, t2_alias.b)"; - let expected = vec![Ok(Tables(vec![ - TableReference { - catalog: None, - schema: None, - name: "t1".into(), - alias: Some("t1_alias".into()), - }, - TableReference { - catalog: None, - schema: None, - name: "t2".into(), - alias: None, - }, - ]))]; - assert_table_extraction(sql, expected, all_dialects()); - } + let expected = vec![ok_tables(vec![table("t1"), table("t2")])]; + assert_table_extraction(sql, expected, all_dialects()); 
+ } - #[test] - fn test_create_table_statement() { - let sql = "CREATE TABLE t1 (a INT)"; - let expected = vec![Ok(Tables(vec![TableReference { - catalog: None, - schema: None, - name: "t1".into(), - alias: None, - }]))]; - assert_table_extraction(sql, expected, all_dialects()); + #[test] + fn test_merge_statement_with_clause_predicate() { + let sql = "MERGE INTO t1 USING t2 ON t1.id = t2.id \ + WHEN MATCHED AND EXISTS (SELECT 1 FROM t3) THEN DELETE"; + let expected = vec![ok_tables(vec![table("t1"), table("t2"), table("t3")])]; + assert_table_extraction(sql, expected, generic_dialect()); + } } - #[test] - fn test_alters_table_statement() { - let sql = "ALTER TABLE t1 ADD COLUMN a INT"; - let expected = vec![Ok(Tables(vec![TableReference { - catalog: None, - schema: None, - name: "t1".into(), - alias: None, - }]))]; - assert_table_extraction(sql, expected, all_dialects()); + mod ddl { + use super::*; + + #[test] + fn test_create_table_statement() { + let sql = "CREATE TABLE t1 (a INT)"; + let expected = vec![ok_tables(vec![table("t1")])]; + assert_table_extraction(sql, expected, all_dialects()); + } + + #[test] + fn test_create_table_as_select_statement() { + let sql = "CREATE TABLE t1 AS SELECT * FROM t2"; + let expected = vec![ok_tables(vec![table("t1"), table("t2")])]; + assert_table_extraction(sql, expected, generic_dialect()); + } + + #[test] + fn test_create_view_statement() { + let sql = "CREATE VIEW t1 AS SELECT * FROM t2"; + let expected = vec![ok_tables(vec![table("t1"), table("t2")])]; + assert_table_extraction(sql, expected, generic_dialect()); + } + + #[test] + fn test_create_virtual_table_statement() { + let sql = "CREATE VIRTUAL TABLE t1 USING fts5(a)"; + let expected = vec![ok_tables(vec![table("t1")])]; + assert_table_extraction( + sql, + expected, + one_dialect(sqlparser::dialect::SQLiteDialect {}), + ); + } + + #[test] + fn test_alters_table_statement() { + let sql = "ALTER TABLE t1 ADD COLUMN a INT"; + let expected = 
vec![ok_tables(vec![table("t1")])]; + assert_table_extraction(sql, expected, all_dialects()); + } + + #[test] + fn test_drop_table_statement() { + let sql = "DROP TABLE t1, t2"; + let expected = vec![ok_tables(vec![table("t1"), table("t2")])]; + assert_table_extraction(sql, expected, generic_dialect()); + } + + #[test] + fn test_drop_index_statement_records_parent_table() { + let sql = "DROP INDEX idx1 ON t1"; + let expected = vec![ok_tables(vec![table("t1")])]; + assert_table_extraction(sql, expected, generic_dialect()); + } + + #[test] + fn test_truncate_table_statement() { + let sql = "TRUNCATE TABLE t1, t2"; + let expected = vec![ok_tables(vec![table("t1"), table("t2")])]; + assert_table_extraction(sql, expected, generic_dialect()); + } } } diff --git a/sql-insight/src/extractor/table_operation_extractor.rs b/sql-insight/src/extractor/table_operation_extractor.rs new file mode 100644 index 0000000..3a2d3f7 --- /dev/null +++ b/sql-insight/src/extractor/table_operation_extractor.rs @@ -0,0 +1,1088 @@ +//! Extracts the application-level operations a SQL statement performs. +//! +//! Where [`extract_tables`](crate::extract_tables()) answers "what tables +//! does this SQL touch?" and [`extract_crud_tables`](crate::extract_crud_tables()) +//! answers it in CRUD buckets, this module answers "what operations does +//! this SQL perform, on which tables, and how do those tables relate?". +//! +//! The output is per-statement: one [`TableOperation`] per parsed +//! statement, since a single application call (e.g. an ORM `execute()`) +//! typically corresponds to a single statement. +//! +//! Three parallel surfaces describe the statement: +//! - `reads` — every table the statement reads from. +//! - `writes` — every table the statement writes to. +//! - `lineage` — directed `source → target` edges for statements that +//! physically move data. +//! +//! A single table can appear in both `reads` and `writes` when it plays +//! both roles (e.g. 
`DELETE t1 FROM t1` — t1 is the deletion target and +//! a row source). + +use crate::catalog::Catalog; +use crate::diagnostic::{TableLevelDiagnostic, TableLevelDiagnosticKind}; +use crate::error::Error; +use crate::reference::TableReference; +use crate::resolver::Resolver; +use sqlparser::ast::Statement; +use sqlparser::dialect::Dialect; +use sqlparser::parser::Parser; + +/// Convenience function to extract table-level operations from SQL. +/// +/// `catalog` is consulted opportunistically for relation-level enrichment +/// (table schema lookup, future view expansion and synonym resolution). +/// Pass `None` for the lightest path — table-level extraction works +/// purely from the AST and never requires a catalog. +/// +/// ## Example +/// +/// ```rust +/// use sql_insight::sqlparser::dialect::GenericDialect; +/// use sql_insight::{extract_table_operations, StatementKind}; +/// +/// let dialect = GenericDialect {}; +/// let result = extract_table_operations(&dialect, "SELECT * FROM users", None).unwrap(); +/// let ops = result[0].as_ref().unwrap(); +/// assert_eq!(ops.statement_kind, StatementKind::Select); +/// assert_eq!(ops.reads.len(), 1); +/// assert_eq!(ops.reads[0].name.value, "users"); +/// assert!(ops.writes.is_empty()); +/// ``` +pub fn extract_table_operations( + dialect: &dyn Dialect, + sql: &str, + catalog: Option<&dyn Catalog>, +) -> Result>, Error> { + TableOperationExtractor::extract(dialect, sql, catalog) +} + +/// Operations performed by a single SQL statement. +#[derive(Debug, Clone, PartialEq, Eq)] +pub struct TableOperation { + pub statement_kind: StatementKind, + pub reads: Vec, + pub writes: Vec, + pub lineage: Vec, + pub diagnostics: Vec, +} + +/// What a statement does, at a coarse level. The *verb* of the statement +/// — INSERT vs CREATE TABLE vs MERGE vs … — combined with the +/// `reads` / `writes` split recovers every distinction the project needs +/// to make at table granularity. 
+#[derive(Debug, Clone, PartialEq, Eq)] +pub enum StatementKind { + /// `SELECT ...` (and other read-only queries: `TABLE foo`, `VALUES`, + /// `WITH ... SELECT ...`). Reads only — no writes, no lineage. + Select, + /// `INSERT INTO ...`. Writes to one target table; reads from the + /// `VALUES` / `SELECT` source. Emits source → target lineage. + Insert, + /// `UPDATE ... SET ...`. Reads and writes the same target table; + /// reads from any joined / sub-query sources. Emits lineage from + /// SET right-hand-side sources into the target columns. + Update, + /// `DELETE FROM ...`. The target table appears in both `reads` + /// (row source) and `writes` (deletion target). No lineage. + Delete, + /// `MERGE INTO ... USING ...`. The target appears in both `reads` + /// and `writes`; each `WHEN` clause may emit lineage from the + /// source into the target's update / insert columns. + Merge, + /// `CREATE TABLE ...`. The new table is a write target. CREATE + /// TABLE AS (CTAS) also reads from its SELECT and emits per-column + /// lineage into the new table's columns. + CreateTable, + /// `CREATE VIEW ... AS SELECT ...`. The new view is a write + /// target; reads come from the SELECT body. Per-column lineage + /// pairs the SELECT projections with the view's columns. + CreateView, + /// `ALTER TABLE ...`. The altered table is a write target. + /// Column-level changes are not modelled in detail. + AlterTable, + /// `ALTER VIEW ... AS SELECT ...`. Treated like CREATE VIEW for + /// extraction purposes — the view is a write target, the new + /// SELECT body supplies reads and per-column lineage. + AlterView, + /// `DROP TABLE` / `DROP VIEW` / `DROP MATERIALIZED VIEW`. The + /// dropped relation is a write target. Other DROP variants + /// (functions, schemas, indexes, etc.) classify as + /// [`Unsupported`](StatementKind::Unsupported). + Drop, + /// `TRUNCATE TABLE ...`. The truncated table is a write target. 
+ Truncate, + /// Statement is outside the operation-extraction scope. The + /// accompanying `diagnostics` list explains why. + Unsupported, +} + +/// A source-to-target table lineage edge inferred from the statement +/// structure. +/// +/// Emitted only for statements that physically move data into a target +/// (`INSERT`, `UPDATE`, `MERGE`, `CREATE TABLE AS SELECT`, `CREATE VIEW`). +/// `DELETE`, `DROP`, `TRUNCATE`, `ALTER`, and bare `SELECT` produce no +/// lineage even when they reference other tables — the touched tables are +/// still visible through [`TableOperation::reads`] and +/// [`TableOperation::writes`]. +/// +/// Each `TableLineageEdge` is a single directed edge — a statement that derives +/// `t` from `a JOIN b` emits two edges (`a → t`, `b → t`), not one entry +/// with both sources. This keeps equality and aggregation across +/// statements simple (set-union over edges). +/// +/// Tables referenced only inside a predicate subquery are excluded: +/// `INSERT INTO t SELECT FROM s WHERE id IN (SELECT id FROM x)` emits +/// `s → t` but not `x → t`. `x` remains visible via `reads`. +/// +/// CTE transitivity: `WITH cte AS (SELECT ... FROM s) INSERT INTO t +/// SELECT ... FROM cte` emits `s → t` because `s` sits in a +/// data-feeding chain from the CTE body up through the INSERT target. +/// Deeper transitivity (recursive CTEs, multi-hop indirection) is +/// intentionally out of scope for the MVP. +#[derive(Debug, Clone, PartialEq, Eq, Hash)] +pub struct TableLineageEdge { + pub source: TableReference, + pub target: TableReference, +} + +/// Extracts operations from SQL. 
+#[derive(Default, Debug)] +pub struct TableOperationExtractor; + +impl TableOperationExtractor { + pub fn extract( + dialect: &dyn Dialect, + sql: &str, + catalog: Option<&dyn Catalog>, + ) -> Result>, Error> { + let statements = Parser::parse_sql(dialect, sql)?; + Ok(statements + .iter() + .map(|s| Self::extract_from_statement(s, catalog)) + .collect()) + } + + pub fn extract_from_statement( + statement: &Statement, + catalog: Option<&dyn Catalog>, + ) -> Result { + let kind = classify_statement(statement); + let resolution = Resolver::resolve_statement(catalog, statement)?; + + let mut reads = Vec::new(); + let mut writes = Vec::new(); + // Start from resolver-level diagnostics, projected down to the + // table granularity — column-resolution gaps and suppressed + // wildcards don't affect table-level completeness, so they drop + // out here (only `UnsupportedStatement` carries over). Extractor + // adds its own only when classify_statement detects an unsupported + // case the resolver did not already report — avoids duplicating + // the common case where both layers agree. + let mut diagnostics: Vec = resolution + .diagnostics + .iter() + .filter_map(|d| d.to_table_level()) + .collect(); + + if matches!(kind, StatementKind::Unsupported) { + if !diagnostics + .iter() + .any(|d| matches!(d.kind, TableLevelDiagnosticKind::UnsupportedStatement)) + { + diagnostics.push(TableLevelDiagnostic { + kind: TableLevelDiagnosticKind::UnsupportedStatement, + message: format!( + "Unsupported statement for operation extraction: {}", + statement + ), + span: None, + }); + } + } else { + // A multi-role table (e.g. `DELETE t1 FROM t1` — t1 is both + // deletion target and row source) appears in both lists. 
+ reads = resolution.read_tables(); + writes = resolution.write_tables(); + } + + let lineage = extract_table_lineage(&resolution, &kind); + + Ok(TableOperation { + statement_kind: kind, + reads, + writes, + lineage, + diagnostics, + }) + } +} + +/// Emit one `TableLineageEdge` per (feeding source × write target) pair +/// for statements that physically move data. Statements without a write +/// target or without any data-feeding source produce no lineage. +fn extract_table_lineage( + resolution: &crate::resolver::Resolution, + kind: &StatementKind, +) -> Vec { + if !is_data_moving(kind) { + return Vec::new(); + } + // Data-moving statements all carry exactly one write target. If + // somehow zero or many appear (parser oddity, unsupported variant) + // we conservatively emit no lineage rather than guessing. + let mut targets = resolution.write_tables().into_iter(); + let Some(target) = targets.next() else { + return Vec::new(); + }; + resolution + .feeding_read_tables() + .into_iter() + .map(|source| TableLineageEdge { + source, + target: target.clone(), + }) + .collect() +} + +fn is_data_moving(kind: &StatementKind) -> bool { + matches!( + kind, + StatementKind::Insert + | StatementKind::Update + | StatementKind::Merge + | StatementKind::CreateTable + | StatementKind::CreateView + ) +} + +pub(super) fn classify_statement(statement: &Statement) -> StatementKind { + use sqlparser::ast::{ObjectType, SetExpr}; + match statement { + // `WITH cte AS (...) INSERT/UPDATE/DELETE/MERGE ...` is parsed + // by sqlparser as a top-level Query whose body is a + // `SetExpr::Insert/Update/Delete/Merge` wrapping the actual + // DML statement. Reclassify against the inner statement so + // the public StatementKind matches the verb the user wrote, + // not the parser-level wrapper. 
+ Statement::Query(query) => match query.body.as_ref() { + SetExpr::Insert(stmt) + | SetExpr::Update(stmt) + | SetExpr::Delete(stmt) + | SetExpr::Merge(stmt) => classify_statement(stmt), + _ => StatementKind::Select, + }, + Statement::Insert(_) => StatementKind::Insert, + Statement::Update(_) => StatementKind::Update, + Statement::Delete(_) => StatementKind::Delete, + Statement::Merge(_) => StatementKind::Merge, + Statement::CreateTable(_) | Statement::CreateVirtualTable { .. } => { + StatementKind::CreateTable + } + Statement::CreateView(_) => StatementKind::CreateView, + Statement::AlterTable(_) => StatementKind::AlterTable, + Statement::AlterView { .. } => StatementKind::AlterView, + Statement::Drop { + object_type: ObjectType::Table | ObjectType::View | ObjectType::MaterializedView, + .. + } => StatementKind::Drop, + Statement::Truncate(_) => StatementKind::Truncate, + // Drop variants that don't target relations (DROP FUNCTION, + // DROP SCHEMA, etc.) — and every other unsupported variant — + // fall through to Unsupported so the caller still gets a clear + // diagnostic. + _ => StatementKind::Unsupported, + } +} + +#[cfg(test)] +mod tests { + use super::*; + use sqlparser::dialect::{Dialect, GenericDialect, MySqlDialect, PostgreSqlDialect}; + + fn table(name: &str) -> TableReference { + TableReference { + catalog: None, + schema: None, + name: name.into(), + } + } + + fn edge(source: &str, target: &str) -> TableLineageEdge { + TableLineageEdge { + source: table(source), + target: table(target), + } + } + + /// Whole-value-ish assertion: pin down the full + /// `TableOperation` for `sql`, but compare diagnostics + /// by **kind sequence only** — message text and span coordinates + /// are ignored. This lets tests focus on "what was extracted" + /// without coupling to diagnostic wording or column offsets that + /// shift when SQL is reformatted. 
+ /// + /// Tests that genuinely care about the message / span shape + /// should fall back to per-field `assert_eq!`. + fn assert_ops(sql: &str, expected: TableOperation) { + assert_nth_ops_with(sql, 0, &GenericDialect {}, expected); + } + + fn assert_ops_with(sql: &str, dialect: &dyn Dialect, expected: TableOperation) { + assert_nth_ops_with(sql, 0, dialect, expected); + } + + /// Like `assert_ops`, but for multi-statement SQL — pins down the + /// statement at `index` in the parsed batch. Compose calls to pin + /// down every statement in a batch separately. + fn assert_nth_ops(sql: &str, index: usize, expected: TableOperation) { + assert_nth_ops_with(sql, index, &GenericDialect {}, expected); + } + + fn assert_nth_ops_with( + sql: &str, + index: usize, + dialect: &dyn Dialect, + expected: TableOperation, + ) { + let result = extract_table_operations(dialect, sql, None).unwrap(); + let actual = result + .into_iter() + .nth(index) + .unwrap_or_else(|| panic!("statement {index} missing in result for SQL: {sql}")) + .unwrap(); + let TableOperation { + statement_kind, + reads, + writes, + lineage, + diagnostics, + } = expected; + assert_eq!( + actual.statement_kind, statement_kind, + "kind for SQL: {sql} (statement {index})" + ); + assert_eq!( + actual.reads, reads, + "reads for SQL: {sql} (statement {index})" + ); + assert_eq!( + actual.writes, writes, + "writes for SQL: {sql} (statement {index})" + ); + assert_eq!( + actual.lineage, lineage, + "lineage for SQL: {sql} (statement {index})" + ); + let actual_kinds: Vec<_> = actual.diagnostics.iter().map(|d| d.kind.clone()).collect(); + let expected_kinds: Vec<_> = diagnostics.iter().map(|d| d.kind.clone()).collect(); + assert_eq!( + actual_kinds, expected_kinds, + "diagnostic kinds for SQL: {sql} (statement {index})" + ); + } + + /// Construct a placeholder `TableLevelDiagnostic` for the + /// `expected.diagnostics` list in `assert_ops`. Only the kind is + /// compared; the message and span are placeholders. 
+ fn diag(kind: TableLevelDiagnosticKind) -> TableLevelDiagnostic { + TableLevelDiagnostic { + kind, + message: String::new(), + span: None, + } + } + + mod select { + use super::*; + + #[test] + fn select_emits_reads_only() { + assert_ops( + "SELECT id FROM users", + TableOperation { + statement_kind: StatementKind::Select, + reads: vec![table("users")], + writes: vec![], + lineage: vec![], + diagnostics: vec![], + }, + ); + } + + #[test] + fn select_with_join_emits_one_read_per_table() { + // The `*` does not surface a diagnostic at table granularity — + // WildcardSuppressed is a column-level concern and is filtered + // out of table-level output (the table set is complete + // regardless of wildcard expansion). + assert_ops( + "SELECT * FROM t1 JOIN t2 ON t1.id = t2.id", + TableOperation { + statement_kind: StatementKind::Select, + reads: vec![table("t1"), table("t2")], + writes: vec![], + lineage: vec![], + diagnostics: vec![], + }, + ); + } + + #[test] + fn select_with_subquery_emits_read_for_every_table() { + assert_ops( + "SELECT t1.a FROM t1 WHERE id IN (SELECT id FROM t2)", + TableOperation { + statement_kind: StatementKind::Select, + reads: vec![table("t1"), table("t2")], + writes: vec![], + lineage: vec![], + diagnostics: vec![], + }, + ); + } + + #[test] + fn cte_body_tables_emit_reads_but_cte_name_does_not() { + // Only t1 is a table reference; t2 is the CTE binding and stays out. + assert_ops( + "WITH t2 AS (SELECT id FROM t1) SELECT t2.id FROM t2", + TableOperation { + statement_kind: StatementKind::Select, + reads: vec![table("t1")], + writes: vec![], + lineage: vec![], + diagnostics: vec![], + }, + ); + } + } + + mod set_operations { + use super::*; + + #[test] + fn union_emits_read_for_each_branch_table() { + // Each UNION branch walks its own FROM, so both tables + // surface in reads. No lineage: bare SELECT statements + // never produce table-level data movement. 
+ assert_ops( + "SELECT a FROM t1 UNION SELECT b FROM t2", + TableOperation { + statement_kind: StatementKind::Select, + reads: vec![table("t1"), table("t2")], + writes: vec![], + lineage: vec![], + diagnostics: vec![], + }, + ); + } + + #[test] + fn intersect_and_except_match_union_shape() { + // SetOperator variant doesn't influence table-level + // surfacing — INTERSECT and EXCEPT both walk both branches. + for op in ["INTERSECT", "EXCEPT"] { + let sql = format!("SELECT a FROM t1 {op} SELECT b FROM t2"); + assert_ops( + &sql, + TableOperation { + statement_kind: StatementKind::Select, + reads: vec![table("t1"), table("t2")], + writes: vec![], + lineage: vec![], + diagnostics: vec![], + }, + ); + } + } + + #[test] + fn insert_select_union_emits_one_lineage_edge_per_branch() { + // INSERT-SELECT-UNION moves data from each branch into the + // target, so both source tables surface as lineage sources. + assert_ops( + "INSERT INTO dst SELECT a FROM t1 UNION SELECT b FROM t2", + TableOperation { + statement_kind: StatementKind::Insert, + reads: vec![table("t1"), table("t2")], + writes: vec![table("dst")], + lineage: vec![edge("t1", "dst"), edge("t2", "dst")], + diagnostics: vec![], + }, + ); + } + + #[test] + fn ctas_with_union_body_emits_lineage_per_branch() { + assert_ops( + "CREATE TABLE dst AS SELECT a FROM t1 UNION SELECT b FROM t2", + TableOperation { + statement_kind: StatementKind::CreateTable, + reads: vec![table("t1"), table("t2")], + writes: vec![table("dst")], + lineage: vec![edge("t1", "dst"), edge("t2", "dst")], + diagnostics: vec![], + }, + ); + } + } + + mod diagnostics { + use super::*; + + #[test] + fn unsupported_statement_reports_diagnostic() { + assert_ops( + "CREATE INDEX idx ON t1 (a)", + TableOperation { + statement_kind: StatementKind::Unsupported, + reads: vec![], + writes: vec![], + lineage: vec![], + diagnostics: vec![diag(TableLevelDiagnosticKind::UnsupportedStatement)], + }, + ); + } + + #[test] + fn 
multiple_statements_produce_multiple_results() { + let sql = "SELECT * FROM t1; SELECT * FROM t2"; + assert_nth_ops( + sql, + 0, + TableOperation { + statement_kind: StatementKind::Select, + reads: vec![table("t1")], + writes: vec![], + lineage: vec![], + diagnostics: vec![], + }, + ); + assert_nth_ops( + sql, + 1, + TableOperation { + statement_kind: StatementKind::Select, + reads: vec![table("t2")], + writes: vec![], + lineage: vec![], + diagnostics: vec![], + }, + ); + } + } + + mod insert { + use super::*; + + #[test] + fn insert_values_emits_write_only() { + assert_ops( + "INSERT INTO t1 (a, b) VALUES (1, 2)", + TableOperation { + statement_kind: StatementKind::Insert, + reads: vec![], + writes: vec![table("t1")], + lineage: vec![], + diagnostics: vec![], + }, + ); + } + + #[test] + fn insert_select_emits_write_and_read() { + assert_ops( + "INSERT INTO t1 SELECT * FROM t2", + TableOperation { + statement_kind: StatementKind::Insert, + reads: vec![table("t2")], + writes: vec![table("t1")], + lineage: vec![edge("t2", "t1")], + diagnostics: vec![], + }, + ); + } + } + + mod update { + use super::*; + + #[test] + fn update_basic_emits_write_only() { + assert_ops( + "UPDATE t1 SET a = 1", + TableOperation { + statement_kind: StatementKind::Update, + reads: vec![], + writes: vec![table("t1")], + lineage: vec![], + diagnostics: vec![], + }, + ); + } + + #[test] + fn update_with_subquery_predicate_emits_write_plus_read() { + assert_ops( + "UPDATE t1 SET a = 1 WHERE id IN (SELECT id FROM t2)", + TableOperation { + statement_kind: StatementKind::Update, + reads: vec![table("t2")], + writes: vec![table("t1")], + lineage: vec![], + diagnostics: vec![], + }, + ); + } + + #[test] + fn update_with_from_clause_treats_from_as_read() { + // FROM t2 contributes rows to the UPDATE target → t2 → t1 + // lineage edge. SET RHS scalar subquery from t3 feeds the new + // value → t3 → t1 lineage edge. WHERE predicate subquery from + // t4 is predicate-only → no lineage. 
+ assert_ops_with( + "UPDATE t1 SET a = (SELECT b FROM t3) FROM t2 WHERE t1.id IN (SELECT id FROM t4)", + &PostgreSqlDialect {}, + TableOperation { + statement_kind: StatementKind::Update, + reads: vec![table("t2"), table("t3"), table("t4")], + writes: vec![table("t1")], + lineage: vec![edge("t2", "t1"), edge("t3", "t1")], + diagnostics: vec![], + }, + ); + } + } + + mod delete { + use super::*; + + #[test] + fn delete_from_emits_write_only() { + assert_ops( + "DELETE FROM t1", + TableOperation { + statement_kind: StatementKind::Delete, + reads: vec![], + writes: vec![table("t1")], + lineage: vec![], + diagnostics: vec![], + }, + ); + } + + #[test] + fn delete_from_with_subquery_predicate_emits_write_plus_read() { + assert_ops( + "DELETE FROM t1 WHERE id IN (SELECT id FROM t2)", + TableOperation { + statement_kind: StatementKind::Delete, + reads: vec![table("t2")], + writes: vec![table("t1")], + lineage: vec![], + diagnostics: vec![], + }, + ); + } + + #[test] + fn delete_with_target_list_overlaps_writes_and_reads() { + // `DELETE t1, t2 FROM t1 JOIN t2 JOIN t3` — t1 and t2 are both + // deletion targets (writes) AND row sources (reads via FROM). 
+ assert_ops_with( + "DELETE t1, t2 FROM t1 INNER JOIN t2 INNER JOIN t3", + &MySqlDialect {}, + TableOperation { + statement_kind: StatementKind::Delete, + reads: vec![table("t1"), table("t2"), table("t3")], + writes: vec![table("t1"), table("t2")], + lineage: vec![], + diagnostics: vec![], + }, + ); + } + + #[test] + fn delete_with_using_lists_target_in_writes_and_source_in_reads() { + assert_ops( + "DELETE FROM t1, t2 USING t1 INNER JOIN t2 INNER JOIN t3", + TableOperation { + statement_kind: StatementKind::Delete, + reads: vec![table("t1"), table("t2"), table("t3")], + writes: vec![table("t1"), table("t2")], + lineage: vec![], + diagnostics: vec![], + }, + ); + } + + #[test] + fn delete_resolves_target_alias_to_base_table() { + assert_ops_with( + "DELETE t1_alias FROM t1 AS t1_alias JOIN t2 ON t1_alias.a = t2.a", + &MySqlDialect {}, + TableOperation { + statement_kind: StatementKind::Delete, + reads: vec![table("t1"), table("t2")], + writes: vec![table("t1")], + lineage: vec![], + diagnostics: vec![], + }, + ); + } + } + + mod merge { + use super::*; + + #[test] + fn merge_emits_write_target_and_read_source() { + assert_ops( + "MERGE INTO t1 USING t2 ON t1.id = t2.id \ + WHEN MATCHED THEN UPDATE SET t1.b = t2.b", + TableOperation { + statement_kind: StatementKind::Merge, + reads: vec![table("t2")], + writes: vec![table("t1")], + lineage: vec![edge("t2", "t1")], + diagnostics: vec![], + }, + ); + } + } + + mod ddl { + use super::*; + + #[test] + fn create_table_emits_write_only() { + assert_ops( + "CREATE TABLE t1 (a INT)", + TableOperation { + statement_kind: StatementKind::CreateTable, + reads: vec![], + writes: vec![table("t1")], + lineage: vec![], + diagnostics: vec![], + }, + ); + } + + #[test] + fn create_table_as_select_emits_write_and_read() { + assert_ops( + "CREATE TABLE t1 AS SELECT * FROM t2", + TableOperation { + statement_kind: StatementKind::CreateTable, + reads: vec![table("t2")], + writes: vec![table("t1")], + lineage: vec![edge("t2", "t1")], + 
diagnostics: vec![], + }, + ); + } + + #[test] + fn create_view_emits_write_and_read() { + assert_ops( + "CREATE VIEW v1 AS SELECT * FROM t1", + TableOperation { + statement_kind: StatementKind::CreateView, + reads: vec![table("t1")], + writes: vec![table("v1")], + lineage: vec![edge("t1", "v1")], + diagnostics: vec![], + }, + ); + } + + #[test] + fn alter_table_emits_write_only() { + assert_ops( + "ALTER TABLE t1 ADD COLUMN a INT", + TableOperation { + statement_kind: StatementKind::AlterTable, + reads: vec![], + writes: vec![table("t1")], + lineage: vec![], + diagnostics: vec![], + }, + ); + } + + #[test] + fn drop_table_emits_one_write_per_name() { + assert_ops( + "DROP TABLE t1, t2", + TableOperation { + statement_kind: StatementKind::Drop, + reads: vec![], + writes: vec![table("t1"), table("t2")], + lineage: vec![], + diagnostics: vec![], + }, + ); + } + + #[test] + fn truncate_emits_one_write_per_name() { + assert_ops( + "TRUNCATE TABLE t1, t2", + TableOperation { + statement_kind: StatementKind::Truncate, + reads: vec![], + writes: vec![table("t1"), table("t2")], + lineage: vec![], + diagnostics: vec![], + }, + ); + } + + #[test] + fn drop_function_still_unsupported() { + // DROP variants that target non-relation objects don't carry a + // meaningful table-level operation. 
+ assert_ops( + "DROP FUNCTION my_fn", + TableOperation { + statement_kind: StatementKind::Unsupported, + reads: vec![], + writes: vec![], + lineage: vec![], + diagnostics: vec![diag(TableLevelDiagnosticKind::UnsupportedStatement)], + }, + ); + } + } + + mod lineage { + use super::*; + + #[test] + fn insert_select_emits_lineage_from_source_to_target() { + assert_ops( + "INSERT INTO t1 SELECT * FROM t2", + TableOperation { + statement_kind: StatementKind::Insert, + reads: vec![table("t2")], + writes: vec![table("t1")], + lineage: vec![edge("t2", "t1")], + diagnostics: vec![], + }, + ); + } + + #[test] + fn insert_select_join_emits_one_lineage_edge_per_source() { + assert_ops( + "INSERT INTO t1 SELECT * FROM t2 JOIN t3 ON t2.id = t3.id", + TableOperation { + statement_kind: StatementKind::Insert, + reads: vec![table("t2"), table("t3")], + writes: vec![table("t1")], + lineage: vec![edge("t2", "t1"), edge("t3", "t1")], + diagnostics: vec![], + }, + ); + } + + #[test] + fn predicate_subquery_does_not_feed_lineage() { + // t3 is referenced only inside `WHERE id IN (SELECT id FROM t3)`, + // so it must not appear as a lineage source even though it does + // appear in `reads`. + assert_ops( + "INSERT INTO t1 SELECT * FROM t2 WHERE id IN (SELECT id FROM t3)", + TableOperation { + statement_kind: StatementKind::Insert, + reads: vec![table("t2"), table("t3")], + writes: vec![table("t1")], + lineage: vec![edge("t2", "t1")], + diagnostics: vec![], + }, + ); + } + + #[test] + fn join_on_predicate_does_not_promote_to_lineage() { + // t4 is in JOIN ON's predicate subquery — touches as read + // but doesn't promote to a lineage edge (predicate position excluded + // from data-feeding chain). 
+ assert_ops( + "INSERT INTO t1 SELECT * FROM t2 JOIN t3 ON t2.id = t3.id \ + AND t2.id IN (SELECT id FROM t4)", + TableOperation { + statement_kind: StatementKind::Insert, + reads: vec![table("t2"), table("t3"), table("t4")], + writes: vec![table("t1")], + lineage: vec![edge("t2", "t1"), edge("t3", "t1")], + diagnostics: vec![], + }, + ); + } + + #[test] + fn update_scalar_subquery_in_set_feeds_lineage() { + assert_ops( + "UPDATE t1 SET col = (SELECT v FROM t2)", + TableOperation { + statement_kind: StatementKind::Update, + reads: vec![table("t2")], + writes: vec![table("t1")], + lineage: vec![edge("t2", "t1")], + diagnostics: vec![], + }, + ); + } + + #[test] + fn update_predicate_subquery_does_not_feed_lineage() { + assert_ops( + "UPDATE t1 SET col = 1 WHERE id IN (SELECT id FROM t2)", + TableOperation { + statement_kind: StatementKind::Update, + reads: vec![table("t2")], + writes: vec![table("t1")], + lineage: vec![], + diagnostics: vec![], + }, + ); + } + + #[test] + fn create_table_as_select_emits_lineage() { + assert_ops( + "CREATE TABLE t1 AS SELECT * FROM t2", + TableOperation { + statement_kind: StatementKind::CreateTable, + reads: vec![table("t2")], + writes: vec![table("t1")], + lineage: vec![edge("t2", "t1")], + diagnostics: vec![], + }, + ); + } + + #[test] + fn create_view_emits_lineage() { + assert_ops( + "CREATE VIEW v1 AS SELECT * FROM t1", + TableOperation { + statement_kind: StatementKind::CreateView, + reads: vec![table("t1")], + writes: vec![table("v1")], + lineage: vec![edge("t1", "v1")], + diagnostics: vec![], + }, + ); + } + + #[test] + fn merge_emits_lineage_from_source_to_target() { + assert_ops( + "MERGE INTO t1 USING t2 ON t1.id = t2.id \ + WHEN MATCHED THEN UPDATE SET t1.b = t2.b", + TableOperation { + statement_kind: StatementKind::Merge, + reads: vec![table("t2")], + writes: vec![table("t1")], + lineage: vec![edge("t2", "t1")], + diagnostics: vec![], + }, + ); + } + + #[test] + fn cte_data_reaches_write_target() { + assert_ops( + 
"INSERT INTO t1 WITH cte AS (SELECT * FROM s) SELECT * FROM cte", + TableOperation { + statement_kind: StatementKind::Insert, + reads: vec![table("s")], + writes: vec![table("t1")], + lineage: vec![edge("s", "t1")], + diagnostics: vec![], + }, + ); + } + + #[test] + fn cte_predicate_subquery_does_not_leak_into_lineage() { + // x is in the CTE body's WHERE predicate subquery — touches + // as read but doesn't promote to a lineage edge. + assert_ops( + "INSERT INTO t1 WITH cte AS (\ + SELECT * FROM s WHERE id IN (SELECT id FROM x)\ + ) SELECT * FROM cte", + TableOperation { + statement_kind: StatementKind::Insert, + reads: vec![table("s"), table("x")], + writes: vec![table("t1")], + lineage: vec![edge("s", "t1")], + diagnostics: vec![], + }, + ); + } + + #[test] + fn select_only_statement_emits_no_lineage() { + assert_ops( + "SELECT * FROM t1 JOIN t2 ON t1.id = t2.id", + TableOperation { + statement_kind: StatementKind::Select, + reads: vec![table("t1"), table("t2")], + writes: vec![], + lineage: vec![], + diagnostics: vec![], + }, + ); + } + + #[test] + fn insert_values_emits_no_lineage() { + assert_ops( + "INSERT INTO t1 VALUES (1, 2)", + TableOperation { + statement_kind: StatementKind::Insert, + reads: vec![], + writes: vec![table("t1")], + lineage: vec![], + diagnostics: vec![], + }, + ); + } + + #[test] + fn delete_with_subquery_predicate_emits_no_lineage() { + // DELETE doesn't move data — no lineage, even when a subquery + // references another table. 
+ assert_ops( + "DELETE FROM t1 WHERE id IN (SELECT id FROM t2)", + TableOperation { + statement_kind: StatementKind::Delete, + reads: vec![table("t2")], + writes: vec![table("t1")], + lineage: vec![], + diagnostics: vec![], + }, + ); + } + + #[test] + fn truncate_emits_no_lineage() { + assert_ops( + "TRUNCATE TABLE t1", + TableOperation { + statement_kind: StatementKind::Truncate, + reads: vec![], + writes: vec![table("t1")], + lineage: vec![], + diagnostics: vec![], + }, + ); + } + } +} diff --git a/sql-insight/src/lib.rs b/sql-insight/src/lib.rs index 1a3db58..6e8ef15 100644 --- a/sql-insight/src/lib.rs +++ b/sql-insight/src/lib.rs @@ -1,36 +1,187 @@ //! # sql-insight //! -//! `sql-insight` is a utility designed for SQL query analysis, formatting, and transformation. +//! Operation extraction for SQL, built on +//! [`sqlparser-rs`](https://crates.io/crates/sqlparser). Turn a SQL +//! string into structured facts about what a statement does — +//! which tables and columns it reads, which it writes, and how data +//! moves from sources to targets — alongside utilities for +//! formatting and normalization. //! //! ## Main Functionalities //! -//! - **SQL Formatting**: Format SQL queries into a standardized format. See the [`formatter`] module for more information. -//! - **SQL Normalization**: Normalize SQL queries by abstracting literals. See the [`normalizer`] module for more information. -//! - **Table Extraction**: Extract tables within SQL queries. See the [`table_extractor`] module for more information. -//! - **CRUD Table Extraction**: Extract CRUD tables from SQL queries. See the [`crud_table_extractor`] module for more information. +//! - **SQL Formatting** — pretty-print SQL with a standardized +//! layout. See [`formatter`]. +//! - **SQL Normalization** — abstract literals into placeholders so +//! structurally identical queries hash to the same shape. See +//! [`normalizer`]. +//! - **Table Extraction** — flat list of +//! 
[`TableReference`]s touched by a statement. See +//! [`extract_tables`]. +//! - **CRUD Table Extraction** — CRUD-bucketed table sets per +//! statement. See [`extract_crud_tables`]. +//! - **Table-level Operation Extraction** — `reads` / `writes` / +//! `lineage` surfaces with [`StatementKind`] classification. See +//! [`extract_table_operations`]. +//! - **Column-level Operation Extraction** — the same three +//! surfaces at column granularity. `reads` / `writes` are plain +//! occurrence lists of [`ColumnReference`]s; `lineage` edges form a +//! source → target graph carrying [`ColumnLineageKind`] +//! (`Passthrough` vs `Transformation`). The value-vs-filter +//! distinction is structural: a value contributor is a `lineage` +//! source, a filter-only column is in `reads` but not `lineage`. +//! See [`extract_column_operations`]. +//! - **Optional [`Catalog`]** — supply a schema provider to make +//! resolution strict (catch typos as +//! [`UnresolvedColumn`](ColumnLevelDiagnosticKind::UnresolvedColumn), +//! pair INSERT positional values with target columns, etc.). +//! Every extractor works catalog-free in best-effort mode. +//! - **Diagnostics** ([`TableLevelDiagnostic`] / [`ColumnLevelDiagnostic`]) +//! — non-fatal issues surface alongside the extraction result rather +//! than failing the whole call: unsupported statements, suppressed +//! wildcards, ambiguous / unresolved columns. Split by granularity so a +//! table-level result can't carry a column-only condition. //! //! ## Quick Start //! -//! Here's a quick example to get you started with SQL formatting: +//! Table-level operation extraction — get `reads` / `writes` / +//! `lineage` and the statement kind from a single call: //! //! ```rust //! use sql_insight::sqlparser::dialect::GenericDialect; +//! use sql_insight::{extract_table_operations, StatementKind}; //! //! let dialect = GenericDialect {}; -//! let normalized_sql = sql_insight::format(&dialect, "SELECT * \n from users WHERE id = 1").unwrap(); -//!
assert_eq!(normalized_sql, ["SELECT * FROM users WHERE id = 1"]); +//! let result = extract_table_operations( +//! &dialect, +//! "INSERT INTO orders (id) SELECT id FROM staging", +//! None, +//! ).unwrap(); +//! let ops = result[0].as_ref().unwrap(); +//! assert_eq!(ops.statement_kind, StatementKind::Insert); +//! assert_eq!(ops.reads.len(), 1); // staging +//! assert_eq!(ops.writes.len(), 1); // orders +//! assert_eq!(ops.lineage.len(), 1); // staging → orders //! ``` //! -//! For more comprehensive examples and usage, refer to [crates.io](https://crates.io/crates/sql-insight) or the documentation of each module. +//! SQL formatting: +//! +//! ```rust +//! use sql_insight::sqlparser::dialect::GenericDialect; +//! +//! let dialect = GenericDialect {}; +//! let formatted = sql_insight::format( +//! &dialect, "SELECT * \n from users WHERE id = 1" +//! ).unwrap(); +//! assert_eq!(formatted, ["SELECT * FROM users WHERE id = 1"]); +//! ``` +//! +//! ## Vocabulary +//! +//! Operation extraction returns three parallel surfaces per +//! statement: +//! +//! - `reads` — every table (or column) the statement reads from. +//! - `writes` — every table (or column) the statement writes to. A +//! table that plays both roles (e.g. `DELETE t1 FROM t1`) appears +//! in both. +//! - `lineage` — directed `source → target` edges, emitted only for +//! statements that physically move data (`INSERT` / `UPDATE` / +//! `MERGE` / `CREATE TABLE AS` / `CREATE VIEW`). +//! +//! For column-level lineage, [`ColumnLineageKind`] makes one clean +//! distinction: `Passthrough` (the value is forwarded unchanged; a +//! rename still counts) vs `Transformation` (any expression that +//! changes the value — arithmetic, function calls, aggregates, +//! window functions, CASE, casts, …). `reads` / `writes` are plain +//! occurrence lists of column references with no clause tag; whether +//! a column contributes a value or merely influences the result +//! (e.g. 
a `WHERE` predicate) is recovered structurally — value +//! contributors appear as `lineage` sources, filter-only columns do +//! not. +//! +//! ## Limitations +//! +//! Intentional non-support and known gaps — set expectations before +//! relying on a given output: +//! +//! - **Wildcards not expanded**: `SELECT *` / `t.*` contribute +//! nothing to `reads` / `lineage`. Expanding them safely would +//! require modelling USING / NATURAL JOIN merge, EXCLUDE / REPLACE +//! clauses, and multi-level aliases — too much rigor for a +//! SQL-text-only library. Surfaced as +//! [`WildcardSuppressed`](ColumnLevelDiagnosticKind::WildcardSuppressed) so +//! consumers can detect incomplete projections. +//! - **TableFunction schemas stay `Unknown`** (`UNNEST`, +//! `generate_series`, `JSON_TABLE`, etc.) — catalog enrichment +//! doesn't reach them yet. +//! - **Recursive CTE bodies** are pre-bound under a stub for +//! self-reference; their projection composition is deferred, so +//! `lineage` won't trace through them end-to-end. +//! - **Lineage kind is coarse** (`Passthrough` vs `Transformation`). +//! Aggregates, window functions, arithmetic, casts, etc. are all +//! `Transformation` — the model deliberately does not sub-classify +//! "changed" values (that distinction is lossy for edge cases like +//! window aggregates and value-preserving `STRING_AGG`, and not +//! needed for the core dependency / impact-analysis use case). +//! - **Multi-segment qualifiers** (`s.t.col`): only the head `s` +//! is matched against in-scope bindings for synthetic-vs-real +//! classification — schema- / catalog-qualified shapes resolve +//! loosely. +//! - **No type checking**: the catalog is an enrichment input, +//! not a validator. Type compatibility, coercion, and nullability +//! are out of scope. +//! +//! ## Behavior notes +//! +//! - **Catalog is optional, but load-bearing for column lineage**. +//! Table-level extraction is robust catalog-free — a table's +//! 
identity comes straight from the FROM clause. Column-level +//! extraction degrades without one: an unqualified column across +//! multiple in-scope tables (`SELECT x FROM a JOIN b`) is not +//! determinable from the SQL text alone, so it resolves to +//! `table: None`. Qualified (`t.col`) and single-table refs resolve +//! fine catalog-free. The ambiguous / unresolved-column diagnostics +//! that explain those `None`s fire only *with* a catalog; without +//! one they are suppressed (every `Unknown` schema could contain +//! anything, so flagging would flood the output with noise). With a +//! catalog, those diagnostics fire and INSERT positional pairing +//! pairs source projections with target columns. +//! - **Per-statement isolation**: every extractor returns +//! `Vec>` so a bad statement in a multi-statement +//! batch doesn't take the rest down. +//! - **Fatal vs non-fatal split**: parser failures and structural +//! problems short-circuit as `Err`; semantic issues (unsupported +//! statement, ambiguity, suppressed wildcards) surface in the +//! per-statement `diagnostics` list instead. +//! - **[`TableReference`] / [`ColumnReference`] are identity-only**. +//! No `alias` field — alias is use-site decoration. `HashSet` +//! dedup behaves intuitively across statements. +//! - **Set operations follow the left side**: the result schema of +//! `UNION` / `INTERSECT` / `EXCEPT` takes its column names from +//! the left branch, mirroring SQL's conventional behaviour. +//! - **Public enums are exhaustive while the crate is pre-1.0.** Adding +//! a variant to [`StatementKind`] / [`ColumnLineageKind`] / +//! [`ColumnTarget`] / the diagnostic-kind enums is therefore a visible +//! breaking change — deliberate, so consumers re-acknowledge each new +//! case rather than silently routing it to a wildcard arm. They will +//! likely gain `#[non_exhaustive]` at the 1.0 freeze, once the variant +//! sets stabilize. 
+pub mod catalog; +pub mod diagnostic; pub mod error; pub mod extractor; pub mod formatter; pub mod normalizer; +pub mod reference; +pub(crate) mod resolver; +pub use catalog::{Catalog, ColumnSchema}; +pub use diagnostic::*; pub use extractor::*; pub use formatter::*; pub use normalizer::*; +pub use reference::*; pub use sqlparser; #[doc(hidden)] diff --git a/sql-insight/src/reference.rs b/sql-insight/src/reference.rs new file mode 100644 index 0000000..dfcf6e0 --- /dev/null +++ b/sql-insight/src/reference.rs @@ -0,0 +1,134 @@ +//! Reference (identity) types shared by SQL inspection features. +//! +//! [`TableReference`] / [`ColumnReference`] are *qualified names* that +//! denote a table / column in a catalog or schema — pure identity, not +//! a relation (no tuples) nor a schema (no attribute types). They carry +//! only enough to name the thing and compare two names for equality. + +use core::fmt; + +use crate::error::Error; +use sqlparser::ast::{Ident, Insert, ObjectName, TableFactor, TableObject}; + +/// Physical table identity — the `catalog.schema.name` triplet. +/// +/// `TableReference` deliberately carries no alias: aliasing is a +/// use-site decoration, not part of a table's identity. Two SQL +/// fragments that reference the same physical table produce equal +/// `TableReference`s regardless of how they alias it, so `HashSet` / +/// `HashMap` dedup behaves intuitively and cross-statement comparison +/// is direct. Use-site alias information, when needed, is carried by +/// the structures that wrap a `TableReference` (e.g. resolver bindings). +#[derive(Clone, Debug, PartialEq, Eq, Hash)] +pub struct TableReference { + pub catalog: Option, + pub schema: Option, + pub name: Ident, +} + +/// A column-level identity reference: an optional owning table plus the +/// column name. 
+/// +/// `table` is `Option` because some column references cannot be +/// resolved structurally (ambiguous unqualified columns, references to +/// derived tables we do not yet expand, etc.) — in that case a +/// diagnostic accompanies the operation. Identity is name-based: two +/// `ColumnReference`s with the same `table` and `name` compare equal, +/// independent of where they appeared in the SQL. +#[derive(Clone, Debug, PartialEq, Eq, Hash)] +pub struct ColumnReference { + pub table: Option, + pub name: Ident, +} + +impl TableReference { + pub fn has_qualifiers(&self) -> bool { + self.catalog.is_some() || self.schema.is_some() + } + + pub fn try_from_name(name: &ObjectName) -> Result { + match name.0.len() { + 0 => unreachable!("Parser should not allow empty identifiers"), + 1 => Ok(TableReference { + catalog: None, + schema: None, + name: name.0[0].as_ident().unwrap().clone(), + }), + 2 => Ok(TableReference { + catalog: None, + schema: Some(name.0[0].as_ident().unwrap().clone()), + name: name.0[1].as_ident().unwrap().clone(), + }), + 3 => Ok(TableReference { + catalog: Some(name.0[0].as_ident().unwrap().clone()), + schema: Some(name.0[1].as_ident().unwrap().clone()), + name: name.0[2].as_ident().unwrap().clone(), + }), + _ => Err(Error::AnalysisError( + "Too many identifiers provided".to_string(), + )), + } + } +} + +impl fmt::Display for TableReference { + fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { + let mut parts = Vec::new(); + if let Some(catalog) = &self.catalog { + parts.push(catalog.to_string()); + } + if let Some(schema) = &self.schema { + parts.push(schema.to_string()); + } + parts.push(self.name.to_string()); + write!(f, "{}", parts.join(".")) + } +} + +impl TryFrom<&Insert> for TableReference { + type Error = Error; + + fn try_from(value: &Insert) -> Result { + Self::from_insert_with_alias(value).map(|(table, _)| table) + } +} + +impl TryFrom<&TableFactor> for TableReference { + type Error = Error; + + fn try_from(table: 
&TableFactor) -> Result { + Self::from_table_factor_with_alias(table).map(|(table, _)| table) + } +} + +impl TryFrom<&ObjectName> for TableReference { + type Error = Error; + + fn try_from(obj_name: &ObjectName) -> Result { + Self::try_from_name(obj_name) + } +} + +impl TableReference { + /// Parse an INSERT statement's target into (identity, alias) pair. + pub fn from_insert_with_alias(value: &Insert) -> Result<(Self, Option), Error> { + let name = match &value.table { + TableObject::TableName(object_name) => object_name, + TableObject::TableFunction(function) => &function.name, + }; + Ok((Self::try_from_name(name)?, value.table_alias.clone())) + } + + /// Parse a TableFactor (must be `TableFactor::Table`) into (identity, alias) pair. + pub fn from_table_factor_with_alias( + table: &TableFactor, + ) -> Result<(Self, Option), Error> { + match table { + TableFactor::Table { name, alias, .. } => Ok(( + Self::try_from_name(name)?, + alias.as_ref().map(|a| a.name.clone()), + )), + _ => unreachable!("TableFactor::Table expected"), + } + } +} diff --git a/sql-insight/src/resolver.rs b/sql-insight/src/resolver.rs new file mode 100644 index 0000000..ddd2ea0 --- /dev/null +++ b/sql-insight/src/resolver.rs @@ -0,0 +1,251 @@ +//! Walks a `sqlparser` `Statement` once and produces a +//! [`Resolution`] carrying scope bindings, captured column +//! references, and lineage edges. Two post-passes +//! ([`Resolution::composed_lineage_edges`] and +//! [`Resolution::real_column_refs`]) refine the raw walk +//! data into the public extraction surfaces. +//! +//! Module layout (all sub-modules are crate-internal): +//! +//! - [`binding`]: scope arena, `Binding` enum, scope traversal, +//! binder methods on `Resolver`. +//! - [`context`]: the scoped `with_*` helpers that save / restore +//! `scope_kind` around a clause walk. +//! - [`column_ref`]: `RawColumnRef` and walk-time resolution of +//! identifier parts to owning tables. +//! 
- [`projection`]: `ProjectionGroup` / `ProjectionItem` and the +//! passthrough-vs-transformation classification helper. +//! - [`lineage`]: `LineageEdge` / `LineageTargetSpec` and the emit +//! helpers that drive INSERT / CTAS / QueryOutput edge construction. +//! - [`composition`]: post-walk passes that substitute synthetic +//! sources and filter synthetic reads. +//! - [`rename`]: CTE / derived column-alias renaming. +//! - Walker modules ([`expr`], [`query`], [`statement`], [`table`]): +//! `visit_*` methods on `Resolver`, one per major AST +//! region. + +mod binding; +mod column_ref; +mod composition; +mod context; +mod lineage; +mod projection; +mod rename; + +mod expr; +mod query; +mod statement; +mod table; + +pub(crate) use binding::{Binding, RelationSchema, Scope, ScopeId, ScopeKind, TableRole}; +pub(crate) use column_ref::RawColumnRef; +pub(crate) use lineage::{LineageEdge, LineageTargetSpec}; +pub(crate) use projection::{ProjectionGroup, ProjectionItem}; + +// Internal helpers used by walkers via `super::*`. Some are +// resolver-internal infrastructure (`BindingKey`, `ScopeStack`, +// binding helpers); rename helpers are surfaced for the CTE / +// derived-table walkers in resolver/query.rs and resolver/table.rs. +use binding::ScopeStack; +pub(super) use rename::{rename_projection_groups, rename_relation_schema}; + +use sqlparser::ast::Statement; + +use crate::catalog::Catalog; +use crate::diagnostic::ColumnLevelDiagnostic; +use crate::error::Error; + +/// The end-of-walk result the resolver produces. Holds the scope +/// arena and the raw column refs / lineage edges collected during the +/// walk, plus accumulated diagnostics. Two post-passes inside +/// [`Resolver::into_resolution`] refine +/// `column_refs` and `lineage_edges` before the resolution leaves the +/// resolver.
+#[derive(Debug)] +pub(crate) struct Resolution { + pub(crate) diagnostics: Vec, + pub(crate) scopes: Vec, + /// Column refs that survive the synthetic-binding filter (see + /// [`Resolution::real_column_refs`]). + pub(crate) column_refs: Vec, + /// Lineage edges after end-to-end composition through CTE / derived + /// intermediates (see + /// [`Resolution::composed_lineage_edges`]). + pub(crate) lineage_edges: Vec, +} + +/// What `resolve_query` returns: the body's `output_schema` and the +/// body projections per top-level SELECT (one entry, or one per UNION +/// branch). Callers decide whether to emit `QueryOutput` edges +/// (default), pair positionally with relation target columns +/// (INSERT / CTAS), or bubble them through `SetExpr::Query`. +#[derive(Debug, Clone)] +pub(crate) struct ResolvedQuery { + pub(crate) output_schema: RelationSchema, + pub(crate) projections: Vec, +} + +/// The walker. Owns the scope stack, the in-progress refs / edges, +/// the current projection buffer, and the lexical `scope_kind`. All +/// `visit_*` methods (in the walker sub-modules) and the various +/// `bind_*` / `record_*` / `with_*` helpers live as `impl` blocks +/// across the sub-modules — this is just the data shape and the +/// top-level entry point. +#[derive(Debug)] +pub(crate) struct Resolver<'a> { + /// `None` means the resolver runs without external schema + /// enrichment; table schemas stay `RelationSchema::Unknown` in + /// that case. + catalog: Option<&'a dyn Catalog>, + diagnostics: Vec, + scopes: ScopeStack, + column_refs: Vec, + lineage_edges: Vec, + /// Per-query buffer of projection groups collected by + /// `visit_select`. `resolve_query` swaps a fresh buffer in for + /// the duration of its walk and packs the collected groups into + /// the returned `ResolvedQuery`, so each query gets exactly its + /// own projections. 
+ current_projections: Vec, + /// Lexical context stamped onto every scope pushed while it is in + /// effect: `Body` by default, flipped to `Predicate` by + /// [`Resolver::with_filter_clause`] so subqueries nested in WHERE / + /// HAVING / JOIN ON etc. are excluded from table-lineage. Propagates + /// *through* subquery boundaries (a subquery in a predicate is itself + /// predicate-position). + scope_kind: ScopeKind, +} + +impl<'a> Resolver<'a> { + fn new(catalog: Option<&'a dyn Catalog>) -> Self { + Self { + catalog, + diagnostics: Vec::new(), + scopes: ScopeStack::default(), + column_refs: Vec::new(), + lineage_edges: Vec::new(), + current_projections: Vec::new(), + scope_kind: ScopeKind::Body, + } + } + + pub(crate) fn resolve_statement( + catalog: Option<&'a dyn Catalog>, + statement: &Statement, + ) -> Result { + let mut resolver = Self::new(catalog); + resolver.visit_statement(statement)?; + Ok(resolver.into_resolution()) + } + + fn into_resolution(self) -> Resolution { + let mut resolution = Resolution { + diagnostics: self.diagnostics, + scopes: self.scopes.into_scopes(), + column_refs: self.column_refs, + lineage_edges: self.lineage_edges, + }; + // Two post-passes, both rely on the scope arena being final: + // - compose lineage edges so synthetic-binding (Cte/Derived) + // sources are substituted with their body's source refs; + // - filter column refs so synthetic-owned ones don't surface + // in the public reads list. 
+ resolution.lineage_edges = resolution.composed_lineage_edges(); + resolution.column_refs = resolution.real_column_refs(); + resolution + } +} + +#[cfg(test)] +mod tests { + use super::*; + use crate::catalog::ColumnSchema; + use crate::reference::TableReference; + use sqlparser::dialect::GenericDialect; + use sqlparser::parser::Parser; + use std::collections::HashMap; + + #[derive(Debug, Default)] + struct TestCatalog { + tables: HashMap>, + } + + impl TestCatalog { + fn with(mut self, name: &str, cols: Vec<&'static str>) -> Self { + self.tables.insert(name.to_string(), cols); + self + } + } + + impl Catalog for TestCatalog { + fn columns(&self, table: &TableReference) -> Option> { + self.tables.get(table.name.value.as_str()).map(|cols| { + cols.iter() + .map(|c| ColumnSchema { + name: c.to_string(), + }) + .collect() + }) + } + } + + fn resolve(sql: &str, catalog: Option<&dyn Catalog>) -> Resolution { + let dialect = GenericDialect {}; + let statements = Parser::parse_sql(&dialect, sql).unwrap(); + Resolver::resolve_statement(catalog, &statements[0]).unwrap() + } + + fn first_table_schema(resolution: &Resolution) -> Option<&RelationSchema> { + resolution + .scopes + .iter() + .flat_map(|scope| scope.bindings.values()) + .find_map(|binding| match binding { + Binding::Table { schema, .. 
} => Some(schema), + _ => None, + }) + } + + #[test] + fn catalog_hit_populates_table_schema() { + let catalog = TestCatalog::default().with("users", vec!["id", "email"]); + let resolution = resolve("SELECT * FROM users", Some(&catalog)); + match first_table_schema(&resolution) { + Some(RelationSchema::Known(cols)) => { + assert_eq!(cols.len(), 2); + assert_eq!(cols[0].value, "id"); + assert_eq!(cols[1].value, "email"); + } + other => panic!("expected RelationSchema::Known(...), got {:?}", other), + } + } + + #[test] + fn catalog_miss_keeps_schema_unknown() { + let catalog = TestCatalog::default(); + let resolution = resolve("SELECT * FROM users", Some(&catalog)); + assert!(matches!( + first_table_schema(&resolution), + Some(RelationSchema::Unknown) + )); + } + + #[test] + fn no_catalog_keeps_schema_unknown() { + let resolution = resolve("SELECT * FROM users", None); + assert!(matches!( + first_table_schema(&resolution), + Some(RelationSchema::Unknown) + )); + } + + #[test] + fn catalog_lookup_ignores_alias() { + let catalog = TestCatalog::default().with("users", vec!["id"]); + let resolution = resolve("SELECT * FROM users AS u", Some(&catalog)); + assert!(matches!( + first_table_schema(&resolution), + Some(RelationSchema::Known(_)) + )); + } +} diff --git a/sql-insight/src/resolver/binding.rs b/sql-insight/src/resolver/binding.rs new file mode 100644 index 0000000..99adb35 --- /dev/null +++ b/sql-insight/src/resolver/binding.rs @@ -0,0 +1,591 @@ +//! Scope arena, `Binding` enum, and the resolver-side helpers that +//! create and inspect them. + +use indexmap::IndexMap; +use sqlparser::ast::{Ident, ObjectName, Statement}; +use sqlparser::tokenizer::Span; + +use crate::catalog::ColumnSchema; +use crate::diagnostic::{ColumnLevelDiagnostic, ColumnLevelDiagnosticKind}; +use crate::reference::TableReference; + +use super::{ProjectionGroup, Resolution, Resolver}; + +/// Internal role a table binding carries within a statement. 
Surfaced +/// to the operation extractor via [`Resolution::read_tables`] +/// and [`Resolution::write_tables`]; the public API exposes +/// two separate lists instead of this enum. +#[derive(Clone, Copy, Debug, PartialEq, Eq, Hash)] +pub(crate) enum TableRole { + Read, + Write, +} + +#[derive(Clone, Copy, Debug, PartialEq, Eq, Hash)] +pub(crate) struct ScopeId(pub(super) usize); + +/// Whether a scope contributes data to its enclosing write target. +/// +/// - `Body`: data moves through — query bodies, CTE bodies, derived +/// tables, INSERT/MERGE sources, scalar subqueries in projection or +/// SET. Tables bound here participate in `TableLineageEdge` edges when the +/// statement has a write target. +/// - `Predicate`: scope is referenced only in a constraint — WHERE, +/// HAVING, JOIN ON, EXISTS, IN, QUALIFY. Tables bound under any +/// Predicate ancestor are filtered out of `TableLineageEdge` regardless of +/// their own kind, so `INSERT INTO t SELECT FROM s WHERE id IN +/// (SELECT id FROM x)` emits `s → t` but not `x → t`. +#[derive(Clone, Copy, Debug, PartialEq, Eq, Hash)] +pub(crate) enum ScopeKind { + Body, + Predicate, +} + +/// A normalized identifier key for binding lookup. +/// +/// Two identifiers match iff their normalized forms are equal. The +/// rule: fold an unquoted name to lowercase, keep a quoted name exact. +/// So `"id"` and unquoted `id` are the same column, while `"ID"` and +/// `id` are not. +/// +/// This is one fixed rule, applied uniformly — it is *not* varied by +/// dialect, nor by table-vs-column. Real dialects do diverge there +/// (e.g. MySQL / BigQuery / SQLite treat quoting as mere escaping and +/// keep quoted names case-insensitive; BigQuery columns are +/// case-insensitive but its tables are case-sensitive; ClickHouse is +/// fully case-sensitive). 
Modelling each faithfully would need a +/// per-dialect identifier-resolution strategy, which is deferred — the +/// fixed rule here is a deliberate common-denominator approximation: +/// +/// - **Unquoted → lowercase** makes unquoted matching case-insensitive, +/// which every supported dialect except ClickHouse does. (ClickHouse +/// is over-matched — sound, just imprecise.) The fold *direction* +/// only affects the quoted/unquoted edge; lowercase follows the +/// popular majority (PG / MySQL / SQLite / BigQuery / Redshift / Spark) +/// over the uppercase minority (ANSI / Oracle / Snowflake). +/// - **Quoted → exact** follows the ANSI / PostgreSQL family, where +/// quoting makes an identifier case-sensitive. The MySQL / BigQuery / +/// SQLite family instead treat quoting as escaping, so this is +/// stricter than they are for quoted names — accepted, since quoted +/// identifiers are rare in practice. +#[derive(Clone, Debug, PartialEq, Eq, Hash)] +pub(super) struct BindingKey(String); + +impl BindingKey { + pub(super) fn from_ident(ident: &Ident) -> Self { + Self(if ident.quote_style.is_some() { + ident.value.clone() + } else { + ident.value.to_ascii_lowercase() + }) + } +} + +#[derive(Clone, Debug, PartialEq, Eq)] +pub(crate) enum RelationSchema { + /// Column names of a relation with a known schema (from the + /// catalog). Just the names — the resolver needs identity, not + /// types. + Known(Vec), + Unknown, +} + +/// What's bound to a name in a [`Scope`] — a real Table or +/// one of the synthetic intermediates (CTE / derived subquery / table +/// function) that SQL exposes as a named row set. +#[derive(Clone, Debug, PartialEq, Eq)] +pub(crate) enum Binding { + // `table` is boxed because the variant otherwise dwarfs the others + // (TableReference is ~300B) and inflates the entire enum's size. + Table { + table: Box, + /// Alias given at this use-site, if any. 
Kept separately so + /// `TableReference` stays alias-free for catalog lookup and + /// cross-statement comparison. + alias: Option, + schema: RelationSchema, + roles: Vec, + }, + Cte { + name: Ident, + schema: RelationSchema, + /// The CTE body's projection groups, captured so that lineage + /// composition can substitute references to `cte.col` with the + /// body's source refs (transitive source → target lineage). + /// Empty for recursive CTEs where the body is walked under a + /// pre-bound stub and fixpoint-aware projection capture is + /// deferred. + body_projections: Vec, + }, + DerivedTable { + alias: Ident, + schema: RelationSchema, + /// Same role as `Cte::body_projections` — captured at the + /// derived subquery walk and consumed by lineage composition. + body_projections: Vec, + }, + TableFunction { + alias: Ident, + schema: RelationSchema, + }, +} + +#[derive(Debug)] +pub(crate) struct Scope { + pub(crate) id: ScopeId, + pub(crate) parent: Option, + pub(crate) kind: ScopeKind, + pub(super) bindings: IndexMap, +} + +impl Scope { + fn new(id: ScopeId, parent: Option, kind: ScopeKind) -> Self { + Self { + id, + parent, + kind, + bindings: IndexMap::new(), + } + } + + fn bind(&mut self, name: &Ident, binding: Binding) { + let key = BindingKey::from_ident(name); + // Re-binding the same name as a Table merges roles rather than + // replacing — this captures the `DELETE t1 FROM t1` style case + // where a single name plays multiple roles in one statement. + if let ( + Some(Binding::Table { + roles: existing, .. + }), + Binding::Table { roles: new, .. 
}, + ) = (self.bindings.get_mut(&key), &binding) + { + for role in new { + if !existing.contains(role) { + existing.push(*role); + } + } + return; + } + self.bindings.insert(key, binding); + } + + fn resolve(&self, name: &Ident) -> Option<&Binding> { + self.bindings.get(&BindingKey::from_ident(name)) + } + + pub(super) fn iter_bindings(&self) -> impl Iterator { + self.bindings.values() + } +} + +#[derive(Default, Debug)] +pub(super) struct ScopeStack { + pub(super) scopes: Vec, + stack: Vec, +} + +impl ScopeStack { + pub(super) fn scope(&self, id: ScopeId) -> &Scope { + &self.scopes[id.0] + } + + pub(super) fn into_scopes(self) -> Vec { + self.scopes + } + + pub(super) fn push_query_scope(&mut self, kind: ScopeKind) -> ScopeId { + let parent = self.stack.last().copied(); + self.push_scope(parent, kind) + } + + pub(super) fn pop_scope(&mut self) { + self.stack.pop(); + } + + pub(super) fn bind_current(&mut self, name: Ident, binding: Binding) { + self.current_scope_mut().bind(&name, binding); + } + + pub(super) fn resolve_unqualified_relation(&self, relation: &ObjectName) -> Option<&Binding> { + if relation.0.len() != 1 { + return None; + } + let name = relation.0[0].as_ident()?; + self.stack + .iter() + .rev() + .find_map(|scope_id| self.scopes[scope_id.0].resolve(name)) + } + + fn push_scope(&mut self, parent: Option, kind: ScopeKind) -> ScopeId { + let id = ScopeId(self.scopes.len()); + self.scopes.push(Scope::new(id, parent, kind)); + self.stack.push(id); + id + } + + pub(super) fn current_scope_id(&mut self) -> ScopeId { + if let Some(id) = self.stack.last() { + *id + } else { + self.push_scope(None, ScopeKind::Body) + } + } + + fn current_scope_mut(&mut self) -> &mut Scope { + let id = self.current_scope_id(); + &mut self.scopes[id.0] + } +} + +pub(super) fn is_synthetic_binding(binding: &Binding) -> bool { + matches!( + binding, + Binding::Cte { .. } | Binding::DerivedTable { .. } | Binding::TableFunction { .. 
} + ) +} + +pub(super) fn binding_alias_key(binding: &Binding) -> BindingKey { + match binding { + Binding::Table { table, alias, .. } => { + BindingKey::from_ident(alias.as_ref().unwrap_or(&table.name)) + } + Binding::Cte { name, .. } => BindingKey::from_ident(name), + Binding::DerivedTable { alias, .. } | Binding::TableFunction { alias, .. } => { + BindingKey::from_ident(alias) + } + } +} + +pub(super) fn binding_could_contain_column( + binding: &Binding, + name: &Ident, +) -> Option { + match binding { + Binding::Table { table, schema, .. } => { + schema_could_contain(schema, name).then(|| (**table).clone()) + } + Binding::Cte { + name: cte_name, + schema, + .. + } => schema_could_contain(schema, name).then(|| synthetic_table_ref(cte_name)), + Binding::DerivedTable { alias, schema, .. } => { + schema_could_contain(schema, name).then(|| synthetic_table_ref(alias)) + } + // TableFunction schemas are always Unknown for now, so any + // unqualified column could plausibly come from one. + Binding::TableFunction { alias, .. } => Some(synthetic_table_ref(alias)), + } +} + +/// Schema-confirmed membership: `true` iff the binding has a `Known` +/// schema that declares the column. Distinguished from +/// `binding_could_contain_column`, which also returns `Some` for +/// `Unknown` schemas. Used by diagnostic emit to separate "definitely +/// ambiguous" from "uncertain over Unknown schemas". +pub(super) fn binding_confirms_column(binding: &Binding, name: &Ident) -> bool { + matches!( + binding_schema(binding), + RelationSchema::Known(cols) + if cols.iter().any(|c| BindingKey::from_ident(c) == BindingKey::from_ident(name)) + ) +} + +/// `true` iff the binding's schema is `Known` (not `Unknown`). Used to +/// gate `UnresolvedColumn` diagnostics — without at least one Known +/// schema in scope, the resolver can't claim a column is missing. 
+pub(super) fn binding_has_known_schema(binding: &Binding) -> bool { + matches!(binding_schema(binding), RelationSchema::Known(_)) +} + +fn binding_schema(binding: &Binding) -> &RelationSchema { + match binding { + Binding::Table { schema, .. } + | Binding::Cte { schema, .. } + | Binding::DerivedTable { schema, .. } + | Binding::TableFunction { schema, .. } => schema, + } +} + +fn schema_could_contain(schema: &RelationSchema, name: &Ident) -> bool { + match schema { + RelationSchema::Unknown => true, + RelationSchema::Known(cols) => cols + .iter() + .any(|c| BindingKey::from_ident(c) == BindingKey::from_ident(name)), + } +} + +pub(super) fn synthetic_table_ref(name: &Ident) -> TableReference { + TableReference { + catalog: None, + schema: None, + name: name.clone(), + } +} + +/// Convert a raw sqlparser `Span` to the `Option<Span>` shape stored on +/// `ColumnLevelDiagnostic`: an empty span (sqlparser convention: `line == 0`) is +/// flattened to `None` so consumers can distinguish "no source location" +/// from "location at (0, 0)". +pub(super) fn normalize_span(span: Span) -> Option { + (span.start.line != 0).then_some(span) +} + +/// Format an `Option<Span>` as ` at L<line>:C<column>` for inclusion in +/// diagnostic messages, or an empty string when no location is known. +pub(super) fn span_suffix(span: Option) -> String { + match span { + Some(s) => format!(" at L{}:C{}", s.start.line, s.start.column), + None => String::new(), + } +} + +// ───────── Resolver binding-related methods ───────── + +impl<'a> Resolver<'a> { + pub(super) fn scopes(&self) -> &ScopeStack { + &self.scopes + } + + pub(super) fn scopes_mut(&mut self) -> &mut ScopeStack { + &mut self.scopes + } + + pub(super) fn is_cte_reference(&self, relation: &ObjectName) -> bool { + matches!( + self.scopes.resolve_unqualified_relation(relation), + Some(Binding::Cte { ..
}) + ) + } + + pub(super) fn bind_base_table( + &mut self, + table: TableReference, + alias: Option, + role: TableRole, + ) { + let binding_name = alias.clone().unwrap_or_else(|| table.name.clone()); + let schema = self.lookup_table_schema(&table); + self.bind_relation( + binding_name, + Binding::Table { + table: Box::new(table), + alias, + schema, + roles: vec![role], + }, + ); + } + + /// Query the optional catalog for a table's columns. + /// `TableReference` is already alias-free, so it is a valid + /// catalog key as-is. + fn lookup_table_schema(&self, table: &TableReference) -> RelationSchema { + let Some(catalog) = self.catalog else { + return RelationSchema::Unknown; + }; + let lookup_key = table.clone(); + match catalog.columns(&lookup_key) { + Some(cols) => RelationSchema::Known( + cols.into_iter() + .map(|ColumnSchema { name }| Ident::new(name)) + .collect(), + ), + None => RelationSchema::Unknown, + } + } + + /// Resolve the effective target column list for INSERT-style + /// positional pairing: explicit list wins when non-empty, + /// otherwise the catalog-provided schema if known. Returns an + /// empty `Vec` when neither path yields names — the caller then + /// emits no Relation edges (matches the no-catalog + /// column-list-less INSERT behavior). + pub(super) fn effective_target_columns( + &self, + explicit: &[Ident], + target: &TableReference, + ) -> Vec { + if !explicit.is_empty() { + return explicit.to_vec(); + } + match self.lookup_table_schema(target) { + RelationSchema::Known(cols) => cols, + RelationSchema::Unknown => Vec::new(), + } + } + + /// Look up an in-scope CTE's body projections, for re-binding + /// under an alias (`FROM cte AS c`). Returns an empty `Vec` when + /// the reference is multi-segment, not bound, or not a Cte + /// binding — the caller (alias-bound Cte construction) treats + /// that as "no composition through this alias", matching + /// recursive-CTE behavior. 
+ pub(super) fn cte_body_projections(&self, cte_name: &ObjectName) -> Vec { + match self.scopes.resolve_unqualified_relation(cte_name) { + Some(Binding::Cte { + body_projections, .. + }) => body_projections.clone(), + _ => Vec::new(), + } + } + + /// Look up an in-scope CTE's schema (companion to + /// [`Self::cte_body_projections`]). Returns `RelationSchema::Unknown` + /// when the lookup misses — same fallthrough semantics as the + /// body-projections accessor. + pub(super) fn cte_schema(&self, cte_name: &ObjectName) -> RelationSchema { + match self.scopes.resolve_unqualified_relation(cte_name) { + Some(Binding::Cte { schema, .. }) => schema.clone(), + _ => RelationSchema::Unknown, + } + } + + pub(super) fn bind_cte( + &mut self, + name: Ident, + schema: RelationSchema, + body_projections: Vec, + ) { + self.bind_relation( + name.clone(), + Binding::Cte { + name, + schema, + body_projections, + }, + ); + } + + pub(super) fn bind_derived_table( + &mut self, + alias: Ident, + schema: RelationSchema, + body_projections: Vec, + ) { + self.bind_relation( + alias.clone(), + Binding::DerivedTable { + alias, + schema, + body_projections, + }, + ); + } + + pub(super) fn bind_table_function(&mut self, alias: Ident) { + self.bind_relation( + alias.clone(), + Binding::TableFunction { + alias, + schema: RelationSchema::Unknown, + }, + ); + } + + pub(super) fn record_diagnostic(&mut self, diagnostic: ColumnLevelDiagnostic) { + self.diagnostics.push(diagnostic); + } + + pub(super) fn record_unsupported_statement(&mut self, statement: &Statement) { + self.record_diagnostic(ColumnLevelDiagnostic { + kind: ColumnLevelDiagnosticKind::UnsupportedStatement, + message: format!("Unsupported statement while inspecting SQL: {}", statement), + span: None, + }); + } + + pub(super) fn record_wildcard_suppressed(&mut self, description: &str, span: Span) { + let span = normalize_span(span); + self.record_diagnostic(ColumnLevelDiagnostic { + kind: ColumnLevelDiagnosticKind::WildcardSuppressed, 
+ message: format!( + "{}{} left unexpanded — column lineage will be incomplete for this projection", + description, + span_suffix(span), + ), + span, + }); + } + + fn bind_relation(&mut self, name: Ident, binding: Binding) { + self.scopes.bind_current(name, binding); + } +} + +// ───────── Resolution binding-related queries ───────── + +impl Resolution { + /// All tables touched by the statement, in scope-arena order. The + /// union of [`Self::read_tables`] and [`Self::write_tables`] (with + /// duplicates when a single table carries both roles). + pub(crate) fn tables(&self) -> Vec { + self.scopes + .iter() + .flat_map(|scope| scope.iter_bindings()) + .filter_map(|binding| match binding { + Binding::Table { table, .. } => Some((**table).clone()), + _ => None, + }) + .collect() + } + + /// Every table referenced as a Read source, in scope-arena order. + /// Includes tables inside predicate subqueries (e.g. `x` in + /// `WHERE id IN (SELECT id FROM x)`). Use + /// [`Self::feeding_read_tables`] for the stricter "feeds the + /// enclosing write target" filter. + pub(crate) fn read_tables(&self) -> Vec { + self.collect_tables_by_role(TableRole::Read) + } + + /// Every table referenced as a Write target, in scope-arena order. + pub(crate) fn write_tables(&self) -> Vec { + self.collect_tables_by_role(TableRole::Write) + } + + fn collect_tables_by_role(&self, role: TableRole) -> Vec { + self.scopes + .iter() + .flat_map(|scope| scope.iter_bindings()) + .filter_map(|binding| match binding { + Binding::Table { table, roles, .. } if roles.contains(&role) => { + Some((**table).clone()) + } + _ => None, + }) + .collect() + } + + /// Read-role tables in a data-feeding position — Read role plus no + /// `Predicate` ancestor in their scope chain. The basis for + /// `TableLineageEdge` edge sources. 
+ pub(crate) fn feeding_read_tables(&self) -> Vec { + self.scopes + .iter() + .filter(|scope| !self.has_predicate_ancestor(scope.id)) + .flat_map(|scope| scope.iter_bindings()) + .filter_map(|binding| match binding { + Binding::Table { table, roles, .. } if roles.contains(&TableRole::Read) => { + Some((**table).clone()) + } + _ => None, + }) + .collect() + } + + fn has_predicate_ancestor(&self, scope_id: ScopeId) -> bool { + let mut current = Some(scope_id); + while let Some(id) = current { + let scope = &self.scopes[id.0]; + if scope.kind == ScopeKind::Predicate { + return true; + } + current = scope.parent; + } + false + } +} diff --git a/sql-insight/src/resolver/column_ref.rs b/sql-insight/src/resolver/column_ref.rs new file mode 100644 index 0000000..4a5af37 --- /dev/null +++ b/sql-insight/src/resolver/column_ref.rs @@ -0,0 +1,233 @@ +//! `RawColumnRef` — column references captured during the walk — +//! plus the walk-time resolution that fills its `resolved` / +//! `synthetic` fields. + +use sqlparser::ast::Ident; + +use crate::diagnostic::{ColumnLevelDiagnostic, ColumnLevelDiagnosticKind}; +use crate::reference::TableReference; + +use super::binding::{ + binding_alias_key, binding_confirms_column, binding_could_contain_column, + binding_has_known_schema, is_synthetic_binding, normalize_span, span_suffix, BindingKey, +}; +use super::{Binding, Resolver, ScopeId}; + +/// A column reference captured by the resolver during the AST walk. +/// +/// `parts` mirrors `sqlparser`'s split — 1 part for bare `a`, 2 for +/// `t1.a`, 3 for `schema.t1.a`, 4 for `catalog.schema.t1.a`. +/// `scope_id` is the scope in which the reference appeared (kept for +/// diagnostics and for binding lookups at composition time). 
+/// +/// `resolved` and `synthetic` are computed at record time, when scope +/// state still reflects what was visible to the SQL author at that +/// point in the walk — necessary for multi-CTE chains where later +/// CTE bindings would otherwise make earlier resolutions ambiguous. +#[derive(Debug, Clone, PartialEq, Eq, Hash)] +pub(crate) struct RawColumnRef { + pub(crate) parts: Vec, + pub(crate) scope_id: ScopeId, + /// Owning table captured at walk time. `None` for ambiguous / + /// no-candidate / unrecognized-qualifier-shape cases. + pub(crate) resolved: Option, + /// True iff the walk-time owning binding was synthetic + /// (`Cte` / `DerivedTable` / `TableFunction`). Drives reads + /// filtering and lineage composition. `false` when `resolved` is + /// `None`. + pub(crate) synthetic: bool, +} + +/// Decode a qualified ref's leading parts (everything before the +/// column name) into a `TableReference`. 1 part = bare name, 2 = +/// schema.name, 3 = catalog.schema.name. Other lengths (0 / 4+) +/// return `None` — they're either accidentally invalid or +/// struct-field accesses on a fully qualified column, which we don't +/// model yet. +pub(super) fn table_from_qualifier_parts(parts: &[Ident]) -> Option { + match parts.len() { + 1 => Some(TableReference { + catalog: None, + schema: None, + name: parts[0].clone(), + }), + 2 => Some(TableReference { + catalog: None, + schema: Some(parts[0].clone()), + name: parts[1].clone(), + }), + 3 => Some(TableReference { + catalog: Some(parts[0].clone()), + schema: Some(parts[1].clone()), + name: parts[2].clone(), + }), + _ => None, + } +} + +impl<'a> Resolver<'a> { + pub(super) fn column_refs_len(&self) -> usize { + self.column_refs.len() + } + + pub(super) fn column_refs_slice(&self, since: usize) -> &[RawColumnRef] { + &self.column_refs[since..] + } + + /// Record a column reference observed in the current scope.
+ /// Resolution (owning table) and synthetic-vs-real classification + /// are computed right now, while scope state is authoritative — + /// later CTE bindings can't retroactively make this reference ambiguous. + pub(super) fn record_column_ref(&mut self, parts: Vec) { + let scope_id = self.scopes_mut().current_scope_id(); + let (resolved, synthetic) = self.resolve_ref_at_walk(&parts, scope_id); + self.column_refs.push(RawColumnRef { + parts, + scope_id, + resolved, + synthetic, + }); + } + + fn resolve_ref_at_walk( + &mut self, + parts: &[Ident], + scope_id: ScopeId, + ) -> (Option, bool) { + match parts.len() { + 0 => (None, false), + 1 => self.resolve_unqualified_at_walk(&parts[0], scope_id), + n => self.resolve_qualified_at_walk(&parts[..n - 1], scope_id), + } + } + + /// Walk the scope chain for an unqualified column reference. Emits + /// `AmbiguousColumn` when two or more bindings with `Known` schemas + /// confirm the column, and `UnresolvedColumn` when no in-scope + /// binding contains it but at least one scope had a `Known` schema + /// (catalog-aware mode). Both diagnostics are suppressed when every + /// candidate / scope is `Unknown`, since `Unknown` schemas could + /// hold anything and silence is the safer default without catalog + /// enrichment.
+ fn resolve_unqualified_at_walk( + &mut self, + name: &Ident, + scope_id: ScopeId, + ) -> (Option, bool) { + let mut current = Some(scope_id); + let mut had_known_schemas_anywhere = false; + let mut resolved: Option<(TableReference, bool)> = None; + // (candidate tables, confirmed-by-Known count) + let mut ambiguity: Option<(Vec, usize)> = None; + + while let Some(id) = current { + let scope = self.scopes().scope(id); + if scope.iter_bindings().any(binding_has_known_schema) { + had_known_schemas_anywhere = true; + } + let matches: Vec<(TableReference, bool, bool)> = scope + .iter_bindings() + .filter_map(|b| { + let tbl = binding_could_contain_column(b, name)?; + Some(( + tbl, + binding_confirms_column(b, name), + is_synthetic_binding(b), + )) + }) + .collect(); + if !matches.is_empty() { + if matches.len() == 1 { + let (tbl, _, syn) = matches.into_iter().next().unwrap(); + resolved = Some((tbl, syn)); + } else { + let confirmed = matches.iter().filter(|(_, c, _)| *c).count(); + let candidates: Vec = + matches.into_iter().map(|(t, _, _)| t).collect(); + ambiguity = Some((candidates, confirmed)); + } + break; + } + current = scope.parent; + } + + if let Some((tbl, syn)) = resolved { + return (Some(tbl), syn); + } + if let Some((candidates, confirmed_count)) = ambiguity { + if confirmed_count >= 2 { + let span = normalize_span(name.span); + let names: Vec = candidates.iter().map(|t| t.name.value.clone()).collect(); + self.record_diagnostic(ColumnLevelDiagnostic { + kind: ColumnLevelDiagnosticKind::AmbiguousColumn, + message: format!( + "ambiguous column `{}`{} — matches in: [{}]", + name.value, + span_suffix(span), + names.join(", ") + ), + span, + }); + } + return (None, false); + } + if had_known_schemas_anywhere { + let span = normalize_span(name.span); + self.record_diagnostic(ColumnLevelDiagnostic { + kind: ColumnLevelDiagnosticKind::UnresolvedColumn, + message: format!( + "unresolved column `{}`{} — no in-scope relation with a known schema contains it", + 
name.value, + span_suffix(span), + ), + span, + }); + } + (None, false) + } + + fn resolve_qualified_at_walk( + &self, + qualifier_parts: &[Ident], + scope_id: ScopeId, + ) -> (Option, bool) { + // Look up the binding for the qualifier head in the scope chain. + // Multi-segment qualifiers (s.t.col) match only on the head — + // schema/catalog-qualified bound names are rare and we don't + // currently bind their full path anyway. + let binding = qualifier_parts + .first() + .and_then(|head| self.binding_for_qualifier(head, scope_id)); + let synthetic = binding.map(is_synthetic_binding).unwrap_or(false); + // Canonicalize a single-segment qualifier bound to a real table + // to that binding's alias-free underlying `TableReference`, so an + // aliased ref (`u.a` over `FROM t1 AS u`) surfaces the real table + // `t1` — matching how unqualified refs resolve. Synthetic bindings + // (CTE / derived / table function) keep the qualifier verbatim so + // lineage composition can re-find the owning binding by name; + // multi-segment qualifiers are already real identities and pass + // through untouched. + let table = match binding { + Some(Binding::Table { table, .. }) if qualifier_parts.len() == 1 => { + Some((**table).clone()) + } + _ => table_from_qualifier_parts(qualifier_parts), + }; + (table, synthetic) + } + + fn binding_for_qualifier(&self, head: &Ident, scope_id: ScopeId) -> Option<&Binding> { + let key = BindingKey::from_ident(head); + let mut current = Some(scope_id); + while let Some(id) = current { + let scope = self.scopes().scope(id); + for binding in scope.iter_bindings() { + if binding_alias_key(binding) == key { + return Some(binding); + } + } + current = scope.parent; + } + None + } +} diff --git a/sql-insight/src/resolver/composition.rs b/sql-insight/src/resolver/composition.rs new file mode 100644 index 0000000..6205126 --- /dev/null +++ b/sql-insight/src/resolver/composition.rs @@ -0,0 +1,146 @@ +//! Post-walk passes on `Resolution`: +//! +//! 
- [`Resolution::composed_lineage_edges`] rewrites each lineage +//! edge so its source resolves to a real (non-synthetic) reference +//! by walking back through CTE / derived body projections. +//! - [`Resolution::real_column_refs`] filters out refs whose +//! walk-time owner was synthetic, so the public `reads` surface +//! only shows real-storage references and unresolved names. + +use crate::extractor::column_operation_extractor::ColumnLineageKind; + +use super::binding::{binding_alias_key, BindingKey}; +use super::{Binding, LineageEdge, RawColumnRef, Resolution}; + +/// Recursion ceiling for `substitute_source` — guards against +/// accidental cycles (recursive CTEs are pre-bound with empty +/// body_projections, so the typical case stops there; this is a +/// defence for unexpected loops). +const MAX_COMPOSITION_DEPTH: usize = 64; + +impl Resolution { + /// Filter [`column_refs`](Resolution::column_refs) down + /// to "real reads": references whose walk-time owning binding was + /// a `Table` (or unresolved). Refs that pointed at a synthetic + /// intermediate (`Cte` / `DerivedTable` / `TableFunction`) are + /// dropped — those intermediates aren't storage, so they don't + /// belong in the public reads surface. + pub(crate) fn real_column_refs(&self) -> Vec { + self.column_refs + .iter() + .filter(|raw| !raw.synthetic) + .cloned() + .collect() + } + + /// Compose every lineage edge so its source resolves to a real + /// (non-synthetic) reference. References whose walk-time owner + /// is a Cte / DerivedTable with non-empty `body_projections` get + /// substituted by walking that body's matching `ProjectionItem` + /// and emitting one edge per inner source ref — recursively, + /// until the chain bottoms out at a real table or an unresolvable + /// ref. 
The outer edge's `kind` is combined with each body + /// item's kind via [`compose_lineage_kinds`] (Passthrough is + /// preserved only when both sides are Passthrough; any transforming + /// step yields Transformation). Bounded by [`MAX_COMPOSITION_DEPTH`] + /// as a cycle guard. + pub(crate) fn composed_lineage_edges(&self) -> Vec { + self.lineage_edges + .iter() + .flat_map(|edge| { + self.substitute_source(&edge.source, edge.kind, 0) + .into_iter() + .map(|(source, kind)| LineageEdge { + source, + target: edge.target.clone(), + kind, + }) + }) + .collect() + } + + fn substitute_source( + &self, + raw: &RawColumnRef, + outer_kind: ColumnLineageKind, + depth: usize, + ) -> Vec<(RawColumnRef, ColumnLineageKind)> { + if depth >= MAX_COMPOSITION_DEPTH { + return vec![(raw.clone(), outer_kind)]; + } + let body_projections = match self.synthetic_owning_binding(raw) { + Some(Binding::Cte { + body_projections, .. + }) => body_projections, + Some(Binding::DerivedTable { + body_projections, .. + }) => body_projections, + _ => return vec![(raw.clone(), outer_kind)], + }; + if body_projections.is_empty() { + return vec![(raw.clone(), outer_kind)]; + } + let Some(col_name) = raw.parts.last() else { + return vec![(raw.clone(), outer_kind)]; + }; + let key = BindingKey::from_ident(col_name); + let mut result = Vec::new(); + for group in body_projections { + for item in &group.items { + let matches = item + .name + .as_ref() + .is_some_and(|n| BindingKey::from_ident(n) == key); + if !matches { + continue; + } + let composed = compose_lineage_kinds(outer_kind, item.kind); + for source in &item.source_refs { + result.extend(self.substitute_source(source, composed, depth + 1)); + } + } + } + if result.is_empty() { + vec![(raw.clone(), outer_kind)] + } else { + result + } + } + + /// Look up the binding a synthetic-owning raw ref points at, by + /// matching the walk-time-captured table name against scope + /// bindings. 
Name match is unique within IndexMap, so this avoids + /// the column-membership ambiguity that scope-chain resolution + /// can hit when CTEs accumulate. Returns `None` for non-synthetic + /// refs. + fn synthetic_owning_binding(&self, raw: &RawColumnRef) -> Option<&Binding> { + if !raw.synthetic { + return None; + } + let table = raw.resolved.as_ref()?; + let key = BindingKey::from_ident(&table.name); + let mut current = Some(raw.scope_id); + while let Some(id) = current { + let scope = &self.scopes[id.0]; + for binding in scope.iter_bindings() { + if binding_alias_key(binding) == key { + return Some(binding); + } + } + current = scope.parent; + } + None + } +} + +/// Combine two lineage kinds along a substitution edge: the result is +/// `Passthrough` only when both sides are `Passthrough`; any +/// `Transformation` step makes the whole composed chain a +/// `Transformation`. +fn compose_lineage_kinds(outer: ColumnLineageKind, inner: ColumnLineageKind) -> ColumnLineageKind { + if outer == ColumnLineageKind::Passthrough && inner == ColumnLineageKind::Passthrough { + ColumnLineageKind::Passthrough + } else { + ColumnLineageKind::Transformation + } +} diff --git a/sql-insight/src/resolver/context.rs b/sql-insight/src/resolver/context.rs new file mode 100644 index 0000000..88a13e8 --- /dev/null +++ b/sql-insight/src/resolver/context.rs @@ -0,0 +1,35 @@ +//! Scoped `with_*` helpers that save / restore the resolver's +//! `scope_kind` for the duration of a closure, so lexical +//! predicate-position context is set and unset around a clause walk +//! without the caller having to remember to restore it. + +use super::{Resolver, ScopeKind}; + +impl<'a> Resolver<'a> { + /// Push a fresh scope, run `f`, then pop it. Use around each + /// branch of a `SetExpr::SetOperation` so the branches' FROM + /// bindings don't shadow each other and unqualified column refs + /// in each branch resolve only against its own FROMs — matching + /// SQL's per-SELECT name resolution. 
The current `scope_kind` is + /// propagated onto the pushed scope. + pub(crate) fn with_branch_scope(&mut self, f: impl FnOnce(&mut Self) -> R) -> R { + let kind = self.scope_kind; + self.scopes_mut().push_query_scope(kind); + let r = f(self); + self.scopes_mut().pop_scope(); + r + } + + /// Walk a filter-position clause with `scope_kind = Predicate`, so + /// any subquery pushed inside is classified as a predicate scope + /// and thus excluded from table-lineage. Used for WHERE, HAVING, + /// QUALIFY, JOIN ON, AsOf match, MERGE ON, CONNECT BY, pipe + /// `|> WHERE`, etc. The previous `scope_kind` is restored on return. + pub(crate) fn with_filter_clause(&mut self, f: impl FnOnce(&mut Self) -> R) -> R { + let prev = self.scope_kind; + self.scope_kind = ScopeKind::Predicate; + let r = f(self); + self.scope_kind = prev; + r + } +} diff --git a/sql-insight/src/resolver/expr.rs b/sql-insight/src/resolver/expr.rs new file mode 100644 index 0000000..0178a40 --- /dev/null +++ b/sql-insight/src/resolver/expr.rs @@ -0,0 +1,545 @@ +use super::Resolver; +use crate::error::Error; +use sqlparser::ast::{ + AccessExpr, Array, DictionaryField, Expr, Fetch, Function, FunctionArg, FunctionArgExpr, + FunctionArgumentClause, FunctionArgumentList, FunctionArguments, Interpolate, LimitClause, + ListAggOnOverflow, Map, OrderBy, OrderByExpr, OrderByKind, PipeOperator, Subscript, + WildcardAdditionalOptions, WindowFrameBound, WindowSpec, WindowType, +}; + +impl<'a> Resolver<'a> { + pub(super) fn visit_expr(&mut self, expr: &Expr) -> Result<(), Error> { + // Keep this match exhaustive so sqlparser Expr additions are reviewed here. + match expr { + // Subqueries in expression position (scalar / EXISTS / IN) + // resolve with raw `resolve_query`, NOT the + // QueryOutput-emitting wrapper — their transient projection + // is an intermediate, not a statement output. 
A scalar + // subquery in a projection has its source refs absorbed by + // the enclosing projection item (which emits the meaningful + // edge); a predicate subquery produces reads but no lineage. + // Same disposition as CTE / derived bodies. + Expr::Subquery(query) => self.resolve_query(query).map(|_| ()), + Expr::Exists { subquery, .. } => self.resolve_query(subquery).map(|_| ()), + Expr::InSubquery { expr, subquery, .. } => { + self.visit_expr(expr)?; + self.resolve_query(subquery).map(|_| ()) + } + Expr::BinaryOp { left, right, .. } + | Expr::IsDistinctFrom(left, right) + | Expr::IsNotDistinctFrom(left, right) + | Expr::AnyOp { left, right, .. } + | Expr::AllOp { left, right, .. } => { + self.visit_expr(left)?; + self.visit_expr(right) + } + Expr::UnaryOp { expr, .. } + | Expr::Nested(expr) + | Expr::OuterJoin(expr) + | Expr::Prior(expr) + | Expr::IsFalse(expr) + | Expr::IsNotFalse(expr) + | Expr::IsTrue(expr) + | Expr::IsNotTrue(expr) + | Expr::IsNull(expr) + | Expr::IsNotNull(expr) + | Expr::IsUnknown(expr) + | Expr::IsNotUnknown(expr) + | Expr::Cast { expr, .. } + | Expr::IsNormalized { expr, .. } + | Expr::Extract { expr, .. } + | Expr::Ceil { expr, .. } + | Expr::Floor { expr, .. } + | Expr::Collate { expr, .. } + | Expr::Prefixed { value: expr, .. } + | Expr::Named { expr, .. } => self.visit_expr(expr), + Expr::CompoundFieldAccess { root, access_chain } => { + self.visit_expr(root)?; + for access in access_chain { + self.visit_access_expr(access)?; + } + Ok(()) + } + Expr::JsonAccess { value, .. } => self.visit_expr(value), + Expr::InList { expr, list, .. } => { + self.visit_expr(expr)?; + for item in list { + self.visit_expr(item)?; + } + Ok(()) + } + Expr::InUnnest { + expr, array_expr, .. + } => { + self.visit_expr(expr)?; + self.visit_expr(array_expr) + } + Expr::Between { + expr, low, high, .. + } => { + self.visit_expr(expr)?; + self.visit_expr(low)?; + self.visit_expr(high) + } + Expr::Like { expr, pattern, .. 
} + | Expr::ILike { expr, pattern, .. } + | Expr::SimilarTo { expr, pattern, .. } + | Expr::RLike { expr, pattern, .. } => { + self.visit_expr(expr)?; + self.visit_expr(pattern) + } + Expr::Convert { expr, styles, .. } => { + self.visit_expr(expr)?; + for style in styles { + self.visit_expr(style)?; + } + Ok(()) + } + Expr::AtTimeZone { + timestamp, + time_zone, + } => { + self.visit_expr(timestamp)?; + self.visit_expr(time_zone) + } + Expr::Position { expr, r#in } => { + self.visit_expr(expr)?; + self.visit_expr(r#in) + } + Expr::Substring { + expr, + substring_from, + substring_for, + .. + } => { + self.visit_expr(expr)?; + if let Some(expr) = substring_from { + self.visit_expr(expr)?; + } + if let Some(expr) = substring_for { + self.visit_expr(expr)?; + } + Ok(()) + } + Expr::Trim { + expr, + trim_what, + trim_characters, + .. + } => { + self.visit_expr(expr)?; + if let Some(expr) = trim_what { + self.visit_expr(expr)?; + } + if let Some(exprs) = trim_characters { + for expr in exprs { + self.visit_expr(expr)?; + } + } + Ok(()) + } + Expr::Overlay { + expr, + overlay_what, + overlay_from, + overlay_for, + } => { + self.visit_expr(expr)?; + self.visit_expr(overlay_what)?; + self.visit_expr(overlay_from)?; + if let Some(expr) = overlay_for { + self.visit_expr(expr)?; + } + Ok(()) + } + Expr::Case { + operand, + conditions, + else_result, + .. + } => { + // All CASE sub-expressions (operand, WHEN conditions, + // THEN/ELSE results) are walked the same way — refs no + // longer carry a clause kind, so there is nothing to + // distinguish the condition position from the result. 
+ if let Some(expr) = operand { + self.visit_expr(expr)?; + } + for condition in conditions { + self.visit_expr(&condition.condition)?; + self.visit_expr(&condition.result)?; + } + if let Some(expr) = else_result { + self.visit_expr(expr)?; + } + Ok(()) + } + Expr::GroupingSets(exprs) | Expr::Cube(exprs) | Expr::Rollup(exprs) => { + for group in exprs { + for expr in group { + self.visit_expr(expr)?; + } + } + Ok(()) + } + Expr::Tuple(exprs) => { + for expr in exprs { + self.visit_expr(expr)?; + } + Ok(()) + } + Expr::Struct { values, .. } => { + for expr in values { + self.visit_expr(expr)?; + } + Ok(()) + } + Expr::Function(function) => self.visit_function(function), + Expr::Dictionary(fields) => { + for field in fields { + self.visit_dictionary_field(field)?; + } + Ok(()) + } + Expr::Map(map) => self.visit_map(map), + Expr::Array(array) => self.visit_array(array), + Expr::Interval(interval) => self.visit_expr(&interval.value), + Expr::Lambda(lambda) => self.visit_expr(&lambda.body), + Expr::MemberOf(member_of) => { + self.visit_expr(&member_of.value)?; + self.visit_expr(&member_of.array) + } + Expr::Identifier(ident) => { + self.record_column_ref(vec![ident.clone()]); + Ok(()) + } + Expr::CompoundIdentifier(parts) => { + self.record_column_ref(parts.clone()); + Ok(()) + } + Expr::Value(_) + | Expr::TypedString(_) + | Expr::MatchAgainst { .. 
} + | Expr::Wildcard(_) + | Expr::QualifiedWildcard(_, _) => Ok(()), + } + } + + pub(super) fn visit_exprs(&mut self, exprs: &[Expr]) -> Result<(), Error> { + for expr in exprs { + self.visit_expr(expr)?; + } + Ok(()) + } + + pub(super) fn visit_order_by(&mut self, order_by: &OrderBy) -> Result<(), Error> { + if let OrderByKind::Expressions(exprs) = &order_by.kind { + for expr in exprs { + self.visit_order_by_expr(expr)?; + } + } + if let Some(interpolate) = &order_by.interpolate { + self.visit_interpolate(interpolate)?; + } + Ok(()) + } + + pub(super) fn visit_order_by_expr(&mut self, order_by: &OrderByExpr) -> Result<(), Error> { + self.visit_expr(&order_by.expr)?; + if let Some(with_fill) = &order_by.with_fill { + for expr in [ + with_fill.from.as_ref(), + with_fill.to.as_ref(), + with_fill.step.as_ref(), + ] + .into_iter() + .flatten() + { + self.visit_expr(expr)?; + } + } + Ok(()) + } + + fn visit_interpolate(&mut self, interpolate: &Interpolate) -> Result<(), Error> { + if let Some(exprs) = &interpolate.exprs { + for expr in exprs { + if let Some(expr) = &expr.expr { + self.visit_expr(expr)?; + } + } + } + Ok(()) + } + + pub(super) fn visit_limit_clause(&mut self, limit_clause: &LimitClause) -> Result<(), Error> { + match limit_clause { + LimitClause::LimitOffset { + limit, + offset, + limit_by, + } => { + if let Some(expr) = limit { + self.visit_expr(expr)?; + } + if let Some(offset) = offset { + self.visit_expr(&offset.value)?; + } + self.visit_exprs(limit_by) + } + LimitClause::OffsetCommaLimit { offset, limit } => { + self.visit_expr(offset)?; + self.visit_expr(limit) + } + } + } + + pub(super) fn visit_fetch(&mut self, fetch: &Fetch) -> Result<(), Error> { + if let Some(expr) = &fetch.quantity { + self.visit_expr(expr)?; + } + Ok(()) + } + + pub(super) fn visit_pipe_operator(&mut self, operator: &PipeOperator) -> Result<(), Error> { + match operator { + PipeOperator::Limit { expr, offset } => { + self.visit_expr(expr)?; + if let Some(expr) = offset { + 
self.visit_expr(expr)?; + } + Ok(()) + } + PipeOperator::Where { expr } => self.with_filter_clause(|r| r.visit_expr(expr)), + PipeOperator::OrderBy { exprs } => { + for expr in exprs { + self.visit_order_by_expr(expr)?; + } + Ok(()) + } + PipeOperator::Select { exprs } | PipeOperator::Extend { exprs } => { + for expr in exprs { + self.visit_select_item(expr)?; + } + Ok(()) + } + PipeOperator::Set { assignments } => { + for assignment in assignments { + self.visit_expr(&assignment.value)?; + } + Ok(()) + } + PipeOperator::Aggregate { + full_table_exprs, + group_by_expr, + } => { + for expr in full_table_exprs { + self.visit_expr(&expr.expr.expr)?; + } + for expr in group_by_expr { + self.visit_expr(&expr.expr.expr)?; + } + Ok(()) + } + PipeOperator::TableSample { sample } => self.visit_table_sample(sample), + PipeOperator::Union { queries, .. } + | PipeOperator::Intersect { queries, .. } + | PipeOperator::Except { queries, .. } => { + for query in queries { + self.resolve_query_emitting_query_output(query)?; + } + Ok(()) + } + PipeOperator::Call { function, alias } => { + self.visit_function(function)?; + if let Some(alias) = alias { + self.bind_table_function(alias.clone()); + } + Ok(()) + } + PipeOperator::Pivot { + aggregate_functions, + value_source, + .. + } => { + for expr in aggregate_functions { + self.visit_expr(&expr.expr)?; + } + self.visit_pivot_value_source(value_source) + } + PipeOperator::Join(join) => self.visit_join(join), + PipeOperator::Drop { .. } + | PipeOperator::As { .. } + | PipeOperator::Rename { .. } + | PipeOperator::Unpivot { .. 
} => Ok(()), + } + } + + pub(super) fn visit_wildcard_options( + &mut self, + options: &WildcardAdditionalOptions, + ) -> Result<(), Error> { + if let Some(replace) = &options.opt_replace { + for item in &replace.items { + self.visit_expr(&item.expr)?; + } + } + Ok(()) + } + + fn visit_function(&mut self, function: &Function) -> Result<(), Error> { + self.visit_function_arguments(&function.parameters)?; + self.visit_function_arguments(&function.args)?; + if let Some(expr) = &function.filter { + self.visit_expr(expr)?; + } + for expr in &function.within_group { + self.visit_order_by_expr(expr)?; + } + if let Some(over) = &function.over { + self.visit_window_type(over)?; + } + Ok(()) + } + + fn visit_function_arguments(&mut self, arguments: &FunctionArguments) -> Result<(), Error> { + match arguments { + FunctionArguments::None => Ok(()), + // A subquery as a function argument is an intermediate, not + // a statement output — raw resolve (no QueryOutput edge). + FunctionArguments::Subquery(query) => self.resolve_query(query).map(|_| ()), + FunctionArguments::List(args) => self.visit_function_argument_list(args), + } + } + + fn visit_function_argument_list(&mut self, args: &FunctionArgumentList) -> Result<(), Error> { + for arg in &args.args { + self.visit_function_arg(arg)?; + } + for clause in &args.clauses { + match clause { + FunctionArgumentClause::OrderBy(order_by) => { + for order_by in order_by { + self.visit_order_by_expr(order_by)?; + } + } + FunctionArgumentClause::Limit(expr) => self.visit_expr(expr)?, + FunctionArgumentClause::OnOverflow(on_overflow) => { + self.visit_list_agg_on_overflow(on_overflow)? 
+ } + FunctionArgumentClause::Having(bound) => self.visit_expr(&bound.1)?, + FunctionArgumentClause::IgnoreOrRespectNulls(_) + | FunctionArgumentClause::Separator(_) + | FunctionArgumentClause::JsonNullClause(_) + | FunctionArgumentClause::JsonReturningClause(_) => {} + } + } + Ok(()) + } + + fn visit_list_agg_on_overflow(&mut self, on_overflow: &ListAggOnOverflow) -> Result<(), Error> { + match on_overflow { + ListAggOnOverflow::Error => Ok(()), + ListAggOnOverflow::Truncate { filler, .. } => { + if let Some(expr) = filler { + self.visit_expr(expr)?; + } + Ok(()) + } + } + } + + pub(super) fn visit_function_arg(&mut self, arg: &FunctionArg) -> Result<(), Error> { + match arg { + FunctionArg::Named { arg, .. } | FunctionArg::Unnamed(arg) => { + self.visit_function_arg_expr(arg) + } + FunctionArg::ExprNamed { name, arg, .. } => { + self.visit_expr(name)?; + self.visit_function_arg_expr(arg) + } + } + } + + fn visit_function_arg_expr(&mut self, arg: &FunctionArgExpr) -> Result<(), Error> { + match arg { + FunctionArgExpr::Expr(expr) => self.visit_expr(expr), + FunctionArgExpr::QualifiedWildcard(_) | FunctionArgExpr::Wildcard => Ok(()), + } + } + + fn visit_access_expr(&mut self, access: &AccessExpr) -> Result<(), Error> { + match access { + AccessExpr::Dot(expr) => self.visit_expr(expr), + AccessExpr::Subscript(subscript) => self.visit_subscript(subscript), + } + } + + fn visit_subscript(&mut self, subscript: &Subscript) -> Result<(), Error> { + match subscript { + Subscript::Index { index } => self.visit_expr(index), + Subscript::Slice { + lower_bound, + upper_bound, + stride, + } => { + for expr in [lower_bound.as_ref(), upper_bound.as_ref(), stride.as_ref()] + .into_iter() + .flatten() + { + self.visit_expr(expr)?; + } + Ok(()) + } + } + } + + fn visit_dictionary_field(&mut self, field: &DictionaryField) -> Result<(), Error> { + self.visit_expr(&field.value) + } + + fn visit_map(&mut self, map: &Map) -> Result<(), Error> { + for entry in &map.entries { + 
self.visit_expr(&entry.key)?; + self.visit_expr(&entry.value)?; + } + Ok(()) + } + + fn visit_array(&mut self, array: &Array) -> Result<(), Error> { + self.visit_exprs(&array.elem) + } + + fn visit_window_type(&mut self, window_type: &WindowType) -> Result<(), Error> { + match window_type { + WindowType::WindowSpec(spec) => self.visit_window_spec(spec), + WindowType::NamedWindow(_) => Ok(()), + } + } + + pub(super) fn visit_window_spec(&mut self, spec: &WindowSpec) -> Result<(), Error> { + // OVER (...) — PARTITION BY / ORDER BY / frame-bound refs are + // all walked as plain reads (no clause kind is recorded). + self.visit_exprs(&spec.partition_by)?; + for expr in &spec.order_by { + self.visit_order_by_expr(expr)?; + } + if let Some(frame) = &spec.window_frame { + self.visit_window_frame_bound(&frame.start_bound)?; + if let Some(bound) = &frame.end_bound { + self.visit_window_frame_bound(bound)?; + } + } + Ok(()) + } + + fn visit_window_frame_bound(&mut self, bound: &WindowFrameBound) -> Result<(), Error> { + match bound { + WindowFrameBound::CurrentRow => Ok(()), + WindowFrameBound::Preceding(Some(expr)) | WindowFrameBound::Following(Some(expr)) => { + self.visit_expr(expr) + } + WindowFrameBound::Preceding(None) | WindowFrameBound::Following(None) => Ok(()), + } + } +} diff --git a/sql-insight/src/resolver/lineage.rs b/sql-insight/src/resolver/lineage.rs new file mode 100644 index 0000000..36b2d66 --- /dev/null +++ b/sql-insight/src/resolver/lineage.rs @@ -0,0 +1,130 @@ +//! `LineageEdge` / `LineageTargetSpec` and the resolver helpers that emit +//! them — directly into the `lineage_edges` buffer, or fanned out from +//! a snapshot of recorded column refs, or driven by a projection +//! group via a closure-supplied target. 
+ +use sqlparser::ast::{Ident, Query}; + +use crate::error::Error; +use crate::extractor::column_operation_extractor::ColumnLineageKind; +use crate::reference::TableReference; + +use super::{ProjectionGroup, ProjectionItem, RawColumnRef, ResolvedQuery, Resolver}; + +/// A pre-resolution column lineage record. `source` still needs +/// scope-chain resolution (for unqualified parts); `target` is fully +/// spec'd by the resolver; `kind` is the public `ColumnLineageKind` to +/// surface (composed further by `composed_lineage_edges` when the source +/// goes through a synthetic intermediate). +/// +/// Created by callers from [`ProjectionGroup`]s (for SELECT-style +/// lineage edges — INSERT pairs with target columns, top-level / nested +/// SELECTs emit `QueryOutput`) or directly by UPDATE / similar +/// walkers that already know their write target. +#[derive(Debug, Clone)] +pub(crate) struct LineageEdge { + pub(crate) source: RawColumnRef, + pub(crate) target: LineageTargetSpec, + pub(crate) kind: ColumnLineageKind, +} + +/// Target spec for a [`LineageEdge`]. `QueryOutput` is for transient +/// SELECT output columns; `Relation` is for INSERT / UPDATE / etc. +/// target columns that live in a real relation. +#[derive(Debug, Clone)] +pub(crate) enum LineageTargetSpec { + QueryOutput { + name: Option, + position: usize, + }, + Relation { + table: TableReference, + column: Ident, + }, +} + +impl<'a> Resolver<'a> { + pub(super) fn push_lineage_edge(&mut self, edge: LineageEdge) { + self.lineage_edges.push(edge); + } + + /// Emit one `LineageEdge` per `RawColumnRef` recorded into + /// `column_refs` since position `since`, all pointing to the same + /// `target` with the given `kind`. The typical caller snapshots + /// `column_refs_len()` before walking an expression, walks it, + /// then calls this with the snapshot to fan the new refs out as + /// edges. Used by UPDATE / MERGE assignment loops and MERGE + /// INSERT-VALUES emission. 
+ pub(super) fn push_edges_from_refs_since( + &mut self, + since: usize, + target: LineageTargetSpec, + kind: ColumnLineageKind, + ) { + for offset in 0..(self.column_refs_len() - since) { + let source = self.column_refs_slice(since)[offset].clone(); + self.push_lineage_edge(LineageEdge { + source, + target: target.clone(), + kind, + }); + } + } + + /// For each `(group, position, item)` in `projections`, ask + /// `target_for(position, item)` to produce a `LineageTargetSpec`; + /// when it returns `Some(target)`, fan out one `LineageEdge` per + /// `item.source_refs` to that target, carrying the item's + /// `ColumnLineageKind`. The closure shape lets the same loop drive + /// `QueryOutput` emission, INSERT positional pairing, and CTAS / + /// view's explicit-or-inferred column pairing. + pub(super) fn emit_per_projection( + &mut self, + projections: &[ProjectionGroup], + mut target_for: F, + ) where + F: FnMut(usize, &ProjectionItem) -> Option, + { + for group in projections { + for (position, item) in group.items.iter().enumerate() { + let Some(target) = target_for(position, item) else { + continue; + }; + for source in &item.source_refs { + self.push_lineage_edge(LineageEdge { + source: source.clone(), + target: target.clone(), + kind: item.kind, + }); + } + } + } + } + + /// Emit `QueryOutput` lineage edges for every projection item in + /// `resolved`. The default disposition for queries whose output + /// is not bound to a relation target (top-level SELECT, scalar + /// subqueries, derived tables, CTE bodies, predicate subqueries). + pub(super) fn emit_query_output_edges(&mut self, resolved: &ResolvedQuery) { + self.emit_per_projection(&resolved.projections, |position, item| { + Some(LineageTargetSpec::QueryOutput { + name: item.name.clone(), + position, + }) + }); + } + + /// Convenience wrapper: resolve `query` and emit `QueryOutput` + /// edges for its projections in one shot. 
Use this from any + /// caller that doesn't have a special target — INSERT calls the + /// raw `resolve_query` instead so it can pair projections with + /// its target columns. + pub(super) fn resolve_query_emitting_query_output( + &mut self, + query: &Query, + ) -> Result { + let resolved = self.resolve_query(query)?; + self.emit_query_output_edges(&resolved); + Ok(resolved) + } +} diff --git a/sql-insight/src/resolver/projection.rs b/sql-insight/src/resolver/projection.rs new file mode 100644 index 0000000..c6bfbeb --- /dev/null +++ b/sql-insight/src/resolver/projection.rs @@ -0,0 +1,101 @@ +//! Per-SELECT projection facts captured by the resolver during the +//! walk, plus the classification helpers that derive each projection +//! item's name / kind (`Passthrough` / `Transformation`). + +use sqlparser::ast::{Expr, Ident, SelectItem}; + +use crate::extractor::column_operation_extractor::ColumnLineageKind; + +use super::{RawColumnRef, Resolver}; + +/// One SELECT's projection captured during the walk — one +/// [`ProjectionItem`] per output column, in projection order. Set +/// operations contribute one group per branch (so UNION INSERT pairs +/// each branch's items with the same target columns). +#[derive(Debug, Clone, PartialEq, Eq)] +pub(crate) struct ProjectionGroup { + pub(crate) items: Vec, +} + +/// A single projection slot's resolver-collected facts. +/// +/// `source_refs` are the raw column refs the projection item's +/// expression read, in walk order. `name` is the inferable output +/// name (explicit alias > bare ident name > `None`). `kind` +/// classifies how the source refs turn into the output value +/// (`Passthrough` for a bare forwarded column, `Transformation` for +/// anything value-changing); composed with the outer edge's kind when +/// this item participates in a CTE / derived table substitution. 
+#[derive(Debug, Clone, PartialEq, Eq)] +pub(crate) struct ProjectionItem { + pub(crate) name: Option, + pub(crate) source_refs: Vec, + pub(crate) kind: ColumnLineageKind, +} + +impl<'a> Resolver<'a> { + /// Push a fully-built `ProjectionGroup` into the active query's + /// projection buffer. Called by `visit_select` once per SELECT + /// body. + pub(super) fn push_projection_group(&mut self, group: ProjectionGroup) { + self.current_projections.push(group); + } + + /// Extend the active query's projection buffer with externally + /// produced groups — used by `SetExpr::Query` to bubble the inner + /// query's projections up into the enclosing query (so INSERT + /// pairing reaches through a parenthesized source). + pub(super) fn extend_projections(&mut self, groups: Vec) { + self.current_projections.extend(groups); + } +} + +/// Inferred output name for a projection item: +/// - explicit alias > bare identifier's name > `None` for computed +/// expressions and wildcards. +pub(super) fn projection_item_output_name(item: &SelectItem) -> Option { + match item { + SelectItem::ExprWithAlias { alias, .. } => Some(alias.clone()), + SelectItem::UnnamedExpr(expr) => expr_inferred_name(expr), + SelectItem::Wildcard(_) | SelectItem::QualifiedWildcard(_, _) => None, + } +} + +/// Classify a projection item for `ColumnLineageKind`. Wildcards don't +/// emit lineage edges currently, so the fallback `Transformation` here is +/// safe; if/when wildcard expansion lands, items will be classified +/// individually instead. +pub(super) fn projection_item_kind(item: &SelectItem) -> ColumnLineageKind { + match item { + SelectItem::ExprWithAlias { expr, .. 
} | SelectItem::UnnamedExpr(expr) => expr_kind(expr), + SelectItem::Wildcard(_) | SelectItem::QualifiedWildcard(_, _) => { + ColumnLineageKind::Transformation + } + } +} + +fn expr_inferred_name(expr: &Expr) -> Option { + match expr { + Expr::Identifier(ident) => Some(ident.clone()), + Expr::CompoundIdentifier(parts) => parts.last().cloned(), + _ => None, + } +} + +pub(super) fn expr_is_bare(expr: &Expr) -> bool { + matches!(expr, Expr::Identifier(_) | Expr::CompoundIdentifier(_)) +} + +/// Classify an expression for `ColumnLineageKind` — the one clean +/// distinction: +/// - bare `Identifier` / `CompoundIdentifier` → `Passthrough` (value +/// forwarded unchanged; a rename is still `Passthrough`) +/// - anything else (arithmetic, function calls incl. aggregates and +/// window functions, CASE, casts, …) → `Transformation` +pub(super) fn expr_kind(expr: &Expr) -> ColumnLineageKind { + if expr_is_bare(expr) { + ColumnLineageKind::Passthrough + } else { + ColumnLineageKind::Transformation + } +} diff --git a/sql-insight/src/resolver/query.rs b/sql-insight/src/resolver/query.rs new file mode 100644 index 0000000..820cb90 --- /dev/null +++ b/sql-insight/src/resolver/query.rs @@ -0,0 +1,316 @@ +use super::projection::{projection_item_kind, projection_item_output_name}; +use super::{ProjectionGroup, ProjectionItem, RelationSchema, ResolvedQuery, Resolver, TableRole}; +use crate::error::Error; +use crate::reference::TableReference; +use sqlparser::ast::{ + ConnectByKind, Distinct, Expr, GroupByExpr, GroupByWithModifier, Ident, NamedWindowExpr, Query, + Select, SelectItem, SelectItemQualifiedWildcardKind, SetExpr, Table, TopQuantity, Values, +}; + +impl<'a> Resolver<'a> { + pub(super) fn resolve_query(&mut self, query: &Query) -> Result { + // Push a fresh scope for the query body (the returned id isn't + // needed — bindings resolve via the stack walk). 
+ self.scopes.push_query_scope(self.scope_kind); + // Swap in a fresh projection buffer for this query — restored on + // return — so each ResolvedQuery owns exactly its own groups + // without leaking into siblings or ancestors. + let prev_projections = std::mem::take(&mut self.current_projections); + // `scope_kind` intentionally propagates through the subquery + // boundary (a subquery in a predicate is itself predicate-position + // for table-lineage exclusion), so nothing to reset/restore here. + if let Some(with) = &query.with { + if with.recursive { + for cte in &with.cte_tables { + // Recursive CTEs pre-bind with empty body_projections; + // fixpoint-aware projection capture is deferred. + self.bind_cte(cte.alias.name.clone(), RelationSchema::Unknown, Vec::new()); + } + for cte in &with.cte_tables { + // Body output is discarded for recursive CTEs (no + // composition either). Raw resolve_query so the + // intermediate QueryOutput edges aren't emitted. + self.resolve_query(&cte.query)?; + } + } else { + for cte in &with.cte_tables { + // Raw resolve_query: the body's projections are + // stored in the binding for lineage composition, and + // no intermediate QueryOutput edges are emitted + // since the CTE output isn't a query result on its + // own — references through the CTE compose end to + // end at lineage-emission time. 
+ let resolved = self.resolve_query(&cte.query)?; + let renames = &cte.alias.columns; + let renamed_schema = + super::rename_relation_schema(resolved.output_schema, renames); + let renamed_projections = + super::rename_projection_groups(resolved.projections, renames); + self.bind_cte(cte.alias.name.clone(), renamed_schema, renamed_projections); + } + } + } + let body_schema = self.visit_set_expr(&query.body)?; + if let Some(order_by) = &query.order_by { + self.visit_order_by(order_by)?; + } + if let Some(limit_clause) = &query.limit_clause { + self.visit_limit_clause(limit_clause)?; + } + if let Some(fetch) = &query.fetch { + self.visit_fetch(fetch)?; + } + if let Some(settings) = &query.settings { + for setting in settings { + self.visit_expr(&setting.value)?; + } + } + for pipe_operator in &query.pipe_operators { + self.visit_pipe_operator(pipe_operator)?; + } + self.scopes.pop_scope(); + let projections = std::mem::replace(&mut self.current_projections, prev_projections); + Ok(ResolvedQuery { + output_schema: body_schema, + projections, + }) + } + + fn visit_set_expr(&mut self, set_expr: &SetExpr) -> Result { + match set_expr { + SetExpr::Select(select) => self.visit_select(select), + SetExpr::Query(query) => { + // Parenthesized continuation of the enclosing query — + // bubble the inner projections up so an outer INSERT (or + // any other caller) sees them as if they were inline. + let resolved = self.resolve_query(query)?; + let output_schema = resolved.output_schema.clone(); + self.extend_projections(resolved.projections); + Ok(output_schema) + } + SetExpr::SetOperation { left, right, .. } => { + // Each branch lives in its own scope so name resolution + // doesn't see sibling branches' FROM bindings — matching + // SQL's per-SELECT name resolution. The branches' own + // visit_select calls each contribute a ProjectionGroup, + // so UNION INSERT naturally pairs every branch with the + // same target columns. 
Result schema conventionally + // follows the left side's column names. + let left_schema = self.with_branch_scope(|r| r.visit_set_expr(left))?; + self.with_branch_scope(|r| r.visit_set_expr(right))?; + Ok(left_schema) + } + SetExpr::Insert(statement) + | SetExpr::Update(statement) + | SetExpr::Delete(statement) + | SetExpr::Merge(statement) => { + // `WITH cte AS (...) ` — the DML statement runs in + // its own scope so its target binding doesn't share the + // enclosing query's scope with the CTEs. Without this, + // an unqualified predicate ref like `id` in + // `DELETE FROM t WHERE id IN (SELECT id FROM cte)` + // would see both `t` and `cte` in one scope and resolve + // ambiguously to None. CTEs stay reachable via the + // parent-scope walk-up. + self.with_branch_scope(|r| r.visit_statement(statement))?; + Ok(RelationSchema::Unknown) + } + SetExpr::Table(table) => { + self.visit_table_command(table); + Ok(RelationSchema::Unknown) + } + SetExpr::Values(values) => { + self.visit_values(values)?; + Ok(RelationSchema::Unknown) + } + } + } + + fn visit_select(&mut self, select: &Select) -> Result { + if let Some(Distinct::On(exprs)) = &select.distinct { + self.visit_exprs(exprs)?; + } + if let Some(top) = &select.top { + if let Some(TopQuantity::Expr(expr)) = &top.quantity { + self.visit_expr(expr)?; + } + } + for table in &select.from { + self.visit_table_with_joins(table, TableRole::Read)?; + } + let mut projection_items = Vec::with_capacity(select.projection.len()); + for item in &select.projection { + projection_items.push(self.build_projection_item(item)?); + } + self.push_projection_group(ProjectionGroup { + items: projection_items, + }); + if let Some(into) = &select.into { + // SELECT ... INTO new_table acts like CTAS — INTO is the write target. 
+ self.bind_base_table( + TableReference::try_from(&into.name)?, + None, + TableRole::Write, + ); + } + for lateral_view in &select.lateral_views { + self.visit_expr(&lateral_view.lateral_view)?; + } + for expr in [ + select.prewhere.as_ref(), + select.selection.as_ref(), + select.having.as_ref(), + select.qualify.as_ref(), + ] + .into_iter() + .flatten() + { + self.with_filter_clause(|r| r.visit_expr(expr))?; + } + for connect_by in &select.connect_by { + // CONNECT BY / START WITH are predicate-style hierarchical + // join conditions (Oracle / Snowflake) — subqueries nested + // here do not feed the enclosing write target. + self.with_filter_clause(|r| match connect_by { + ConnectByKind::ConnectBy { relationships, .. } => r.visit_exprs(relationships), + ConnectByKind::StartWith { condition, .. } => r.visit_expr(condition), + })?; + } + self.visit_group_by(&select.group_by)?; + // CLUSTER BY / DISTRIBUTE BY (Hive / Spark) — partitioning / + // clustering directives, walked as plain reads. + self.visit_exprs(&select.cluster_by)?; + self.visit_exprs(&select.distribute_by)?; + for order_by in &select.sort_by { + self.visit_order_by_expr(order_by)?; + } + for window in &select.named_window { + if let NamedWindowExpr::WindowSpec(spec) = &window.1 { + self.visit_window_spec(spec)?; + } + } + Ok(projection_schema(&select.projection)) + } + + /// Walk a single projection item's expression and snapshot the + /// refs it records, packaging name / source_refs / kind into a + /// `ProjectionItem`. 
+ pub(super) fn build_projection_item( + &mut self, + item: &SelectItem, + ) -> Result { + let refs_before = self.column_refs_len(); + self.visit_select_item(item)?; + let source_refs = self.column_refs_slice(refs_before).to_vec(); + Ok(ProjectionItem { + name: projection_item_output_name(item), + source_refs, + kind: projection_item_kind(item), + }) + } + + pub(super) fn visit_select_item(&mut self, item: &SelectItem) -> Result<(), Error> { + match item { + SelectItem::UnnamedExpr(expr) | SelectItem::ExprWithAlias { expr, .. } => { + self.visit_expr(expr) + } + SelectItem::QualifiedWildcard(SelectItemQualifiedWildcardKind::Expr(expr), options) => { + self.record_wildcard_suppressed( + "qualified wildcard `(expr).*`", + options.wildcard_token.0.span, + ); + self.visit_expr(expr) + } + SelectItem::QualifiedWildcard( + SelectItemQualifiedWildcardKind::ObjectName(name), + options, + ) => { + self.record_wildcard_suppressed( + &format!("qualified wildcard `{}.*`", name), + options.wildcard_token.0.span, + ); + self.visit_wildcard_options(options) + } + SelectItem::Wildcard(options) => { + self.record_wildcard_suppressed("wildcard `*`", options.wildcard_token.0.span); + self.visit_wildcard_options(options) + } + } + } + + fn visit_table_command(&mut self, table: &Table) { + let Some(name) = &table.table_name else { + return; + }; + // `TABLE foo` is sugar for `SELECT * FROM foo` — foo is read. 
+ self.bind_base_table( + TableReference { + catalog: None, + schema: table + .schema_name + .as_ref() + .map(|schema| schema.as_str().into()), + name: name.as_str().into(), + }, + None, + TableRole::Read, + ); + } + + fn visit_values(&mut self, values: &Values) -> Result<(), Error> { + for row in &values.rows { + self.visit_exprs(row)?; + } + Ok(()) + } + + fn visit_group_by(&mut self, group_by: &GroupByExpr) -> Result<(), Error> { + match group_by { + GroupByExpr::All(modifiers) => self.visit_group_by_modifiers(modifiers), + GroupByExpr::Expressions(exprs, modifiers) => { + self.visit_exprs(exprs)?; + self.visit_group_by_modifiers(modifiers) + } + } + } + + fn visit_group_by_modifiers(&mut self, modifiers: &[GroupByWithModifier]) -> Result<(), Error> { + for modifier in modifiers { + if let GroupByWithModifier::GroupingSets(expr) = modifier { + self.visit_expr(expr)?; + } + } + Ok(()) + } +} + +/// Derive an output `RelationSchema` from a `SELECT` projection, structurally only. +/// Wildcards and computed expressions fall back to `RelationSchema::Unknown`; that +/// gap is filled in later phases once catalog and in-scope relation schemas +/// can drive expansion. +fn projection_schema(projection: &[SelectItem]) -> RelationSchema { + let mut columns = Vec::with_capacity(projection.len()); + for item in projection { + match column_from_select_item(item) { + Some(column) => columns.push(column), + None => return RelationSchema::Unknown, + } + } + RelationSchema::Known(columns) +} + +fn column_from_select_item(item: &SelectItem) -> Option { + match item { + SelectItem::ExprWithAlias { alias, .. 
} => Some(alias.clone()), + SelectItem::UnnamedExpr(expr) => column_from_expr(expr), + SelectItem::Wildcard(_) | SelectItem::QualifiedWildcard(_, _) => None, + } +} + +fn column_from_expr(expr: &Expr) -> Option { + match expr { + Expr::Identifier(ident) => Some(ident.clone()), + Expr::CompoundIdentifier(parts) => parts.last().cloned(), + _ => None, + } +} diff --git a/sql-insight/src/resolver/rename.rs b/sql-insight/src/resolver/rename.rs new file mode 100644 index 0000000..2443f58 --- /dev/null +++ b/sql-insight/src/resolver/rename.rs @@ -0,0 +1,59 @@ +//! Column-list rename for `WITH cte(a, b) AS (...)` and +//! `(SELECT ...) d(a, b)` aliases. Applied to both the body's +//! `output_schema` and its `projection_groups` so lineage composition's +//! name-match lookup finds the renamed columns. + +use super::{ProjectionGroup, RelationSchema}; + +/// Apply a column alias rename list to a body's `output_schema`. The +/// alias at position N overrides the body's inferred column at +/// position N; body columns past the alias list keep their inferred +/// names. An empty rename list returns `schema` unchanged; an +/// `Unknown` body schema is promoted to `Known` containing exactly +/// the declared rename columns (the only columns we can name with +/// certainty after a rename clause). 
+pub(crate) fn rename_relation_schema( + schema: RelationSchema, + renames: &[sqlparser::ast::TableAliasColumnDef], +) -> RelationSchema { + if renames.is_empty() { + return schema; + } + match schema { + RelationSchema::Unknown => { + RelationSchema::Known(renames.iter().map(|r| r.name.clone()).collect()) + } + RelationSchema::Known(mut cols) => { + for (position, rename) in renames.iter().enumerate() { + if let Some(col) = cols.get_mut(position) { + *col = rename.name.clone(); + } else { + cols.push(rename.name.clone()); + } + } + RelationSchema::Known(cols) + } + } +} + +/// Apply the same rename to the projection items' inferred names so +/// lineage composition's name-match lookup finds the renamed columns. +/// Position N in the rename list overrides position N's item name; +/// positions beyond the list keep their body-inferred names. Each +/// `ProjectionGroup` (set-op branch) is renamed independently. +pub(crate) fn rename_projection_groups( + mut groups: Vec, + renames: &[sqlparser::ast::TableAliasColumnDef], +) -> Vec { + if renames.is_empty() { + return groups; + } + for group in &mut groups { + for (position, item) in group.items.iter_mut().enumerate() { + if let Some(rename) = renames.get(position) { + item.name = Some(rename.name.clone()); + } + } + } + groups +} diff --git a/sql-insight/src/resolver/statement.rs b/sql-insight/src/resolver/statement.rs new file mode 100644 index 0000000..ede13ff --- /dev/null +++ b/sql-insight/src/resolver/statement.rs @@ -0,0 +1,664 @@ +use super::{LineageTargetSpec, ProjectionGroup, RelationSchema, Resolver, TableRole}; +use crate::error::Error; +use crate::reference::TableReference; +use sqlparser::ast::{ + Delete, FromTable, Ident, Merge, ObjectType, OnConflictAction, OnInsert, SelectItem, Statement, + TableWithJoins, Update, UpdateTableFromKind, +}; + +impl<'a> Resolver<'a> { + pub(super) fn visit_statement(&mut self, statement: &Statement) -> Result<(), Error> { + // Keep this match exhaustive. 
Unsupported variants are listed explicitly so sqlparser + // Statement additions become compile errors instead of silent misses. + match statement { + Statement::Query(query) => self.resolve_query_emitting_query_output(query).map(|_| ()), + Statement::Insert(insert) => self.visit_insert(insert), + Statement::Update(update) => self.visit_update(update), + Statement::Delete(delete) => self.visit_delete(delete), + Statement::Merge(merge) => self.visit_merge(merge), + Statement::CreateTable(create_table) => { + let target = TableReference::try_from(&create_table.name)?; + self.bind_base_table(target.clone(), None, TableRole::Write); + if let Some(query) = &create_table.query { + // CTAS: source projections pair with the new + // table's columns. Explicit column defs (if any) + // win over inferred names from the source SELECT. + let explicit: Vec = create_table + .columns + .iter() + .map(|c| c.name.clone()) + .collect(); + let resolved = self.resolve_query(query)?; + self.emit_relation_to_created(&target, &explicit, &resolved); + } + Ok(()) + } + Statement::CreateView(create_view) => { + let target = TableReference::try_from(&create_view.name)?; + self.bind_base_table(target.clone(), None, TableRole::Write); + let explicit: Vec = + create_view.columns.iter().map(|c| c.name.clone()).collect(); + let resolved = self.resolve_query(&create_view.query)?; + self.emit_relation_to_created(&target, &explicit, &resolved); + if let Some(to) = &create_view.to { + self.bind_base_table(TableReference::try_from(to)?, None, TableRole::Write); + } + Ok(()) + } + Statement::AlterView { + name, + query, + columns, + .. + } => { + let target = TableReference::try_from(name)?; + self.bind_base_table(target.clone(), None, TableRole::Write); + let resolved = self.resolve_query(query)?; + self.emit_relation_to_created(&target, columns, &resolved); + Ok(()) + } + Statement::CreateVirtualTable { name, .. 
} => { + self.bind_base_table(TableReference::try_from(name)?, None, TableRole::Write); + Ok(()) + } + Statement::AlterTable(alter_table) => { + self.bind_base_table( + TableReference::try_from(&alter_table.name)?, + None, + TableRole::Write, + ); + Ok(()) + } + Statement::Drop { + object_type, + names, + table, + .. + } => { + if matches!( + object_type, + ObjectType::Table | ObjectType::View | ObjectType::MaterializedView + ) { + for name in names { + self.bind_base_table( + TableReference::try_from(name)?, + None, + TableRole::Write, + ); + } + } + if let Some(table) = table { + self.bind_base_table(TableReference::try_from(table)?, None, TableRole::Write); + } + Ok(()) + } + Statement::Truncate(truncate) => { + for table in &truncate.table_names { + self.bind_base_table( + TableReference::try_from(&table.name)?, + None, + TableRole::Write, + ); + } + Ok(()) + } + Statement::Analyze(_) + | Statement::Set(_) + | Statement::Msck(_) + | Statement::Install { .. } + | Statement::Load { .. } + | Statement::Directory { .. } + | Statement::Case(_) + | Statement::If(_) + | Statement::While(_) + | Statement::Raise(_) + | Statement::Call(_) + | Statement::Copy { .. } + | Statement::CopyIntoSnowflake { .. } + | Statement::Open(_) + | Statement::Close { .. } + | Statement::CreateIndex(_) + | Statement::CreateRole(_) + | Statement::CreateSecret { .. } + | Statement::CreateServer(_) + | Statement::CreatePolicy(_) + | Statement::CreateConnector(_) + | Statement::CreateOperator(_) + | Statement::CreateOperatorFamily(_) + | Statement::CreateOperatorClass(_) + | Statement::AlterSchema(_) + | Statement::AlterIndex { .. } + | Statement::AlterType(_) + | Statement::AlterOperator(_) + | Statement::AlterOperatorFamily(_) + | Statement::AlterOperatorClass(_) + | Statement::AlterRole { .. } + | Statement::AlterPolicy(_) + | Statement::AlterConnector { .. } + | Statement::AlterSession { .. } + | Statement::AttachDatabase { .. } + | Statement::AttachDuckDBDatabase { .. 
} + | Statement::DetachDuckDBDatabase { .. } + | Statement::DropFunction(_) + | Statement::DropDomain(_) + | Statement::DropProcedure { .. } + | Statement::DropSecret { .. } + | Statement::DropPolicy(_) + | Statement::DropConnector { .. } + | Statement::Declare { .. } + | Statement::CreateExtension(_) + | Statement::DropExtension(_) + | Statement::DropOperator(_) + | Statement::DropOperatorFamily(_) + | Statement::DropOperatorClass(_) + | Statement::Fetch { .. } + | Statement::Flush { .. } + | Statement::Discard { .. } + | Statement::ShowFunctions { .. } + | Statement::ShowVariable { .. } + | Statement::ShowStatus { .. } + | Statement::ShowVariables { .. } + | Statement::ShowCreate { .. } + | Statement::ShowColumns { .. } + | Statement::ShowDatabases { .. } + | Statement::ShowSchemas { .. } + | Statement::ShowCharset(_) + | Statement::ShowObjects(_) + | Statement::ShowTables { .. } + | Statement::ShowViews { .. } + | Statement::ShowCollation { .. } + | Statement::Use(_) + | Statement::StartTransaction { .. } + | Statement::Comment { .. } + | Statement::Commit { .. } + | Statement::Rollback { .. } + | Statement::CreateSchema { .. } + | Statement::CreateDatabase { .. } + | Statement::CreateFunction(_) + | Statement::CreateTrigger(_) + | Statement::DropTrigger(_) + | Statement::CreateProcedure { .. } + | Statement::CreateMacro { .. } + | Statement::CreateStage { .. } + | Statement::Assert { .. } + | Statement::Grant(_) + | Statement::Deny(_) + | Statement::Revoke(_) + | Statement::Deallocate { .. } + | Statement::Execute { .. } + | Statement::Prepare { .. } + | Statement::Kill { .. } + | Statement::ExplainTable { .. } + | Statement::Explain { .. } + | Statement::Savepoint { .. } + | Statement::ReleaseSavepoint { .. } + | Statement::Cache { .. } + | Statement::UNCache { .. } + | Statement::CreateSequence { .. } + | Statement::CreateDomain(_) + | Statement::CreateType { .. } + | Statement::Pragma { .. } + | Statement::LockTables { .. 
} + | Statement::UnlockTables + | Statement::Unload { .. } + | Statement::OptimizeTable { .. } + | Statement::LISTEN { .. } + | Statement::UNLISTEN { .. } + | Statement::NOTIFY { .. } + | Statement::LoadData { .. } + | Statement::RenameTable(_) + | Statement::List(_) + | Statement::Remove(_) + | Statement::RaisError { .. } + | Statement::Print(_) + | Statement::Return(_) + | Statement::ExportData(_) + | Statement::CreateUser(_) + | Statement::AlterUser(_) + | Statement::Vacuum(_) + | Statement::Reset(_) => { + self.record_unsupported_statement(statement); + Ok(()) + } + } + } + + fn visit_insert(&mut self, insert: &sqlparser::ast::Insert) -> Result<(), Error> { + let (table, alias) = TableReference::from_insert_with_alias(insert)?; + let target_table = table.clone(); + self.bind_base_table(table, alias, TableRole::Write); + // Explicit column list wins; otherwise fall back to the + // catalog-provided schema (when present) for positional + // pairing. Without either, no lineage edges are emitted — + // we have no target column names to pair against. + let effective_columns = self.effective_target_columns(&insert.columns, &target_table); + let source_projections = if let Some(source) = &insert.source { + // Raw resolve_query (not the QueryOutput-emitting wrapper): + // INSERT pairs each projection item positionally with its + // target column instead, emitting Relation edges. UNION + // sources surface as multiple projection groups, so each + // branch pairs against the same target columns naturally. 
+ let resolved = self.resolve_query(source)?; + self.emit_per_projection(&resolved.projections, |position, _item| { + effective_columns + .get(position) + .map(|col| LineageTargetSpec::Relation { + table: target_table.clone(), + column: col.clone(), + }) + }); + resolved.projections + } else { + Vec::new() + }; + for assignment in &insert.assignments { + self.visit_expr(&assignment.value)?; + } + // Walk RETURNING before the ON-clause so EXCLUDED isn't yet + // bound: RETURNING projects from the target table, never from + // the would-be-inserted pseudo-row, and an in-scope EXCLUDED + // would ambify unqualified refs that collide with INSERT cols. + self.visit_returning(insert.returning.as_deref())?; + if let Some(on) = &insert.on { + self.visit_insert_on(on, &target_table, &effective_columns, &source_projections)?; + } + Ok(()) + } + + /// Walk a `RETURNING ` clause. Each item is treated + /// like a top-level SELECT projection: it contributes refs to + /// `column_refs` and a `QueryOutput` lineage edge per item. The + /// target table is the only binding in scope (the source SELECT's + /// inner scope has been popped by the time this runs), so + /// unqualified refs resolve to it. + fn visit_returning(&mut self, returning: Option<&[SelectItem]>) -> Result<(), Error> { + let Some(items) = returning else { + return Ok(()); + }; + let mut projection_items = Vec::with_capacity(items.len()); + for item in items { + projection_items.push(self.build_projection_item(item)?); + } + let projections = vec![ProjectionGroup { + items: projection_items, + }]; + self.emit_per_projection(&projections, |position, item| { + Some(LineageTargetSpec::QueryOutput { + name: item.name.clone(), + position, + }) + }); + Ok(()) + } + + /// Walk the optional ON-clause attached to an `INSERT`: + /// `ON CONFLICT ... DO UPDATE SET ...` (Postgres / Sqlite) or + /// `ON DUPLICATE KEY UPDATE ...` (MySQL). 
Both update-style + /// actions reuse [`Self::emit_assignment_lineage`] so each + /// assignment's RHS feeds a Relation-target lineage edge into the + /// INSERT target's column, identical to a standalone `UPDATE`. + /// + /// The `EXCLUDED` pseudo-table (Postgres) is bound as a synthetic + /// derived-table with the INSERT target's column list as its + /// schema, so `EXCLUDED.` refs filter out of the public + /// `reads` surface (matching how CTE / derived refs behave) while + /// still emitting valid lineage sources for the assignment edges. + /// MySQL's equivalent (`VALUES()`) is a function-call form + /// that visit_expr already walks; no extra binding needed. + fn visit_insert_on( + &mut self, + on: &OnInsert, + target_table: &TableReference, + effective_columns: &[Ident], + source_projections: &[super::ProjectionGroup], + ) -> Result<(), Error> { + match on { + OnInsert::DuplicateKeyUpdate(assignments) => { + // MySQL ON DUPLICATE KEY UPDATE doesn't expose the + // would-be-inserted row as a pseudo-table; `VALUES(col)` + // is the implicit-row form, parsed as a regular + // function call. Don't bind EXCLUDED here — doing so + // would make unqualified column refs inside the SET + // expressions ambiguous against the INSERT target. + self.emit_assignment_lineage(assignments, Some(target_table))?; + } + OnInsert::OnConflict(on_conflict) => { + if let OnConflictAction::DoUpdate(do_update) = &on_conflict.action { + // EXCLUDED in Postgres / Sqlite exposes the + // would-be-inserted row as a row source. Bind it + // as a synthetic derived-table with: + // - schema: the INSERT target's column list, so + // `EXCLUDED.` refs filter out of the public + // `reads` surface (like CTE / derived); + // - body_projections: the INSERT source's + // projections renamed positionally to the target + // column names, so `substitute_source` composes + // `EXCLUDED.` back to the actual source ref + // (e.g. `EXCLUDED.b` → source's `y` when the + // INSERT pairs (a, b) ← (x, y)). 
+ let excluded_schema = if effective_columns.is_empty() { + RelationSchema::Unknown + } else { + RelationSchema::Known(effective_columns.to_vec()) + }; + let body_projections = + excluded_body_projections(effective_columns, source_projections); + self.bind_derived_table( + Ident::new("EXCLUDED"), + excluded_schema, + body_projections, + ); + self.emit_assignment_lineage(&do_update.assignments, Some(target_table))?; + if let Some(selection) = &do_update.selection { + self.with_filter_clause(|r| r.visit_expr(selection))?; + } + } + } + // `OnInsert` is `#[non_exhaustive]` in sqlparser. New + // variants land silently here — revisit when sqlparser + // grows another conflict-action shape. + _ => {} + } + Ok(()) + } + + /// Emit Relation lineage edges for a CREATE-AS source: each + /// projection item pairs with the created relation's column at + /// the same position. Target column name comes from the explicit + /// column list when present, otherwise from the projection's + /// inferred name (alias > bare ident name); items without an + /// inferable name and no explicit slot are silently skipped. + /// Used by CTAS, CREATE VIEW, and ALTER VIEW. + /// + /// For UNION-bodied sources the result schema follows the LEFT + /// branch's names (SQL standard), so the inferred-name fallback + /// reads the first projection group's item names rather than the + /// current group's — making every branch pair against the same + /// target column at each position. Mirrors INSERT-SELECT-UNION + /// positional pairing. 
+ fn emit_relation_to_created( + &mut self, + target: &TableReference, + explicit_columns: &[sqlparser::ast::Ident], + resolved: &super::ResolvedQuery, + ) { + let inferred_left_names: Vec> = resolved + .projections + .first() + .map(|g| g.items.iter().map(|i| i.name.clone()).collect()) + .unwrap_or_default(); + self.emit_per_projection(&resolved.projections, |position, _item| { + explicit_columns + .get(position) + .cloned() + .or_else(|| inferred_left_names.get(position).cloned().flatten()) + .map(|column| LineageTargetSpec::Relation { + table: target.clone(), + column, + }) + }); + } + + fn visit_update(&mut self, update: &Update) -> Result<(), Error> { + // The head of update.table is the write target; joined tables + // (inside visit_table_with_joins) are reads by definition. + self.visit_table_with_joins(&update.table, TableRole::Write)?; + if let Some(from) = &update.from { + let tables = match from { + UpdateTableFromKind::BeforeSet(tables) | UpdateTableFromKind::AfterSet(tables) => { + tables + } + }; + for table in tables { + self.visit_table_with_joins(table, TableRole::Read)?; + } + } + let target_table = try_target_table_from_factor(&update.table.relation); + self.emit_assignment_lineage(&update.assignments, target_table.as_ref())?; + if let Some(selection) = &update.selection { + self.with_filter_clause(|r| r.visit_expr(selection))?; + } + self.visit_returning(update.returning.as_deref())?; + Ok(()) + } + + /// Walk each SET-style assignment's RHS expression and emit + /// Relation lineage edges from any newly recorded source refs into + /// the assignment's target column. Shared by `visit_update` and + /// MERGE's `WHEN MATCHED UPDATE` branch — both have identical + /// per-assignment semantics. Target column qualifier resolution: + /// qualified target (`t.col`) wins; bare target falls back to + /// `default_table` (UPDATE head / MERGE INTO target). 
+ fn emit_assignment_lineage( + &mut self, + assignments: &[sqlparser::ast::Assignment], + default_table: Option<&TableReference>, + ) -> Result<(), Error> { + for assignment in assignments { + let target_parts = assignment_target_parts(&assignment.target); + let kind = super::projection::expr_kind(&assignment.value); + let refs_before = self.column_refs_len(); + self.visit_expr(&assignment.value)?; + let Some(target_parts) = target_parts else { + continue; + }; + let Some(target_table_ref) = assignment_target_table(&target_parts, default_table) + else { + continue; + }; + let target = LineageTargetSpec::Relation { + table: target_table_ref, + column: target_parts.last().cloned().unwrap(), + }; + self.push_edges_from_refs_since(refs_before, target, kind); + } + Ok(()) + } + + fn visit_delete(&mut self, delete: &Delete) -> Result<(), Error> { + // Visit in alias-defining order so that later Write binds merge + // onto already-resolved `TableReference`s rather than overwriting + // them with bare names. + // + // The FROM clause's role depends on the shape of the DELETE: + // bare `DELETE FROM t` → FROM is write target + // `DELETE FROM target USING source` → FROM is write target, USING is read-and-alias-source + // `DELETE target FROM source` → FROM is read-and-alias-source, tables list is write target + // + // In the USING shape the alias-defining clause is USING, so visit + // USING first. In the explicit-target-list shape the + // alias-defining clause is FROM, which we also want visited before + // the tables list is merged on top. 
+ if let Some(using) = &delete.using { + for table in using { + self.visit_table_with_joins(table, TableRole::Read)?; + } + } + let from_role = if delete.tables.is_empty() { + TableRole::Write + } else { + TableRole::Read + }; + for table in from_table_items(&delete.from) { + self.visit_table_with_joins(table, from_role)?; + } + for name in &delete.tables { + self.bind_base_table(TableReference::try_from_name(name)?, None, TableRole::Write); + } + if let Some(selection) = &delete.selection { + self.with_filter_clause(|r| r.visit_expr(selection))?; + } + self.visit_returning(delete.returning.as_deref())?; + Ok(()) + } + + fn visit_merge(&mut self, merge: &Merge) -> Result<(), Error> { + use sqlparser::ast::{MergeAction, MergeInsertKind}; + self.visit_table_factor(&merge.table, TableRole::Write)?; + self.visit_table_factor(&merge.source, TableRole::Read)?; + self.with_filter_clause(|r| r.visit_expr(&merge.on))?; + let target_table = try_target_table_from_factor(&merge.table); + for clause in &merge.clauses { + if let Some(predicate) = &clause.predicate { + self.with_filter_clause(|r| r.visit_expr(predicate))?; + } + match &clause.action { + MergeAction::Insert(insert_expr) => { + if let Some(pred) = &insert_expr.insert_predicate { + self.with_filter_clause(|r| r.visit_expr(pred))?; + } + if let MergeInsertKind::Values(values) = &insert_expr.kind { + self.emit_merge_insert_lineage( + values, + &insert_expr.columns, + target_table.as_ref(), + )?; + } + // MergeInsertKind::Row (BigQuery `INSERT ROW`) — the + // source row is inserted as-is; per-column pairing + // needs catalog knowledge of the target schema. + } + MergeAction::Update(update_expr) => { + self.emit_assignment_lineage(&update_expr.assignments, target_table.as_ref())?; + } + MergeAction::Delete { .. } => { + // DELETE has no column-level value lineage. + } + } + } + Ok(()) + } + + /// Emit per-position Relation lineage edges for MERGE's + /// `WHEN NOT MATCHED THEN INSERT (cols) VALUES (...)`. 
Each value + /// expression's source refs pair with the column at the same + /// position in `columns`. Walks values with default `Projection` + /// kind for read classification. + fn emit_merge_insert_lineage( + &mut self, + values: &sqlparser::ast::Values, + columns: &[sqlparser::ast::ObjectName], + target_table: Option<&TableReference>, + ) -> Result<(), Error> { + // Resolve effective target column idents up-front: when the + // INSERT clause has an explicit list, take each ObjectName's + // last segment; otherwise fall back to the catalog-provided + // schema (returns empty without catalog, matching the + // no-pairing behavior). + let explicit_idents: Vec = columns + .iter() + .filter_map(|c| c.0.last().and_then(|p| p.as_ident().cloned())) + .collect(); + let effective_idents = match target_table { + Some(target) => self.effective_target_columns(&explicit_idents, target), + None => explicit_idents, + }; + for row in &values.rows { + for (position, value_expr) in row.iter().enumerate() { + let kind = super::projection::expr_kind(value_expr); + let refs_before = self.column_refs_len(); + self.visit_expr(value_expr)?; + let (Some(target_table), Some(col_ident)) = + (target_table, effective_idents.get(position)) + else { + continue; + }; + let target = LineageTargetSpec::Relation { + table: target_table.clone(), + column: col_ident.clone(), + }; + self.push_edges_from_refs_since(refs_before, target, kind); + } + } + Ok(()) + } +} + +/// Rename each source projection group's items positionally to the +/// INSERT target's column names — the EXCLUDED pseudo-table exposes +/// the would-be-inserted row, so `EXCLUDED.` should +/// compose back to whatever expression feeds that position of the +/// source. Returns an empty `Vec` when there are no source +/// projections (e.g. `INSERT ... VALUES (...) ON CONFLICT ...`), +/// in which case `substitute_source` falls back to leaving +/// `EXCLUDED.` as the lineage source. 
+fn excluded_body_projections( + effective_columns: &[Ident], + source_projections: &[super::ProjectionGroup], +) -> Vec { + if source_projections.is_empty() || effective_columns.is_empty() { + return Vec::new(); + } + source_projections + .iter() + .map(|group| { + let mut g = group.clone(); + for (position, item) in g.items.iter_mut().enumerate() { + if let Some(name) = effective_columns.get(position) { + item.name = Some(name.clone()); + } + } + g + }) + .collect() +} + +fn from_table_items(from: &FromTable) -> &[TableWithJoins] { + match from { + FromTable::WithFromKeyword(items) | FromTable::WithoutKeyword(items) => items, + } +} + +/// Best-effort extraction of a write-target `TableReference` from a +/// `TableFactor`. Only the plain `TableFactor::Table` variant has a +/// resolvable identity; derived / pivot / table-function targets are +/// not valid SQL write targets and return `None`, leaving the caller's +/// assignment / pairing logic to fall back to qualifier-only target +/// derivation. +fn try_target_table_from_factor(factor: &sqlparser::ast::TableFactor) -> Option { + matches!(factor, sqlparser::ast::TableFactor::Table { .. }) + .then(|| TableReference::try_from(factor).ok()) + .flatten() +} + +fn assignment_target_parts( + target: &sqlparser::ast::AssignmentTarget, +) -> Option> { + match target { + sqlparser::ast::AssignmentTarget::ColumnName(name) => name + .0 + .iter() + .map(|p| p.as_ident().cloned()) + .collect::>>(), + sqlparser::ast::AssignmentTarget::Tuple(_) => None, + } +} + +/// Derive the owning `TableReference` for an UPDATE SET target. +/// `parts.len() == 1`: bare column, take the UPDATE head as default. +/// `parts.len() >= 2`: take the leading parts as catalog/schema/table. 
+fn assignment_target_table( + parts: &[sqlparser::ast::Ident], + default_table: Option<&TableReference>, +) -> Option { + match parts.len() { + 0 => None, + 1 => default_table.cloned(), + 2 => Some(TableReference { + catalog: None, + schema: None, + name: parts[0].clone(), + }), + 3 => Some(TableReference { + catalog: None, + schema: Some(parts[0].clone()), + name: parts[1].clone(), + }), + 4 => Some(TableReference { + catalog: Some(parts[0].clone()), + schema: Some(parts[1].clone()), + name: parts[2].clone(), + }), + _ => None, + } +} diff --git a/sql-insight/src/resolver/table.rs b/sql-insight/src/resolver/table.rs new file mode 100644 index 0000000..fb01812 --- /dev/null +++ b/sql-insight/src/resolver/table.rs @@ -0,0 +1,351 @@ +use super::{RelationSchema, Resolver, TableRole}; +use crate::error::Error; +use crate::reference::TableReference; +use sqlparser::ast::{ + FunctionArg, Join, JoinConstraint, JoinOperator, PivotValueSource, TableFactor, TableSample, + TableSampleKind, TableWithJoins, +}; + +impl<'a> Resolver<'a> { + /// Visit a `TableWithJoins`. `role` applies only to the head relation; + /// joined tables are always read-position (a write target makes no + /// sense in a JOIN for any of our statement kinds). 
+ pub(super) fn visit_table_with_joins( + &mut self, + table: &TableWithJoins, + role: TableRole, + ) -> Result<(), Error> { + self.visit_table_factor(&table.relation, role)?; + for join in &table.joins { + self.visit_join(join)?; + } + Ok(()) + } + + pub(super) fn visit_join(&mut self, join: &Join) -> Result<(), Error> { + self.visit_table_factor(&join.relation, TableRole::Read)?; + match &join.join_operator { + JoinOperator::Join(constraint) + | JoinOperator::Inner(constraint) + | JoinOperator::Left(constraint) + | JoinOperator::LeftOuter(constraint) + | JoinOperator::Right(constraint) + | JoinOperator::RightOuter(constraint) + | JoinOperator::FullOuter(constraint) + | JoinOperator::CrossJoin(constraint) + | JoinOperator::Semi(constraint) + | JoinOperator::LeftSemi(constraint) + | JoinOperator::RightSemi(constraint) + | JoinOperator::Anti(constraint) + | JoinOperator::LeftAnti(constraint) + | JoinOperator::RightAnti(constraint) + | JoinOperator::StraightJoin(constraint) => self.visit_join_constraint(constraint), + JoinOperator::AsOf { + match_condition, + constraint, + } => { + self.with_filter_clause(|r| r.visit_expr(match_condition))?; + self.visit_join_constraint(constraint) + } + JoinOperator::CrossApply | JoinOperator::OuterApply => Ok(()), + } + } + + fn visit_join_constraint(&mut self, constraint: &JoinConstraint) -> Result<(), Error> { + match constraint { + JoinConstraint::On(expr) => self.with_filter_clause(|r| r.visit_expr(expr)), + JoinConstraint::Using(_) | JoinConstraint::Natural | JoinConstraint::None => Ok(()), + } + } + + /// Visit a `TableFactor`. `role` is consumed only by the `Table` + /// variant where it controls how the resulting binding is stamped; + /// the other variants (Derived, NestedJoin, Pivot, ...) only bind + /// aliases that are `DerivedTable` / `TableFunction` — they don't + /// carry a table role. 
+ pub(super) fn visit_table_factor( + &mut self, + table_factor: &TableFactor, + role: TableRole, + ) -> Result<(), Error> { + match table_factor { + TableFactor::Table { + name, + alias, + args, + with_hints, + sample, + .. + } => { + if self.is_cte_reference(name) { + // Carry the original CTE's schema + body_projections + // to the local binding so: + // 1. lineage composition works through the use site + // (`FROM cte AS c` → `c.col` and `FROM cte` → + // `cte.col` both compose to the body's source); + // 2. catalog-aware strictness still applies — refs + // against a Known schema that doesn't list the + // column still surface as unresolved instead of + // getting absorbed by the synthetic binding; + // 3. unqualified refs in the current scope have a + // single in-scope candidate — without this + // re-bind, bare refs in `WITH cte AS (...) + // INSERT INTO t ... SELECT x FROM cte` would + // walk up and ambify against the outer-bound + // INSERT target. + let body = self.cte_body_projections(name); + let schema = self.cte_schema(name); + let bind_name = match alias { + Some(a) => a.name.clone(), + // `is_cte_reference` already returned true, + // so `name` is a single-segment ObjectName + // whose head is an Ident. + None => name.0[0].as_ident().cloned().unwrap(), + }; + self.bind_cte(bind_name, schema, body); + return Ok(()); + } + let (table, alias_ident) = + TableReference::from_table_factor_with_alias(table_factor)?; + self.bind_base_table(table, alias_ident, role); + if let Some(args) = args { + self.visit_table_function_args(&args.args)?; + if let Some(settings) = &args.settings { + for setting in settings { + self.visit_expr(&setting.value)?; + } + } + } + self.visit_exprs(with_hints)?; + if let Some(sample) = sample { + self.visit_table_sample_kind(sample)?; + } + } + TableFactor::Derived { + subquery, + alias, + sample, + .. 
+ } => { + // Raw resolve_query — same rationale as CTE bodies: + // the derived subquery's projection isn't a query + // result on its own, and storing its projections on + // the binding lets lineage composition substitute + // through the derived alias. + let resolved = self.resolve_query(subquery)?; + if let Some(alias) = alias { + let renames = &alias.columns; + let renamed_schema = + super::rename_relation_schema(resolved.output_schema, renames); + let renamed_projections = + super::rename_projection_groups(resolved.projections, renames); + self.bind_derived_table( + alias.name.clone(), + renamed_schema, + renamed_projections, + ); + } + if let Some(sample) = sample { + self.visit_table_sample_kind(sample)?; + } + } + TableFactor::NestedJoin { + table_with_joins, + alias, + } => { + self.visit_table_with_joins(table_with_joins, TableRole::Read)?; + if let Some(alias) = alias { + self.bind_derived_table( + alias.name.clone(), + RelationSchema::Unknown, + Vec::new(), + ); + } + } + TableFactor::Pivot { + table, + aggregate_functions, + value_column, + value_source, + default_on_null, + alias, + .. + } => { + self.visit_table_factor(table, TableRole::Read)?; + for expr in aggregate_functions { + self.visit_expr(&expr.expr)?; + } + self.visit_exprs(value_column)?; + self.visit_pivot_value_source(value_source)?; + if let Some(expr) = default_on_null { + self.visit_expr(expr)?; + } + if let Some(alias) = alias { + self.bind_derived_table( + alias.name.clone(), + RelationSchema::Unknown, + Vec::new(), + ); + } + } + TableFactor::Unpivot { + table, + value, + columns, + alias, + .. + } => { + self.visit_table_factor(table, TableRole::Read)?; + self.visit_expr(value)?; + for expr in columns { + self.visit_expr(&expr.expr)?; + } + if let Some(alias) = alias { + self.bind_derived_table( + alias.name.clone(), + RelationSchema::Unknown, + Vec::new(), + ); + } + } + TableFactor::MatchRecognize { + table, + partition_by, + order_by, + measures, + symbols, + alias, + .. 
+ } => { + self.visit_table_factor(table, TableRole::Read)?; + self.visit_exprs(partition_by)?; + for order_by in order_by { + self.visit_order_by_expr(order_by)?; + } + for measure in measures { + self.visit_expr(&measure.expr)?; + } + for symbol in symbols { + self.visit_expr(&symbol.definition)?; + } + if let Some(alias) = alias { + self.bind_derived_table( + alias.name.clone(), + RelationSchema::Unknown, + Vec::new(), + ); + } + } + TableFactor::TableFunction { expr, alias } => { + self.visit_expr(expr)?; + if let Some(alias) = alias { + self.bind_table_function(alias.name.clone()); + } + } + TableFactor::Function { args, alias, .. } => { + self.visit_table_function_args(args)?; + if let Some(alias) = alias { + self.bind_table_function(alias.name.clone()); + } + } + TableFactor::UNNEST { + alias, array_exprs, .. + } => { + self.visit_exprs(array_exprs)?; + if let Some(alias) = alias { + self.bind_table_function(alias.name.clone()); + } + } + TableFactor::JsonTable { + json_expr, alias, .. + } + | TableFactor::OpenJsonTable { + json_expr, alias, .. + } => { + self.visit_expr(json_expr)?; + if let Some(alias) = alias { + self.bind_table_function(alias.name.clone()); + } + } + TableFactor::XmlTable { + row_expression, + passing, + alias, + .. + } => { + self.visit_expr(row_expression)?; + for argument in &passing.arguments { + self.visit_expr(&argument.expr)?; + } + if let Some(alias) = alias { + self.bind_table_function(alias.name.clone()); + } + } + TableFactor::SemanticView { + dimensions, + metrics, + facts, + where_clause, + alias, + .. 
+ } => { + self.visit_exprs(dimensions)?; + self.visit_exprs(metrics)?; + self.visit_exprs(facts)?; + if let Some(expr) = where_clause { + self.visit_expr(expr)?; + } + if let Some(alias) = alias { + self.bind_table_function(alias.name.clone()); + } + } + } + Ok(()) + } + + fn visit_table_function_args(&mut self, args: &[FunctionArg]) -> Result<(), Error> { + for arg in args { + self.visit_function_arg(arg)?; + } + Ok(()) + } + + fn visit_table_sample_kind(&mut self, sample: &TableSampleKind) -> Result<(), Error> { + match sample { + TableSampleKind::BeforeTableAlias(sample) + | TableSampleKind::AfterTableAlias(sample) => self.visit_table_sample(sample), + } + } + + pub(super) fn visit_table_sample(&mut self, sample: &TableSample) -> Result<(), Error> { + if let Some(quantity) = &sample.quantity { + self.visit_expr(&quantity.value)?; + } + if let Some(expr) = &sample.offset { + self.visit_expr(expr)?; + } + Ok(()) + } + + pub(super) fn visit_pivot_value_source( + &mut self, + value_source: &PivotValueSource, + ) -> Result<(), Error> { + match value_source { + PivotValueSource::List(values) => { + for value in values { + self.visit_expr(&value.expr)?; + } + Ok(()) + } + PivotValueSource::Any(order_by) => { + for expr in order_by { + self.visit_order_by_expr(expr)?; + } + Ok(()) + } + // PIVOT value subquery is an intermediate — raw resolve. 
+ PivotValueSource::Subquery(query) => self.resolve_query(query).map(|_| ()), + } + } +} diff --git a/sql-insight/tests/integration.rs b/sql-insight/tests/integration.rs index bd483b0..cbe3344 100644 --- a/sql-insight/tests/integration.rs +++ b/sql-insight/tests/integration.rs @@ -1,132 +1,748 @@ -#[cfg(test)] -mod integration { - use sql_insight::test_utils::all_dialects; - use sql_insight::{CrudTables, NormalizerOptions}; - use sql_insight::{TableReference, Tables}; - - mod format { - use super::*; - - #[test] - fn test_format() { - let sql = "SELECT a FROM t1 WHERE b = 1 AND c in (2, 3) AND d LIKE '%foo'"; - for dialect in all_dialects() { - let result = sql_insight::format(dialect.as_ref(), sql).unwrap(); - assert_eq!( - result, - ["SELECT a FROM t1 WHERE b = 1 AND c IN (2, 3) AND d LIKE '%foo'"], - "Failed for dialect: {dialect:?}" - ) - } +//! Integration tests covering the public API surface end-to-end. +//! +//! `tests/integration.rs` is compiled as its own crate, so the +//! top-level items are equivalent to a `mod tests` in the library — +//! no extra wrapper module needed. 
+ +use sql_insight::sqlparser::dialect::GenericDialect; +use sql_insight::test_utils::all_dialects; +use sql_insight::{ + extract_column_operations, extract_crud_tables, extract_table_operations, extract_tables, + Catalog, ColumnLevelDiagnostic, ColumnLevelDiagnosticKind, ColumnLineageKind, ColumnSchema, + ColumnTarget, CrudTables, NormalizerOptions, StatementKind, TableExtraction, + TableLevelDiagnosticKind, TableReference, Tables, +}; +use std::collections::HashMap; + +mod format { + use super::*; + + #[test] + fn test_format() { + let sql = "SELECT a FROM t1 WHERE b = 1 AND c in (2, 3) AND d LIKE '%foo'"; + for dialect in all_dialects() { + let result = sql_insight::format(dialect.as_ref(), sql).unwrap(); + assert_eq!( + result, + ["SELECT a FROM t1 WHERE b = 1 AND c IN (2, 3) AND d LIKE '%foo'"], + "Failed for dialect: {dialect:?}" + ) } } +} - mod normalize { - use super::*; +mod normalize { + use super::*; - #[test] - fn test_normalize() { - let sql = "SELECT a FROM t1 WHERE b = 1 AND c in (2, 3) AND d LIKE '%foo'"; - for dialect in all_dialects() { - let result = sql_insight::normalize(dialect.as_ref(), sql).unwrap(); - assert_eq!( - result, - ["SELECT a FROM t1 WHERE b = ? AND c IN (?, ?) AND d LIKE ?"], - "Failed for dialect: {dialect:?}" - ) + #[test] + fn test_normalize() { + let sql = "SELECT a FROM t1 WHERE b = 1 AND c in (2, 3) AND d LIKE '%foo'"; + for dialect in all_dialects() { + let result = sql_insight::normalize(dialect.as_ref(), sql).unwrap(); + assert_eq!( + result, + ["SELECT a FROM t1 WHERE b = ? AND c IN (?, ?) 
AND d LIKE ?"], + "Failed for dialect: {dialect:?}" + ) + } + } + + #[test] + fn test_normalize_with_options() { + let sql = "SELECT a FROM t1 WHERE b = 1 AND c in (2, 3, 4); INSERT INTO t2 (a, b, c) VALUES (1, 2, 3), (4, 5, 6)"; + for dialect in all_dialects() { + let result = sql_insight::normalize_with_options( + dialect.as_ref(), + sql, + NormalizerOptions::new() + .with_unify_in_list(true) + .with_unify_values(true), + ) + .unwrap(); + assert_eq!( + result, + [ + "SELECT a FROM t1 WHERE b = ? AND c IN (...)", + "INSERT INTO t2 (a, b, c) VALUES (...)" + ], + "Failed for dialect: {dialect:?}" + ) + } + } +} + +mod extract_crud_tables { + use super::*; + + #[test] + fn test_extract_crud_tables() { + let sql = "SELECT a FROM t1 WHERE b = 1 AND c in (2, 3) AND d LIKE '%foo'; SELECT b FROM t2 WHERE c = 4"; + for dialect in all_dialects() { + let result = extract_crud_tables(dialect.as_ref(), sql).unwrap(); + assert_eq!( + result, + vec![ + Ok(CrudTables { + create_tables: vec![], + read_tables: vec![TableReference { + catalog: None, + schema: None, + name: "t1".into(), + }], + update_tables: vec![], + delete_tables: vec![], + diagnostics: vec![], + }), + Ok(CrudTables { + create_tables: vec![], + read_tables: vec![TableReference { + catalog: None, + schema: None, + name: "t2".into(), + }], + update_tables: vec![], + delete_tables: vec![], + diagnostics: vec![], + }), + ], + "Failed for dialect: {dialect:?}" + ) + } + } + + #[test] + fn test_extract_crud_tables_with_cte() { + let sql = "WITH t2 AS (SELECT id FROM t1) SELECT * FROM t2"; + for dialect in all_dialects() { + let result = extract_crud_tables(dialect.as_ref(), sql).unwrap(); + assert_eq!( + result, + vec![Ok(CrudTables { + create_tables: vec![], + read_tables: vec![TableReference { + catalog: None, + schema: None, + name: "t1".into(), + }], + update_tables: vec![], + delete_tables: vec![], + diagnostics: vec![], + })], + "Failed for dialect: {dialect:?}" + ) + } + } +} + +mod extract_tables { + use 
super::*; + + #[test] + fn test_extract_tables() { + let sql = "SELECT a FROM t1 WHERE b = 1 AND c in (2, 3) AND d LIKE '%foo'; SELECT b FROM t2 WHERE c = 4"; + for dialect in all_dialects() { + let result = extract_tables(dialect.as_ref(), sql).unwrap(); + let result = result + .into_iter() + .map(|result| result.map(TableExtraction::into_tables)) + .collect::>>(); + assert_eq!( + result, + vec![ + Ok(Tables(vec![TableReference { + catalog: None, + schema: None, + name: "t1".into(), + }])), + Ok(Tables(vec![TableReference { + catalog: None, + schema: None, + name: "t2".into(), + }])), + ], + "Failed for dialect: {dialect:?}" + ) + } + } + + #[test] + fn test_extract_tables_with_cte() { + let sql = "WITH t2 AS (SELECT id FROM t1) SELECT * FROM t2"; + for dialect in all_dialects() { + let result = extract_tables(dialect.as_ref(), sql).unwrap(); + let result = result + .into_iter() + .map(|result| result.map(TableExtraction::into_tables)) + .collect::>>(); + assert_eq!( + result, + vec![Ok(Tables(vec![TableReference { + catalog: None, + schema: None, + name: "t1".into(), + }]))], + "Failed for dialect: {dialect:?}" + ) + } + } + + #[test] + fn test_extract_tables_reports_diagnostics() { + let result = extract_tables(&GenericDialect {}, "SET x = 1").unwrap(); + let extraction = result.into_iter().next().unwrap().unwrap(); + assert_eq!(extraction.tables, vec![]); + assert_eq!(extraction.diagnostics.len(), 1); + assert_eq!( + extraction.diagnostics[0].kind, + TableLevelDiagnosticKind::UnsupportedStatement + ); + } +} + +mod extract_table_operations { + use super::*; + + fn table(name: &str) -> TableReference { + TableReference { + catalog: None, + schema: None, + name: name.into(), + } + } + + #[test] + fn select_classifies_kind_and_collects_reads() { + let result = + extract_table_operations(&GenericDialect {}, "SELECT a FROM t1", None).unwrap(); + let ops = result[0].as_ref().unwrap(); + assert_eq!(ops.statement_kind, StatementKind::Select); + 
assert_eq!(ops.reads.len(), 1); + assert_eq!(ops.reads[0], table("t1")); + assert!(ops.writes.is_empty()); + assert!(ops.lineage.is_empty()); + } + + #[test] + fn insert_select_emits_source_to_target_lineage() { + let sql = "INSERT INTO orders (id, total) SELECT id, amount FROM staging"; + let result = extract_table_operations(&GenericDialect {}, sql, None).unwrap(); + let ops = result[0].as_ref().unwrap(); + assert_eq!(ops.statement_kind, StatementKind::Insert); + assert_eq!(ops.reads, vec![table("staging")]); + assert_eq!(ops.writes, vec![table("orders")]); + assert_eq!(ops.lineage.len(), 1); + assert_eq!(ops.lineage[0].source, table("staging")); + assert_eq!(ops.lineage[0].target, table("orders")); + } + + #[test] + fn multi_statement_batch_returns_per_statement_results() { + let sql = "SELECT * FROM t1; INSERT INTO t2 SELECT * FROM t3"; + let result = extract_table_operations(&GenericDialect {}, sql, None).unwrap(); + assert_eq!(result.len(), 2); + assert_eq!( + result[0].as_ref().unwrap().statement_kind, + StatementKind::Select + ); + assert_eq!( + result[1].as_ref().unwrap().statement_kind, + StatementKind::Insert + ); + } + + #[test] + fn unsupported_statement_surfaces_diagnostic() { + let result = + extract_table_operations(&GenericDialect {}, "CREATE INDEX idx ON t1 (a)", None) + .unwrap(); + let ops = result[0].as_ref().unwrap(); + assert_eq!(ops.statement_kind, StatementKind::Unsupported); + assert!(ops + .diagnostics + .iter() + .any(|d| matches!(d.kind, TableLevelDiagnosticKind::UnsupportedStatement))); + } +} + +mod extract_column_operations { + use super::*; + + fn col(table: &str, name: &str) -> sql_insight::ColumnReference { + sql_insight::ColumnReference { + table: Some(TableReference { + catalog: None, + schema: None, + name: table.into(), + }), + name: name.into(), + } + } + + #[test] + fn select_collects_per_column_reads() { + let sql = "SELECT a FROM t1 WHERE b > 0"; + let result = extract_column_operations(&GenericDialect {}, sql, 
None).unwrap(); + let ops = result[0].as_ref().unwrap(); + // Both the projection `a` and the filter `b` surface as reads + // (occurrence list, no clause tag). value-vs-filter is + // recovered structurally: `a` is also a lineage source, `b` is not. + let names: Vec<_> = ops.reads.iter().map(|r| r.name.value.as_str()).collect(); + assert_eq!(names, vec!["a", "b"]); + let lineage_sources: Vec<_> = ops + .lineage + .iter() + .map(|f| f.source.name.value.as_str()) + .collect(); + assert_eq!(lineage_sources, vec!["a"]); // `b` (filter) is not a lineage source + } + + #[test] + fn insert_select_emits_per_column_lineage() { + let sql = "INSERT INTO orders (id, total) SELECT id, amount FROM staging"; + let result = extract_column_operations(&GenericDialect {}, sql, None).unwrap(); + let ops = result[0].as_ref().unwrap(); + assert_eq!(ops.lineage.len(), 2); + // Both lineage edges are Passthrough into Relation targets. + for edge in &ops.lineage { + assert!(matches!(edge.kind, ColumnLineageKind::Passthrough)); + assert!(matches!(edge.target, ColumnTarget::Relation(_))); + } + } + + #[test] + fn aggregate_projection_marks_transformation() { + let sql = "INSERT INTO summary (total) SELECT SUM(amount) FROM staging"; + let result = extract_column_operations(&GenericDialect {}, sql, None).unwrap(); + let ops = result[0].as_ref().unwrap(); + assert_eq!(ops.lineage.len(), 1); + assert_eq!(ops.lineage[0].source, col("staging", "amount")); + // SUM changes the value → Transformation (the 2-way kind no + // longer distinguishes aggregation from other transforms). 
+ assert!(matches!( + ops.lineage[0].kind, + ColumnLineageKind::Transformation + )); + } + + #[test] + fn wildcard_in_projection_yields_wildcard_suppressed_diagnostic() { + let result = + extract_column_operations(&GenericDialect {}, "SELECT * FROM t1", None).unwrap(); + let ops = result[0].as_ref().unwrap(); + assert!(ops + .diagnostics + .iter() + .any(|d| matches!(d.kind, ColumnLevelDiagnosticKind::WildcardSuppressed))); + } +} + +mod catalog { + use super::*; + + #[derive(Debug, Default)] + struct TestCatalog { + tables: HashMap>, + } + + impl TestCatalog { + fn with(mut self, name: &str, cols: Vec<&'static str>) -> Self { + self.tables.insert(name.to_string(), cols); + self + } + } + + impl Catalog for TestCatalog { + fn columns(&self, table: &TableReference) -> Option> { + self.tables.get(table.name.value.as_str()).map(|cols| { + cols.iter() + .map(|c| ColumnSchema { + name: c.to_string(), + }) + .collect() + }) + } + } + + fn count_kind(diagnostics: &[ColumnLevelDiagnostic], kind: ColumnLevelDiagnosticKind) -> usize { + diagnostics.iter().filter(|d| d.kind == kind).count() + } + + #[test] + fn insert_without_explicit_columns_pairs_via_catalog() { + // Without explicit `(a, b)`, the resolver needs the catalog to + // know the target's columns and pair source projections. + let catalog = TestCatalog::default() + .with("orders", vec!["id", "total"]) + .with("staging", vec!["id", "amount"]); + let sql = "INSERT INTO orders SELECT id, amount FROM staging"; + let result = extract_column_operations(&GenericDialect {}, sql, Some(&catalog)).unwrap(); + let ops = result[0].as_ref().unwrap(); + // Two lineage edges into Relation targets orders.id / orders.total. 
+ let relation_targets: Vec<_> = ops + .lineage + .iter() + .filter_map(|f| match &f.target { + ColumnTarget::Relation(c) => Some(c.name.value.as_str()), + _ => None, + }) + .collect(); + assert!(relation_targets.contains(&"id")); + assert!(relation_targets.contains(&"total")); + } + + #[test] + fn ambiguous_column_diagnostic_only_with_catalog() { + let catalog = TestCatalog::default() + .with("t1", vec!["a"]) + .with("t2", vec!["a"]); + let sql = "SELECT a FROM t1 JOIN t2 ON t1.a = t2.a"; + + let with = extract_column_operations(&GenericDialect {}, sql, Some(&catalog)).unwrap(); + let without = extract_column_operations(&GenericDialect {}, sql, None).unwrap(); + + let with_count = count_kind( + &with[0].as_ref().unwrap().diagnostics, + ColumnLevelDiagnosticKind::AmbiguousColumn, + ); + let without_count = count_kind( + &without[0].as_ref().unwrap().diagnostics, + ColumnLevelDiagnosticKind::AmbiguousColumn, + ); + assert_eq!(with_count, 1, "with catalog should report AmbiguousColumn"); + assert_eq!( + without_count, 0, + "without catalog should stay silent (Unknown schemas)" + ); + } + + #[test] + fn unresolved_column_diagnostic_only_with_catalog() { + let catalog = TestCatalog::default().with("t1", vec!["a", "b"]); + let sql = "SELECT missing FROM t1"; + + let with = extract_column_operations(&GenericDialect {}, sql, Some(&catalog)).unwrap(); + let without = extract_column_operations(&GenericDialect {}, sql, None).unwrap(); + + let with_count = count_kind( + &with[0].as_ref().unwrap().diagnostics, + ColumnLevelDiagnosticKind::UnresolvedColumn, + ); + let without_count = count_kind( + &without[0].as_ref().unwrap().diagnostics, + ColumnLevelDiagnosticKind::UnresolvedColumn, + ); + assert_eq!(with_count, 1); + assert_eq!(without_count, 0); + } +} + +mod diagnostics { + use super::*; + + #[test] + fn unsupported_statement_kind_surfaces_via_table_operations() { + let result = + extract_table_operations(&GenericDialect {}, "CREATE INDEX idx ON t (a)", None) + .unwrap(); 
+ let ops = result[0].as_ref().unwrap(); + assert!(ops + .diagnostics + .iter() + .any(|d| matches!(d.kind, TableLevelDiagnosticKind::UnsupportedStatement))); + } + + #[test] + fn wildcard_diagnostic_carries_precise_span() { + // Pin down line *and* column for the `*` token. The wildcard + // sits at column 8 of `SELECT * FROM t1` (1-indexed, + // immediately after `SELECT `). This pin-down means that if + // span propagation regresses — e.g. the resolver starts using + // the surrounding SELECT node's span instead of the wildcard + // token's — this test will fail with a concrete diff. + let result = + extract_column_operations(&GenericDialect {}, "SELECT * FROM t1", None).unwrap(); + let ops = result[0].as_ref().unwrap(); + let wildcard = ops + .diagnostics + .iter() + .find(|d| matches!(d.kind, ColumnLevelDiagnosticKind::WildcardSuppressed)) + .expect("WildcardSuppressed not found"); + assert!( + wildcard.message.contains("at L1:"), + "message should embed source location, got: {}", + wildcard.message + ); + let span = wildcard.span.expect("wildcard token carries a span"); + assert_eq!(span.start.line, 1, "wildcard line"); + assert_eq!(span.start.column, 8, "wildcard column"); + } + + #[test] + fn unresolved_column_diagnostic_carries_precise_span() { + // The catalog is needed to fire UnresolvedColumn — without it + // the resolver stays silent (Unknown schemas could contain + // anything). With the catalog, `missing` is unambiguously + // not a column of t1. + // + // `missing` starts at column 8 in `SELECT missing FROM t1`. + // Pinning down the column here is the regression net for span + // plumbing through the resolver's catalog-aware path — + // separate from the wildcard path, which goes through + // projection.rs. 
+ #[derive(Debug, Default)] + struct C(HashMap>); + impl Catalog for C { + fn columns(&self, table: &TableReference) -> Option> { + self.0.get(table.name.value.as_str()).map(|cols| { + cols.iter() + .map(|c| ColumnSchema { + name: c.to_string(), + }) + .collect() + }) } } + let mut catalog = C::default(); + catalog.0.insert("t1".to_string(), vec!["a", "b"]); - #[test] - fn test_normalize_with_options() { - let sql = "SELECT a FROM t1 WHERE b = 1 AND c in (2, 3, 4); INSERT INTO t2 (a, b, c) VALUES (1, 2, 3), (4, 5, 6)"; - for dialect in all_dialects() { - let result = sql_insight::normalize_with_options( - dialect.as_ref(), - sql, - NormalizerOptions::new() - .with_unify_in_list(true) - .with_unify_values(true), - ) + let result = + extract_column_operations(&GenericDialect {}, "SELECT missing FROM t1", Some(&catalog)) .unwrap(); + let ops = result[0].as_ref().unwrap(); + let unresolved = ops + .diagnostics + .iter() + .find(|d| matches!(d.kind, ColumnLevelDiagnosticKind::UnresolvedColumn)) + .expect("UnresolvedColumn not found"); + let span = unresolved.span.expect("ident token carries a span"); + assert_eq!(span.start.line, 1); + assert_eq!(span.start.column, 8); + } +} + +/// Cross-cutting properties that should hold for every parseable SQL +/// statement, regardless of shape. These are the safety net for +/// future resolver / extractor changes: a hand-written corpus walks +/// through both extractors and each statement is checked against a +/// handful of structural invariants. +/// +/// On failure the assertion panics with the SQL + statement index + +/// which invariant tripped, so a single regression points straight at +/// what changed. +mod invariants { + use super::*; + use sql_insight::{ColumnLineageEdge, ColumnOperation, ColumnReference, TableOperation}; + use std::collections::HashSet; + + /// Curated corpus chosen to stress the major shapes the resolver + /// handles. 
New patterns should be added here as the resolver + /// grows, not as one-off tests scattered across the codebase. + fn corpus() -> &'static [&'static str] { + &[ + // SELECT shapes + "SELECT a FROM t1", + "SELECT t1.a, t2.b FROM t1 JOIN t2 ON t1.id = t2.id", + "SELECT a FROM t1 WHERE b > 0 GROUP BY a HAVING COUNT(*) > 1", + "SELECT a FROM t1 ORDER BY b", + "SELECT SUM(x) OVER (PARTITION BY p ORDER BY o) AS total FROM t1", + "SELECT CASE WHEN a > 0 THEN b ELSE c END FROM t1", + // CTE / derived / subquery + "WITH cte AS (SELECT id FROM t1) SELECT id FROM cte", + "SELECT x FROM (SELECT a + 1 AS x FROM t1) sub", + "SELECT a FROM t1 WHERE id IN (SELECT id FROM t2)", + // Set operations + "SELECT a FROM t1 UNION SELECT b FROM t2", + "SELECT a FROM t1 INTERSECT SELECT b FROM t2", + // DML + "INSERT INTO t1 (a, b) VALUES (1, 2)", + "INSERT INTO t1 (a, b) SELECT x, y FROM s", + "UPDATE t1 SET a = b + 1 WHERE id = 5", + "UPDATE t1 SET a = (SELECT max(x) FROM s) WHERE id = 5", + "DELETE FROM t1 WHERE id = 5", + // DDL with body + "CREATE TABLE dst AS SELECT a, b FROM src", + "CREATE VIEW v AS SELECT a AS x FROM t1", + // MERGE + "MERGE INTO t1 USING t2 ON t1.id = t2.id \ + WHEN MATCHED THEN UPDATE SET a = t2.a \ + WHEN NOT MATCHED THEN INSERT (id, a) VALUES (t2.id, t2.a)", + ] + } + + /// Collected pair of outputs for the same statement — both + /// extractors run in lockstep so per-statement invariants can be + /// checked side by side. 
+ struct StatementPair { + col: ColumnOperation, + tab: TableOperation, + } + + fn extract_paired(sql: &str) -> Vec { + let col = extract_column_operations(&GenericDialect {}, sql, None).unwrap(); + let tab = extract_table_operations(&GenericDialect {}, sql, None).unwrap(); + assert_eq!( + col.len(), + tab.len(), + "statement count mismatch between column_op and table_op for SQL: {sql}" + ); + col.into_iter() + .zip(tab) + .map(|(c, t)| StatementPair { + col: c.expect("column_op extraction succeeded"), + tab: t.expect("table_op extraction succeeded"), + }) + .collect() + } + + fn table_set( + items: I, + mut key: impl FnMut(&T) -> Option, + ) -> HashSet + where + I: IntoIterator, + { + items.into_iter().filter_map(|i| key(&i)).collect() + } + + fn column_read_table(r: &ColumnReference) -> Option { + r.table.clone() + } + + fn column_write_table(w: &ColumnReference) -> Option { + w.table.clone() + } + + fn edge_relation_table(f: &ColumnLineageEdge) -> Option { + match &f.target { + ColumnTarget::Relation(c) => c.table.clone(), + ColumnTarget::QueryOutput { .. } => None, + } + } + + #[test] + fn statement_kind_agrees_between_extractors() { + for sql in corpus() { + for (idx, pair) in extract_paired(sql).into_iter().enumerate() { assert_eq!( - result, - [ - "SELECT a FROM t1 WHERE b = ? AND c IN (...)", - "INSERT INTO t2 (a, b, c) VALUES (...)" - ], - "Failed for dialect: {dialect:?}" - ) + pair.col.statement_kind, pair.tab.statement_kind, + "column_op vs table_op kind disagrees \ + for statement {idx} of SQL: {sql}" + ); + } + } + } + + #[test] + fn column_op_read_tables_appear_in_table_op_reads_or_writes() { + // Column-level reads include refs from the RHS of UPDATE SET, + // the predicate of DELETE WHERE, etc. — even when those refs + // point at the statement's *target* table. table_op's UPDATE + // / DELETE conventions surface the target in `writes` only + // (unless the statement also has a separate read source like + // `DELETE ... USING t2` or `UPDATE ... 
FROM t2`). The + // invariant relaxes accordingly: column_op read tables must + // be in the union of table_op reads + writes. + for sql in corpus() { + for (idx, pair) in extract_paired(sql).into_iter().enumerate() { + let table_op_reads: HashSet<_> = + table_set(pair.tab.reads.clone(), |r| Some(r.clone())); + let table_op_writes: HashSet<_> = + table_set(pair.tab.writes.clone(), |w| Some(w.clone())); + let known: HashSet<_> = table_op_reads.union(&table_op_writes).cloned().collect(); + let column_op_read_tables = table_set(pair.col.reads.clone(), column_read_table); + for t in &column_op_read_tables { + assert!( + known.contains(t), + "column_op read table {t:?} missing from table_op reads ∪ writes \ + for statement {idx} of SQL: {sql}\n\ + table_op reads: {table_op_reads:?}\n\ + table_op writes: {table_op_writes:?}" + ); + } } } } - mod extract_crud_tables { - use super::*; + #[test] + fn column_op_write_tables_appear_in_table_op_writes() { + for sql in corpus() { + for (idx, pair) in extract_paired(sql).into_iter().enumerate() { + let table_op_writes = table_set(pair.tab.writes.clone(), |w| Some(w.clone())); + let column_op_write_tables = table_set(pair.col.writes.clone(), column_write_table); + for t in &column_op_write_tables { + assert!( + table_op_writes.contains(t), + "column_op write table {t:?} missing from table_op writes \ + for statement {idx} of SQL: {sql}\n\ + table_op writes: {table_op_writes:?}" + ); + } + } + } + } - #[test] - fn test_extract_crud_tables() { - let sql = "SELECT a FROM t1 WHERE b = 1 AND c in (2, 3) AND d LIKE '%foo'; SELECT b FROM t2 WHERE c = 4"; - for dialect in all_dialects() { - let result = sql_insight::extract_crud_tables(dialect.as_ref(), sql).unwrap(); - assert_eq!( - result, - vec![ - Ok(CrudTables { - create_tables: vec![], - read_tables: vec![TableReference { - catalog: None, - schema: None, - name: "t1".into(), - alias: None, - }], - update_tables: vec![], - delete_tables: vec![], - }), - Ok(CrudTables { - 
create_tables: vec![], - read_tables: vec![TableReference { - catalog: None, - schema: None, - name: "t2".into(), - alias: None, - }], - update_tables: vec![], - delete_tables: vec![], - }), - ], - "Failed for dialect: {dialect:?}" - ) + #[test] + fn relation_lineage_targets_resolve_to_known_write_tables() { + for sql in corpus() { + for (idx, pair) in extract_paired(sql).into_iter().enumerate() { + let table_op_writes = table_set(pair.tab.writes.clone(), |w| Some(w.clone())); + for f in &pair.col.lineage { + if let Some(target_table) = edge_relation_table(f) { + assert!( + table_op_writes.contains(&target_table), + "Relation lineage target {target_table:?} not in table_op writes \ + for statement {idx} of SQL: {sql}\n\ + table_op writes: {table_op_writes:?}" + ); + } + } } } } - mod extract_tables { - use super::*; + #[test] + fn select_statements_emit_no_writes() { + for sql in corpus() { + for (idx, pair) in extract_paired(sql).into_iter().enumerate() { + if pair.col.statement_kind == StatementKind::Select { + assert!( + pair.col.writes.is_empty(), + "SELECT statement has non-empty column_op writes \ + for statement {idx} of SQL: {sql}\n\ + writes: {:?}", + pair.col.writes + ); + assert!( + pair.tab.writes.is_empty(), + "SELECT statement has non-empty table_op writes \ + for statement {idx} of SQL: {sql}\n\ + writes: {:?}", + pair.tab.writes + ); + } + } + } + } - #[test] - fn test_extract_tables() { - let sql = "SELECT a FROM t1 WHERE b = 1 AND c in (2, 3) AND d LIKE '%foo'; SELECT b FROM t2 WHERE c = 4"; - for dialect in all_dialects() { - let result = sql_insight::extract_tables(dialect.as_ref(), sql).unwrap(); - assert_eq!( - result, - vec![ - Ok(Tables(vec![TableReference { - catalog: None, - schema: None, - name: "t1".into(), - alias: None, - }])), - Ok(Tables(vec![TableReference { - catalog: None, - schema: None, - name: "t2".into(), - alias: None, - }])), - ], - "Failed for dialect: {dialect:?}" - ) + #[test] + fn writing_statements_emit_writes() { + for 
sql in corpus() { + for (idx, pair) in extract_paired(sql).into_iter().enumerate() { + let writes_expected = matches!( + pair.col.statement_kind, + StatementKind::Insert + | StatementKind::Update + | StatementKind::CreateTable + | StatementKind::CreateView + | StatementKind::Merge + ); + if writes_expected { + assert!( + !pair.tab.writes.is_empty(), + "writing statement has empty table_op writes \ + for statement {idx} of SQL: {sql}" + ); + } } } }