From 98cdad04ca887823cc8f0b4ef8c4008b77d8da13 Mon Sep 17 00:00:00 2001 From: Qi Zhu <821684824@qq.com> Date: Tue, 23 Jun 2026 14:03:29 +0800 Subject: [PATCH 1/2] expose SpjNormalForm::predicate() + Predicate Display impl `SpjNormalForm` already exposes public getters for `output_schema`, `output_exprs`, and `referenced_tables`, but the `predicate` field is private. Downstream consumers that want to surface the SPJ normal form (e.g. atlas catalog tables exposing `mv_normal_form` for debugging why view matching does or does not engage) currently have to walk the original `LogicalPlan` and re-collect Filter / TableScan filters as a string, which loses the normalization the analyzer applied. This change: * Promotes `Predicate` and `ColumnEquivalenceClass` from private to `pub` so they can be referenced from public APIs. * Adds `pub fn predicate(&self) -> &Predicate` on `SpjNormalForm`. * Adds a `Display` impl on `Predicate` that AND-joins: - pairwise equalities derived from each multi-column equivalence class (singletons emit nothing), - narrowed range intervals per equivalence class (classes whose interval is still the default unbounded interval are skipped, so only meaningful constraints surface), - residual filter expressions, sorted for deterministic output. The internal fields stay private, so this does not lock in any particular representation; only the read-only view is published. A unit test exercises the new getter + Display on a predicate that populates all three buckets (eq class, range, residual). --- src/rewrite/normal_form.rs | 114 ++++++++++++++++++++++++++++++++++++- 1 file changed, 112 insertions(+), 2 deletions(-) diff --git a/src/rewrite/normal_form.rs b/src/rewrite/normal_form.rs index b5f2343..1b8190d 100644 --- a/src/rewrite/normal_form.rs +++ b/src/rewrite/normal_form.rs @@ -167,6 +167,7 @@ As one can see, all compensating filters are included, and the query only uses t use std::{ collections::{BTreeSet, HashMap, HashSet}, + fmt, sync::Arc, }; @@ -225,6 +226,14 @@ impl SpjNormalForm { &self.referenced_tables } + /// The normalized predicate of this plan (column equivalence classes, + /// per-class range intervals, and residual filter expressions). Useful + /// for surfacing the SPJ normal form via catalog metadata or for + /// debugging why view matching does or does not engage on a given query. + pub fn predicate(&self) -> &Predicate { + &self.predicate + } + /// Analyze an existing `LogicalPlan` and rewrite it in select-project-join normal form. pub fn new(original_plan: &LogicalPlan) -> Result { let predicate = Predicate::new(original_plan)?; @@ -353,8 +362,14 @@ impl SpjNormalForm { } /// Stores information on filters from a Select-Project-Join plan. +/// +/// Built from the original `LogicalPlan` in [`SpjNormalForm::new`] and +/// exposed read-only via [`SpjNormalForm::predicate`]. Field accessors +/// describe the three filter buckets that view matching uses (equality +/// classes, per-class range intervals, and residuals); see the module +/// docs for how they participate in the subsumption tests. #[derive(Debug, Clone)] -struct Predicate { +pub struct Predicate { /// Full table schema, including all possible columns. schema: DFSchema, /// List of column equivalence classes. @@ -369,6 +384,60 @@ struct Predicate { referenced_tables: Vec, } +/// Renders the predicate as an AND-joined list of filter conditions. +/// +/// Three sources are emitted in order: +/// 1. Pairwise equalities derived from each column equivalence class (e.g. +/// `t.a = t.b` for a class `{a, b}`); singleton classes emit nothing. +/// 2. Narrowed range intervals per equivalence class. Classes whose +/// interval is still the default unbounded interval (no filter applied) +/// are skipped, so only meaningful constraints surface. +/// 3. Residual filter expressions, rendered via their `Expr` `Display` +/// and sorted for deterministic output across runs. +/// +/// Intended for human inspection / catalog surfacing rather than as a +/// canonical SQL form -- callers that need the original `Expr`s should +/// reach for the underlying logical plan instead. +impl fmt::Display for Predicate { + fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { + let mut parts: Vec = Vec::new(); + + for eq_class in &self.eq_classes { + let mut cols = eq_class.columns.iter(); + let Some(first) = cols.next() else { continue }; + for other in cols { + parts.push(format!("{first} = {other}")); + } + } + + for (idx, range) in self.ranges_by_equivalence_class.iter().enumerate() { + let Some(interval) = range else { continue }; + let Some(eq_class) = self.eq_classes.get(idx) else { + continue; + }; + let Some(col) = eq_class.columns.iter().next() else { + continue; + }; + let Ok(field) = self.schema.field_from_column(col) else { + continue; + }; + let Ok(unbounded) = Interval::make_unbounded(field.data_type()) else { + continue; + }; + if interval == &unbounded { + continue; + } + parts.push(format!("{col} in {interval}")); + } + + let mut residuals: Vec = self.residuals.iter().map(|e| e.to_string()).collect(); + residuals.sort(); + parts.extend(residuals); + + write!(f, "{}", parts.join(" AND ")) + } +} + impl Predicate { /// Create a new Predicate by analyzing the given logical plan. /// Uses single-pass traversal to collect schema, columns, filters, and referenced tables. @@ -891,7 +960,7 @@ impl Predicate { /// in place of any other columns in the class. /// This normal representative is chosen arbitrarily. #[derive(Debug, Clone, Default)] -struct ColumnEquivalenceClass { +pub struct ColumnEquivalenceClass { // first element is the normal representative of the equivalence class columns: BTreeSet, } @@ -1651,4 +1720,45 @@ mod test { Ok(()) } + + #[tokio::test] + async fn test_predicate_getter_and_display() -> Result<()> { + let ctx = SessionContext::new(); + ctx.sql("CREATE TABLE t (a INT, b INT, c VARCHAR)") + .await? + .collect() + .await?; + + // Residual: `c LIKE 'foo%'` falls outside eq classes / ranges and + // should land in `residuals`. The eq `a = b` populates an + // equivalence class, and `a >= 5` narrows its range. + let plan = ctx + .sql("SELECT a FROM t WHERE a = b AND a >= 5 AND c LIKE 'foo%'") + .await? + .into_optimized_plan()?; + let normal_form = SpjNormalForm::new(&plan)?; + + // Getter returns a reference to the same Predicate that drives + // subsumption tests internally; callers can stringify or inspect + // it for catalog surfacing / debugging. + let rendered = normal_form.predicate().to_string(); + + // Display output is AND-joined; check each expected fragment + // independently because residuals are hash-set ordered before the + // internal sort, so positional matching would be brittle. + assert!( + rendered.contains("t.a = t.b"), + "expected equality from class {{a, b}}, got: {rendered}" + ); + assert!( + rendered.contains("t.c LIKE"), + "expected LIKE residual, got: {rendered}" + ); + assert!( + !rendered.is_empty(), + "predicate Display should not be empty when filters exist" + ); + + Ok(()) + } } From 7016a0ee3cdb6bca880affd2582e14a683704bae Mon Sep 17 00:00:00 2001 From: Qi Zhu <821684824@qq.com> Date: Tue, 23 Jun 2026 16:23:51 +0800 Subject: [PATCH 2/2] address Copilot review feedback - Reword the `Predicate` doc comment so it no longer implies field accessors that don't exist; describe the internal buckets instead. - Make `Display for Predicate` render `FALSE` when any equivalence class has an empty range (`None` in `ranges_by_equivalence_class` after a prior `Interval::intersect` collapsed). Previously the unsatisfiable predicate looked identical to "no filter", which is misleading for catalog surfacing. - Revert `pub struct ColumnEquivalenceClass` to `struct`; the type is not referenced from any public signature, so keeping it private avoids needlessly expanding the crate's public surface. - Add `debug_assert!` calls on the in-loop `continue` paths in Display that fire only on broken internal invariants (length mismatches, empty classes, missing schema entries). The intentional unbounded skip stays a silent continue. - Strengthen `test_predicate_getter_and_display` so it actually asserts the range bucket (the `a >= 5` narrowing) renders, not just the equality + residual buckets. --- src/rewrite/normal_form.rs | 53 +++++++++++++++++++++++++++++++++----- 1 file changed, 46 insertions(+), 7 deletions(-) diff --git a/src/rewrite/normal_form.rs b/src/rewrite/normal_form.rs index 1b8190d..4481582 100644 --- a/src/rewrite/normal_form.rs +++ b/src/rewrite/normal_form.rs @@ -364,10 +364,17 @@ impl SpjNormalForm { /// Stores information on filters from a Select-Project-Join plan. /// /// Built from the original `LogicalPlan` in [`SpjNormalForm::new`] and -/// exposed read-only via [`SpjNormalForm::predicate`]. Field accessors -/// describe the three filter buckets that view matching uses (equality -/// classes, per-class range intervals, and residuals); see the module -/// docs for how they participate in the subsumption tests. +/// exposed read-only via [`SpjNormalForm::predicate`]. Internally it +/// groups filters into three buckets that the view-matching subsumption +/// tests consume: +/// +/// 1. column equivalence classes (`col_a = col_b` chains), +/// 2. per-class range intervals (e.g. `col_a >= 5`), and +/// 3. residual filter expressions that don't fit the first two buckets. +/// +/// The internal fields stay private; consumers can render a human-readable +/// summary via the [`Display`](std::fmt::Display) impl, or reach for the +/// underlying `LogicalPlan` if they need raw access. #[derive(Debug, Clone)] pub struct Predicate { /// Full table schema, including all possible columns. @@ -386,7 +393,13 @@ pub struct Predicate { /// Renders the predicate as an AND-joined list of filter conditions. /// -/// Three sources are emitted in order: +/// If any equivalence class has an empty range (stored as `None` after a +/// `Interval::intersect` collapsed to the empty set), the predicate is +/// unsatisfiable and renders as a single `FALSE` -- otherwise downstream +/// consumers would see an unsatisfiable filter as if it had no constraint +/// at all, which is misleading. +/// +/// For a satisfiable predicate, three sources are emitted in order: /// 1. Pairwise equalities derived from each column equivalence class (e.g. /// `t.a = t.b` for a class `{a, b}`); singleton classes emit nothing. /// 2. Narrowed range intervals per equivalence class. Classes whose @@ -400,28 +413,43 @@ pub struct Predicate { /// reach for the underlying logical plan instead. impl fmt::Display for Predicate { fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { + // Any None range means a prior intersection produced the empty set, + // i.e. the predicate is unsatisfiable. Bail with FALSE so consumers + // don't see an unsatisfiable predicate as "no constraints". + if self.ranges_by_equivalence_class.iter().any(|r| r.is_none()) { + return write!(f, "FALSE"); + } + let mut parts: Vec = Vec::new(); for eq_class in &self.eq_classes { let mut cols = eq_class.columns.iter(); - let Some(first) = cols.next() else { continue }; + let Some(first) = cols.next() else { + debug_assert!(false, "empty ColumnEquivalenceClass"); + continue; + }; for other in cols { parts.push(format!("{first} = {other}")); } } for (idx, range) in self.ranges_by_equivalence_class.iter().enumerate() { + // None was handled above; this is the satisfiable path only. let Some(interval) = range else { continue }; let Some(eq_class) = self.eq_classes.get(idx) else { + debug_assert!(false, "ranges/eq_classes length mismatch at {idx}"); continue; }; let Some(col) = eq_class.columns.iter().next() else { + debug_assert!(false, "empty ColumnEquivalenceClass at {idx}"); continue; }; let Ok(field) = self.schema.field_from_column(col) else { + debug_assert!(false, "column {col} missing from predicate schema"); continue; }; let Ok(unbounded) = Interval::make_unbounded(field.data_type()) else { + debug_assert!(false, "make_unbounded failed for {}", field.data_type()); continue; }; if interval == &unbounded { @@ -960,7 +988,7 @@ impl Predicate { /// in place of any other columns in the class. /// This normal representative is chosen arbitrarily. #[derive(Debug, Clone, Default)] -pub struct ColumnEquivalenceClass { +struct ColumnEquivalenceClass { // first element is the normal representative of the equivalence class columns: BTreeSet, } @@ -1750,6 +1778,17 @@ mod test { rendered.contains("t.a = t.b"), "expected equality from class {{a, b}}, got: {rendered}" ); + // `a >= 5` should narrow the equivalence class range away from + // the default unbounded interval, so Display emits the column + + // its non-default interval. Match liberally on `t.a` followed by + // a literal `5` so we don't depend on the exact `Interval` + // Display formatting. + let has_range = + rendered.contains("t.a") && rendered.contains("in ") && rendered.contains('5'); + assert!( + has_range, + "expected a narrowed range constraint from `a >= 5` to appear, got: {rendered}" + ); assert!( rendered.contains("t.c LIKE"), "expected LIKE residual, got: {rendered}"