From b697aa29d6000c3c80c3b788cbb97ecf003e0672 Mon Sep 17 00:00:00 2001 From: Dan Tasse Date: Fri, 26 Jun 2026 11:58:05 -0400 Subject: [PATCH] fix: don't quote column names in list_indices --- python/src/indices.rs | 2 +- rust/lance-core/src/datatypes/schema.rs | 110 ++++++++++++++++++++++++ rust/lance-namespace-impls/src/dir.rs | 2 +- 3 files changed, 112 insertions(+), 2 deletions(-) diff --git a/python/src/indices.rs b/python/src/indices.rs index 7ce7a297924..aff74e153bd 100644 --- a/python/src/indices.rs +++ b/python/src/indices.rs @@ -701,7 +701,7 @@ impl PyIndexDescription { .map(|field| { dataset .schema() - .field_path(*field as i32) + .field_path_minimal(*field as i32) .unwrap_or_else(|_| "".to_string()) }) .collect(); diff --git a/rust/lance-core/src/datatypes/schema.rs b/rust/lance-core/src/datatypes/schema.rs index d13eb476359..b4f8baa7da8 100644 --- a/rust/lance-core/src/datatypes/schema.rs +++ b/rust/lance-core/src/datatypes/schema.rs @@ -766,6 +766,9 @@ impl Schema { /// Returns the properly formatted path from root to the field. /// Field names containing dots are quoted (e.g., struct.`field.with.dot`) + /// + /// The result is suitable for SQL parsing. For a human-readable path + /// (e.g. for display in index metadata), use [`Self::field_path_minimal`]. pub fn field_path(&self, field_id: i32) -> Result { self.field_ancestry_by_id(field_id) .map(|ancestry| { @@ -777,6 +780,28 @@ impl Schema { }) } + /// Returns the path from root to the field using *minimal* quoting. + /// + /// A segment is wrapped in backticks only when it contains a character that + /// [`parse_field_path`] treats specially (a `.` separator or a `` ` `` quote); + /// any other character — including hyphens — is left bare. Unlike + /// [`Self::field_path`] (which quotes for SQL-expression safety and so wraps + /// e.g. `my-col` in backticks), the result here is both human-readable and + /// round-trips back through `parse_field_path`, so it is safe to feed into + /// field-path APIs such as `drop_columns` / `update_field_metadata`. + /// + /// This is what should be exposed as the column name in index metadata. + pub fn field_path_minimal(&self, field_id: i32) -> Result { + self.field_ancestry_by_id(field_id) + .map(|ancestry| { + let field_refs: Vec<&str> = ancestry.iter().map(|f| f.name.as_str()).collect(); + format_field_path_minimal(&field_refs) + }) + .ok_or_else(|| { + Error::index(format!("Could not find field ancestry for id {}", field_id)) + }) + } + pub fn verify_primary_key(&self) -> Result<()> { let pk = self.unenforced_primary_key(); for pk_col in pk.into_iter() { @@ -1600,6 +1625,33 @@ pub fn format_field_path(fields: &[&str]) -> String { .join(".") } +/// Like [`format_field_path`], but quotes a segment only when strictly required +/// for the result to round-trip back through [`parse_field_path`]. +/// +/// `parse_field_path` only treats `.` (segment separator) and `` ` `` (quote) +/// specially, so those are the only characters that force quoting here. Notably +/// a hyphen does NOT force quoting (`my-col` stays `my-col`), unlike +/// `format_field_path` which quotes any non-identifier character for +/// SQL-expression safety. Use this for human-readable, round-trippable paths +/// (e.g. column names in index metadata); use `format_field_path` when the +/// result will be embedded in a SQL expression. +pub fn format_field_path_minimal(fields: &[&str]) -> String { + fields + .iter() + .map(|field| { + let needs_quoting = field.contains('.') || field.contains('`'); + if needs_quoting { + // Escape embedded backticks by doubling them, matching parse_field_path. + let escaped = field.replace('`', "``"); + format!("`{}`", escaped) + } else { + field.to_string() + } + }) + .collect::>() + .join(".") +} + /// Escape a field path for project /// /// Parses the field path and formats it for SQL usage. @@ -2852,6 +2904,64 @@ mod tests { assert!(paths.contains(&"name".to_string())); } + #[test] + fn test_field_path_minimal() { + // A struct child whose own NAME contains a dot is the case that makes + // "just strip all backticks" wrong: it must stay quoted to round-trip. + let arrow_schema = ArrowSchema::new(vec![ + ArrowField::new("mycol", ArrowDataType::Int32, false), + ArrowField::new("my_col", ArrowDataType::Int32, false), + ArrowField::new("my-col", ArrowDataType::Int32, false), + ArrowField::new( + "parent", + ArrowDataType::Struct(ArrowFields::from(vec![ + ArrowField::new("child-field", ArrowDataType::Int32, true), + ArrowField::new("child.x", ArrowDataType::Int32, true), + ])), + true, + ), + ]); + let schema = Schema::try_from(&arrow_schema).unwrap(); + let id_of = |path: &str| schema.field(path).unwrap().id; + let child_id = |name: &str| { + schema + .field("parent") + .unwrap() + .children + .iter() + .find(|c| c.name == name) + .unwrap() + .id + }; + + // Plain identifiers: unchanged by either method. + assert_eq!(schema.field_path_minimal(id_of("mycol")).unwrap(), "mycol"); + assert_eq!(schema.field_path_minimal(id_of("my_col")).unwrap(), "my_col"); + + // Hyphen is NOT special to parse_field_path, so minimal quoting leaves it + // bare (field_path would quote it for SQL safety). + assert_eq!(schema.field_path(id_of("my-col")).unwrap(), "`my-col`"); + assert_eq!( + schema.field_path_minimal(id_of("my-col")).unwrap(), + "my-col" + ); + + // Nested hyphenated leaf: bare under minimal quoting. + assert_eq!( + schema.field_path_minimal(child_id("child-field")).unwrap(), + "parent.child-field" + ); + + // Nested leaf whose NAME contains a dot: MUST stay quoted so it + // round-trips through parse_field_path (this is the regression guard). + let dotted = schema.field_path_minimal(child_id("child.x")).unwrap(); + assert_eq!(dotted, "parent.`child.x`"); + assert_eq!( + parse_field_path(&dotted).unwrap(), + vec!["parent".to_string(), "child.x".to_string()] + ); + } + #[test] fn test_validate_rejects_zero_dimension_fixed_size_list() { // A zero dimension divides-by-zero further down the write path (#5102) diff --git a/rust/lance-namespace-impls/src/dir.rs b/rust/lance-namespace-impls/src/dir.rs index e97c5c836b7..1464aa01156 100644 --- a/rust/lance-namespace-impls/src/dir.rs +++ b/rust/lance-namespace-impls/src/dir.rs @@ -3254,7 +3254,7 @@ impl LanceNamespace for DirectoryNamespace { .map(|field_id| { dataset .schema() - .field_path(i32::try_from(*field_id).map_err(|e| { + .field_path_minimal(i32::try_from(*field_id).map_err(|e| { lance_core::Error::from(NamespaceError::Internal { message: format!( "Field id {} does not fit in i32 for table '{}': {}",