Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 1 addition & 1 deletion python/src/indices.rs
Original file line number Diff line number Diff line change
Expand Up @@ -701,7 +701,7 @@ impl PyIndexDescription {
.map(|field| {
dataset
.schema()
.field_path(*field as i32)
.field_path_minimal(*field as i32)
.unwrap_or_else(|_| "<unknown>".to_string())
})
.collect();
Expand Down
110 changes: 110 additions & 0 deletions rust/lance-core/src/datatypes/schema.rs
Original file line number Diff line number Diff line change
Expand Up @@ -766,6 +766,9 @@ impl Schema {

/// Returns the properly formatted path from root to the field.
/// Field names containing dots are quoted (e.g., struct.`field.with.dot`)
///
/// The result is suitable for SQL parsing. For a human-readable path
/// (e.g. for display in index metadata), use [`Self::field_path_minimal`].
pub fn field_path(&self, field_id: i32) -> Result<String> {
self.field_ancestry_by_id(field_id)
.map(|ancestry| {
Expand All @@ -777,6 +780,28 @@ impl Schema {
})
}

/// Returns the path from root to the field using *minimal* quoting.
///
/// A segment is wrapped in backticks only when it contains a character that
/// [`parse_field_path`] treats specially (a `.` separator or a `` ` `` quote);
/// any other character — including hyphens — is left bare. Unlike
/// [`Self::field_path`] (which quotes for SQL-expression safety and so wraps
/// e.g. `my-col` in backticks), the result here is both human-readable and
/// round-trips back through `parse_field_path`, so it is safe to feed into
/// field-path APIs such as `drop_columns` / `update_field_metadata`.
///
/// This is what should be exposed as the column name in index metadata.
pub fn field_path_minimal(&self, field_id: i32) -> Result<String> {
self.field_ancestry_by_id(field_id)
.map(|ancestry| {
let field_refs: Vec<&str> = ancestry.iter().map(|f| f.name.as_str()).collect();
format_field_path_minimal(&field_refs)
})
.ok_or_else(|| {
Error::index(format!("Could not find field ancestry for id {}", field_id))
})
}

pub fn verify_primary_key(&self) -> Result<()> {
let pk = self.unenforced_primary_key();
for pk_col in pk.into_iter() {
Expand Down Expand Up @@ -1600,6 +1625,33 @@ pub fn format_field_path(fields: &[&str]) -> String {
.join(".")
}

/// Like [`format_field_path`], but quotes a segment only when strictly required
/// for the result to round-trip back through [`parse_field_path`].
///
/// `parse_field_path` only treats `.` (segment separator) and `` ` `` (quote)
/// specially, so those are the only characters that force quoting here. Notably
/// a hyphen does NOT force quoting (`my-col` stays `my-col`), unlike
/// `format_field_path` which quotes any non-identifier character for
/// SQL-expression safety. Use this for human-readable, round-trippable paths
/// (e.g. column names in index metadata); use `format_field_path` when the
/// result will be embedded in a SQL expression.
pub fn format_field_path_minimal(fields: &[&str]) -> String {
fields
.iter()
.map(|field| {
let needs_quoting = field.contains('.') || field.contains('`');
if needs_quoting {
// Escape embedded backticks by doubling them, matching parse_field_path.
let escaped = field.replace('`', "``");
format!("`{}`", escaped)
} else {
field.to_string()
}
})
.collect::<Vec<_>>()
.join(".")
}

/// Escape a field path for project
///
/// Parses the field path and formats it for SQL usage.
Expand Down Expand Up @@ -2852,6 +2904,64 @@ mod tests {
assert!(paths.contains(&"name".to_string()));
}

#[test]
fn test_field_path_minimal() {
// A struct child whose own NAME contains a dot is the case that makes
// "just strip all backticks" wrong: it must stay quoted to round-trip.
let arrow_schema = ArrowSchema::new(vec![
ArrowField::new("mycol", ArrowDataType::Int32, false),
ArrowField::new("my_col", ArrowDataType::Int32, false),
ArrowField::new("my-col", ArrowDataType::Int32, false),
ArrowField::new(
"parent",
ArrowDataType::Struct(ArrowFields::from(vec![
ArrowField::new("child-field", ArrowDataType::Int32, true),
ArrowField::new("child.x", ArrowDataType::Int32, true),
])),
true,
),
]);
let schema = Schema::try_from(&arrow_schema).unwrap();
let id_of = |path: &str| schema.field(path).unwrap().id;
let child_id = |name: &str| {
schema
.field("parent")
.unwrap()
.children
.iter()
.find(|c| c.name == name)
.unwrap()
.id
};

// Plain identifiers: unchanged by either method.
assert_eq!(schema.field_path_minimal(id_of("mycol")).unwrap(), "mycol");
assert_eq!(schema.field_path_minimal(id_of("my_col")).unwrap(), "my_col");

// Hyphen is NOT special to parse_field_path, so minimal quoting leaves it
// bare (field_path would quote it for SQL safety).
assert_eq!(schema.field_path(id_of("my-col")).unwrap(), "`my-col`");
assert_eq!(
schema.field_path_minimal(id_of("my-col")).unwrap(),
"my-col"
);

// Nested hyphenated leaf: bare under minimal quoting.
assert_eq!(
schema.field_path_minimal(child_id("child-field")).unwrap(),
"parent.child-field"
);

// Nested leaf whose NAME contains a dot: MUST stay quoted so it
// round-trips through parse_field_path (this is the regression guard).
let dotted = schema.field_path_minimal(child_id("child.x")).unwrap();
assert_eq!(dotted, "parent.`child.x`");
assert_eq!(
parse_field_path(&dotted).unwrap(),
vec!["parent".to_string(), "child.x".to_string()]
);
}

#[test]
fn test_validate_rejects_zero_dimension_fixed_size_list() {
// A zero dimension divides-by-zero further down the write path (#5102)
Expand Down
2 changes: 1 addition & 1 deletion rust/lance-namespace-impls/src/dir.rs
Original file line number Diff line number Diff line change
Expand Up @@ -3540,7 +3540,7 @@ impl LanceNamespace for DirectoryNamespace {
.map(|field_id| {
dataset
.schema()
.field_path(i32::try_from(*field_id).map_err(|e| {
.field_path_minimal(i32::try_from(*field_id).map_err(|e| {
lance_core::Error::from(NamespaceError::Internal {
message: format!(
"Field id {} does not fit in i32 for table '{}': {}",
Expand Down
Loading