From 6e42bf6205ae6ae2d8154c139ada8eaf1e359d35 Mon Sep 17 00:00:00 2001 From: yangjie01 Date: Fri, 22 May 2026 14:39:52 +0800 Subject: [PATCH] feat: add lance_dataset_drop_columns for metadata-only column removal MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit First of three PRs covering the schema-evolution roadmap entry. Exposes upstream's `drop_columns` — a metadata-only manifest commit that removes the named columns from the schema without rewriting data files. --- include/lance/lance.h | 31 +++++ include/lance/lance.hpp | 26 ++++ src/drop_columns.rs | 102 ++++++++++++++ src/lib.rs | 2 + tests/c_api_test.rs | 268 +++++++++++++++++++++++++++++++++++++ tests/cpp/test_c_api.c | 78 +++++++++++ tests/cpp/test_cpp_api.cpp | 58 ++++++++ 7 files changed, 565 insertions(+) create mode 100644 src/drop_columns.rs diff --git a/include/lance/lance.h b/include/lance/lance.h index e02c964..db6d557 100644 --- a/include/lance/lance.h +++ b/include/lance/lance.h @@ -453,6 +453,37 @@ int32_t lance_dataset_compact_files( LanceCompactionMetrics* out_metrics ); +/* ─── lance_dataset_drop_columns ──────────────────────────────────────────── */ + +/** + * Drop one or more columns from the dataset's schema, committing a new + * manifest. This is a metadata-only operation: the data files on storage + * are not rewritten until a later `lance_dataset_compact_files` call + * materializes the projection (after which the previous version's files + * can be removed by a future cleanup operation). + * + * Mutates `dataset` in place — the same handle remains valid afterward + * and sees the new version. Scanners already in flight against this + * dataset keep their pre-drop schema view. + * + * @param dataset Open dataset (not consumed). Mutated in place to + * see the new version. Must not be NULL. + * @param columns Array of NUL-terminated UTF-8 column names to drop. + * Must not be NULL; entries must be non-NULL and + * non-empty. + * @param num_columns Length of `columns`. Must be > 0. + * @return 0 on success, -1 on error. Error codes: + * LANCE_ERR_INVALID_ARGUMENT for NULL/empty inputs, NULL or empty + * entries, non-UTF-8 column names, unknown columns, or an attempt + * to drop every column; + * LANCE_ERR_COMMIT_CONFLICT for a concurrent writer. + */ +int32_t lance_dataset_drop_columns( + LanceDataset* dataset, + const char* const* columns, + size_t num_columns +); + /** * Export the dataset schema via Arrow C Data Interface. * @param out Pointer to caller-allocated ArrowSchema struct diff --git a/include/lance/lance.hpp b/include/lance/lance.hpp index 6a3cef0..eeb55d6 100644 --- a/include/lance/lance.hpp +++ b/include/lance/lance.hpp @@ -437,6 +437,32 @@ class Dataset { return metrics; } + /// Drop columns from the dataset's schema and commit a new manifest. + /// Metadata-only — data files remain until a later `compact_files()` + /// call rewrites them. Mutates this dataset in place; the handle + /// continues to point at the new version. + /// + /// `columns` must be non-empty. Throws lance::Error on failure (empty + /// list, unknown column, attempt to drop every column, commit + /// conflict, ...). + void drop_columns(const std::vector& columns) { + std::vector col_ptrs; + col_ptrs.reserve(columns.size()); + for (const auto& c : columns) { + col_ptrs.push_back(c.c_str()); + } + // Pass `col_ptrs.data()` unconditionally — matches the `update` + // and `merge_insert` siblings whose inputs are also required to + // be non-empty. The Rust layer rejects `num_columns == 0` before + // dereferencing the pointer, so an empty vector still surfaces + // INVALID_ARGUMENT with the precise "num_columns must be > 0" + // message rather than the misleading "columns must not be NULL". + if (lance_dataset_drop_columns( + handle_.get(), col_ptrs.data(), columns.size()) != 0) { + check_error(); + } + } + /// Export the schema as an Arrow C Data Interface struct. void schema(ArrowSchema* out) const { if (lance_dataset_schema(handle_.get(), out) != 0) { diff --git a/src/drop_columns.rs b/src/drop_columns.rs new file mode 100644 index 0000000..77a908d --- /dev/null +++ b/src/drop_columns.rs @@ -0,0 +1,102 @@ +// SPDX-License-Identifier: Apache-2.0 +// SPDX-FileCopyrightText: Copyright The Lance Authors + +//! Drop columns C API: remove columns from the dataset's schema, committing a +//! new manifest. Metadata-only — data files are not rewritten until a later +//! `lance_dataset_compact_files` call materializes the projection. +//! +//! Mutates the dataset in place under an exclusive write lock; existing +//! scanners that already cloned the inner Arc keep their pre-drop schema view. + +use std::ffi::c_char; + +use lance_core::Result; +use snafu::location; + +use crate::dataset::LanceDataset; +use crate::error::ffi_try; +use crate::helpers; +use crate::runtime::block_on; + +/// Drop one or more columns from the dataset's schema and commit a new +/// manifest. This is a metadata-only operation: data files remain on storage +/// until they are rewritten by `lance_dataset_compact_files` (and then +/// cleaned up by version cleanup). +/// +/// - `dataset`: Open dataset (mutated; same handle remains valid afterward). +/// Must not be NULL. +/// - `columns`: Pointer to an array of NUL-terminated C strings naming the +/// columns to drop. Must not be NULL. Entries must be non-NULL and +/// non-empty UTF-8. +/// - `num_columns`: Length of the `columns` array. Must be non-zero. +/// +/// Returns 0 on success, -1 on error. Error codes: +/// `LANCE_ERR_INVALID_ARGUMENT` for NULL/empty args, NULL or empty entries, +/// non-UTF-8 names, unknown columns, or an attempt to drop every column +/// (upstream rejects that since a Lance dataset must retain at least one +/// field). `LANCE_ERR_COMMIT_CONFLICT` for a concurrent writer. +#[unsafe(no_mangle)] +pub unsafe extern "C" fn lance_dataset_drop_columns( + dataset: *mut LanceDataset, + columns: *const *const c_char, + num_columns: usize, +) -> i32 { + ffi_try!( + unsafe { drop_columns_inner(dataset, columns, num_columns) }, + neg + ) +} + +unsafe fn drop_columns_inner( + dataset: *mut LanceDataset, + columns: *const *const c_char, + num_columns: usize, +) -> Result { + if dataset.is_null() { + return Err(lance_core::Error::InvalidInput { + source: "dataset must not be NULL".into(), + location: location!(), + }); + } + if columns.is_null() { + return Err(lance_core::Error::InvalidInput { + source: "columns must not be NULL".into(), + location: location!(), + }); + } + if num_columns == 0 { + return Err(lance_core::Error::InvalidInput { + source: "num_columns must be > 0".into(), + location: location!(), + }); + } + + // Materialize the column names up front so any per-index validation + // error fires before the dataset's write lock is taken — matches the + // pre-lock validation pattern used by `update.rs`. + let mut names: Vec = Vec::with_capacity(num_columns); + for i in 0..num_columns { + // SAFETY: `columns` is non-NULL (checked above) and the caller + // guarantees the array has at least `num_columns` entries. + let entry = unsafe { *columns.add(i) }; + // SAFETY: each entry is either NULL (rejected below) or a + // NUL-terminated C string the caller keeps alive for this call. + let name = unsafe { helpers::parse_c_string(entry)? } + .filter(|s| !s.is_empty()) + .ok_or_else(|| lance_core::Error::InvalidInput { + source: format!("columns[{i}] must not be NULL or empty").into(), + location: location!(), + })?; + names.push(name.to_string()); + } + + // SAFETY: `dataset` is non-NULL (checked above) and the caller guarantees + // it points to a live `LanceDataset`. `with_mut` takes an exclusive + // write lock on the inner `Arc` before yielding `&mut Dataset`, + // so a shared `&*dataset` borrow here is sound — interior mutability + // is the synchronization point. + let ds = unsafe { &*dataset }; + let names_refs: Vec<&str> = names.iter().map(String::as_str).collect(); + ds.with_mut(|d| block_on(d.drop_columns(&names_refs)))?; + Ok(0) +} diff --git a/src/lib.rs b/src/lib.rs index 666ec05..5c34477 100644 --- a/src/lib.rs +++ b/src/lib.rs @@ -20,6 +20,7 @@ mod batch; mod compact; mod dataset; mod delete; +mod drop_columns; mod error; mod fragment_writer; mod helpers; @@ -37,6 +38,7 @@ pub use batch::*; pub use compact::*; pub use dataset::*; pub use delete::*; +pub use drop_columns::*; pub use error::{ LanceErrorCode, lance_free_string, lance_last_error_code, lance_last_error_message, }; diff --git a/tests/c_api_test.rs b/tests/c_api_test.rs index d6c7b86..ebc8260 100644 --- a/tests/c_api_test.rs +++ b/tests/c_api_test.rs @@ -6268,3 +6268,271 @@ fn test_scanner_set_index_segments_null_safety() { unsafe { lance_scanner_close(scanner) }; unsafe { lance_dataset_close(ds) }; } + +// =========================================================================== +// lance_dataset_drop_columns +// =========================================================================== + +/// Return the dataset's column names by exporting its schema through the +/// Arrow C Data Interface. +fn schema_field_names(ds: *const LanceDataset) -> Vec { + debug_assert!(!ds.is_null(), "schema_field_names called with NULL dataset"); + let mut ffi_schema = FFI_ArrowSchema::empty(); + let rc = unsafe { lance_dataset_schema(ds, &mut ffi_schema) }; + assert_eq!(rc, 0, "lance_dataset_schema failed"); + let arrow_schema = + arrow_schema::Schema::try_from(&ffi_schema).expect("FFI_ArrowSchema -> Schema"); + arrow_schema + .fields() + .iter() + .map(|f| f.name().to_string()) + .collect() +} + +#[test] +fn test_drop_columns_single() { + let (_tmp, uri) = create_large_dataset(10); + let c_uri = c_str(&uri); + let ds = unsafe { lance_dataset_open(c_uri.as_ptr(), ptr::null(), 0) }; + assert!(!ds.is_null()); + + let col = c_str("value"); + let cols = [col.as_ptr()]; + let rc = unsafe { lance_dataset_drop_columns(ds, cols.as_ptr(), cols.len()) }; + assert_eq!(rc, 0); + + // Row count is unchanged — drop is metadata-only. + assert_eq!(unsafe { lance_dataset_count_rows(ds) }, 10); + // Schema reflects the dropped column. + let names = schema_field_names(ds); + assert_eq!(names, vec!["id".to_string(), "label".to_string()]); + + unsafe { lance_dataset_close(ds) }; +} + +#[test] +fn test_drop_columns_multiple() { + let (_tmp, uri) = create_large_dataset(5); + let c_uri = c_str(&uri); + let ds = unsafe { lance_dataset_open(c_uri.as_ptr(), ptr::null(), 0) }; + assert!(!ds.is_null()); + + let c_value = c_str("value"); + let c_label = c_str("label"); + let cols = [c_value.as_ptr(), c_label.as_ptr()]; + let rc = unsafe { lance_dataset_drop_columns(ds, cols.as_ptr(), cols.len()) }; + assert_eq!(rc, 0); + + let names = schema_field_names(ds); + assert_eq!(names, vec!["id".to_string()]); + + unsafe { lance_dataset_close(ds) }; +} + +#[test] +fn test_drop_columns_bumps_version() { + let (_tmp, uri) = create_large_dataset(3); + let c_uri = c_str(&uri); + let ds = unsafe { lance_dataset_open(c_uri.as_ptr(), ptr::null(), 0) }; + assert!(!ds.is_null()); + + let v_before = unsafe { lance_dataset_version(ds) }; + let col = c_str("label"); + let cols = [col.as_ptr()]; + let rc = unsafe { lance_dataset_drop_columns(ds, cols.as_ptr(), cols.len()) }; + assert_eq!(rc, 0); + let v_after = unsafe { lance_dataset_version(ds) }; + assert!( + v_after > v_before, + "version should increase: before={v_before}, after={v_after}" + ); + + unsafe { lance_dataset_close(ds) }; +} + +#[test] +fn test_drop_columns_preserves_remaining_data() { + let (_tmp, uri) = create_large_dataset(4); + let c_uri = c_str(&uri); + let ds = unsafe { lance_dataset_open(c_uri.as_ptr(), ptr::null(), 0) }; + assert!(!ds.is_null()); + + let col = c_str("value"); + let cols = [col.as_ptr()]; + let rc = unsafe { lance_dataset_drop_columns(ds, cols.as_ptr(), cols.len()) }; + assert_eq!(rc, 0); + + // Materialize the remaining columns to confirm row data is intact. + let indices: [u64; 4] = [0, 1, 2, 3]; + let mut stream = FFI_ArrowArrayStream::empty(); + let rc = unsafe { + lance_dataset_take( + ds, + indices.as_ptr(), + indices.len(), + ptr::null(), + &mut stream, + ) + }; + assert_eq!(rc, 0); + + let reader = unsafe { ArrowArrayStreamReader::from_raw(&mut stream) }.unwrap(); + let schema = reader.schema(); + let names: Vec<&str> = schema.fields().iter().map(|f| f.name().as_str()).collect(); + assert_eq!(names, vec!["id", "label"]); + + // Concatenate the resulting batches and assert the surviving column + // values are unchanged — a regression that zeroed surviving data + // would slip past a shape-only check. + let batches: Vec<_> = reader.into_iter().collect::, _>>().unwrap(); + let total: usize = batches.iter().map(|b| b.num_rows()).sum(); + assert_eq!(total, 4); + + let mut ids: Vec = Vec::with_capacity(4); + let mut labels: Vec = Vec::with_capacity(4); + for batch in &batches { + let id_col = batch + .column_by_name("id") + .unwrap() + .as_any() + .downcast_ref::() + .unwrap(); + let label_col = batch + .column_by_name("label") + .unwrap() + .as_any() + .downcast_ref::() + .unwrap(); + for i in 0..batch.num_rows() { + ids.push(id_col.value(i)); + labels.push(label_col.value(i).to_string()); + } + } + assert_eq!(ids, vec![0, 1, 2, 3]); + assert_eq!(labels, vec!["row_0", "row_1", "row_2", "row_3"]); + + unsafe { lance_dataset_close(ds) }; +} + +#[test] +fn test_drop_columns_unknown_column_rejected() { + let (_tmp, uri) = create_large_dataset(3); + let c_uri = c_str(&uri); + let ds = unsafe { lance_dataset_open(c_uri.as_ptr(), ptr::null(), 0) }; + assert!(!ds.is_null()); + + let col = c_str("no_such_column"); + let cols = [col.as_ptr()]; + let rc = unsafe { lance_dataset_drop_columns(ds, cols.as_ptr(), cols.len()) }; + assert_eq!(rc, -1); + // Upstream surfaces this as InvalidInput → InvalidArgument at the FFI + // boundary. The dataset is left untouched on the error path. + assert_eq!(lance_last_error_code(), LanceErrorCode::InvalidArgument); + let names = schema_field_names(ds); + assert_eq!(names, vec!["id", "value", "label"]); + + unsafe { lance_dataset_close(ds) }; +} + +#[test] +fn test_drop_columns_cannot_drop_all() { + let (_tmp, uri) = create_large_dataset(2); + let c_uri = c_str(&uri); + let ds = unsafe { lance_dataset_open(c_uri.as_ptr(), ptr::null(), 0) }; + assert!(!ds.is_null()); + + let c_id = c_str("id"); + let c_value = c_str("value"); + let c_label = c_str("label"); + let cols = [c_id.as_ptr(), c_value.as_ptr(), c_label.as_ptr()]; + let rc = unsafe { lance_dataset_drop_columns(ds, cols.as_ptr(), cols.len()) }; + assert_eq!(rc, -1); + assert_eq!(lance_last_error_code(), LanceErrorCode::InvalidArgument); + // Schema is unchanged. + let names = schema_field_names(ds); + assert_eq!(names, vec!["id", "value", "label"]); + + unsafe { lance_dataset_close(ds) }; +} + +#[test] +fn test_drop_columns_null_dataset_rejected() { + let col = c_str("value"); + let cols = [col.as_ptr()]; + let rc = unsafe { lance_dataset_drop_columns(ptr::null_mut(), cols.as_ptr(), cols.len()) }; + assert_eq!(rc, -1); + assert_eq!(lance_last_error_code(), LanceErrorCode::InvalidArgument); +} + +#[test] +fn test_drop_columns_null_columns_rejected() { + let (_tmp, uri) = create_large_dataset(2); + let c_uri = c_str(&uri); + let ds = unsafe { lance_dataset_open(c_uri.as_ptr(), ptr::null(), 0) }; + assert!(!ds.is_null()); + + let rc = unsafe { lance_dataset_drop_columns(ds, ptr::null(), 1) }; + assert_eq!(rc, -1); + assert_eq!(lance_last_error_code(), LanceErrorCode::InvalidArgument); + assert_eq!(unsafe { lance_dataset_count_rows(ds) }, 2); + + unsafe { lance_dataset_close(ds) }; +} + +#[test] +fn test_drop_columns_zero_count_rejected() { + let (_tmp, uri) = create_large_dataset(2); + let c_uri = c_str(&uri); + let ds = unsafe { lance_dataset_open(c_uri.as_ptr(), ptr::null(), 0) }; + assert!(!ds.is_null()); + + let v_before = unsafe { lance_dataset_version(ds) }; + let dummy = c_str("value"); + let cols = [dummy.as_ptr()]; + let rc = unsafe { lance_dataset_drop_columns(ds, cols.as_ptr(), 0) }; + assert_eq!(rc, -1); + assert_eq!(lance_last_error_code(), LanceErrorCode::InvalidArgument); + // No version bump on the error path — compare against the pre-call value. + assert_eq!(unsafe { lance_dataset_version(ds) }, v_before); + + unsafe { lance_dataset_close(ds) }; +} + +#[test] +fn test_drop_columns_null_entry_rejected() { + let (_tmp, uri) = create_large_dataset(2); + let c_uri = c_str(&uri); + let ds = unsafe { lance_dataset_open(c_uri.as_ptr(), ptr::null(), 0) }; + assert!(!ds.is_null()); + + let c_value = c_str("value"); + let cols: [*const c_char; 2] = [c_value.as_ptr(), ptr::null()]; + let rc = unsafe { lance_dataset_drop_columns(ds, cols.as_ptr(), cols.len()) }; + assert_eq!(rc, -1); + assert_eq!(lance_last_error_code(), LanceErrorCode::InvalidArgument); + // Dataset is unchanged. + let names = schema_field_names(ds); + assert_eq!(names, vec!["id", "value", "label"]); + + unsafe { lance_dataset_close(ds) }; +} + +#[test] +fn test_drop_columns_empty_entry_rejected() { + let (_tmp, uri) = create_large_dataset(2); + let c_uri = c_str(&uri); + let ds = unsafe { lance_dataset_open(c_uri.as_ptr(), ptr::null(), 0) }; + assert!(!ds.is_null()); + + // Empty string is rejected by the `.filter(!empty)` guard at the FFI + // boundary, distinct from the NULL-entry path. + let empty = c_str(""); + let cols = [empty.as_ptr()]; + let rc = unsafe { lance_dataset_drop_columns(ds, cols.as_ptr(), cols.len()) }; + assert_eq!(rc, -1); + assert_eq!(lance_last_error_code(), LanceErrorCode::InvalidArgument); + let names = schema_field_names(ds); + assert_eq!(names, vec!["id", "value", "label"]); + + unsafe { lance_dataset_close(ds) }; +} diff --git a/tests/cpp/test_c_api.c b/tests/cpp/test_c_api.c index 89f64d5..4ff81c8 100644 --- a/tests/cpp/test_c_api.c +++ b/tests/cpp/test_c_api.c @@ -345,6 +345,83 @@ static void test_merge_insert(const char *write_uri) { printf("OK\n"); } +/* Re-opens the dataset just written by `test_dataset_write_roundtrip` and + * exercises `lance_dataset_drop_columns`. Drops the `name` column so the + * dataset is left with `id` only; subsequent tests (`compact_files`, + * `delete`) do not reference any dropped column. Must run after + * `test_update` / `test_merge_insert`, which both still need `name` / `id`. */ +static void test_drop_columns(const char *write_uri) { + printf(" test_drop_columns... "); + + LanceDataset *ds = lance_dataset_open(write_uri, NULL, 0); + ASSERT(ds != NULL, "open failed"); + uint64_t before_rows = lance_dataset_count_rows(ds); + CHECK_OK(); + ASSERT(before_rows > 0, "fixture expected to have rows"); + uint64_t v_before = lance_dataset_version(ds); + + /* Snapshot the schema so we can confirm the field count decreased + * (otherwise a bug that bumped the version without modifying the + * schema would pass silently). The ArrowSchema struct owns its + * children — release it before any potentially-aborting assert so + * we don't leak under sanitizer runs in CI. */ + struct ArrowSchema schema_before; + memset(&schema_before, 0, sizeof(schema_before)); + int32_t rc = lance_dataset_schema(ds, &schema_before); + ASSERT(rc == 0, "schema export failed"); + int64_t fields_before = schema_before.n_children; + if (schema_before.release) schema_before.release(&schema_before); + + const char *cols[] = {"name"}; + rc = lance_dataset_drop_columns(ds, cols, 1); + ASSERT(rc == 0, "drop_columns failed"); + uint64_t after_rows = lance_dataset_count_rows(ds); + CHECK_OK(); + ASSERT(after_rows == before_rows, + "metadata-only drop must not change row count"); + ASSERT(lance_dataset_version(ds) > v_before, + "drop_columns must bump the version"); + + /* Schema field count must have decreased by exactly 1. The C-test + * fixture has 2 columns (`id`, `name`) — assert `fields_after == 1` + * so this self-documents the post-drop expectation and trips if the + * fixture ever grows additional columns. Release the exported + * schema before any assertion so we never leak it on failure. */ + struct ArrowSchema schema_after; + memset(&schema_after, 0, sizeof(schema_after)); + rc = lance_dataset_schema(ds, &schema_after); + ASSERT(rc == 0, "schema export failed after drop"); + int64_t fields_after = schema_after.n_children; + if (schema_after.release) schema_after.release(&schema_after); + ASSERT(fields_after == fields_before - 1, + "schema field count must decrease by 1 after drop"); + ASSERT(fields_after == 1, + "C-test fixture should be left with `id` only after dropping `name`"); + + /* NULL `columns` and num_columns == 0 must both be rejected. */ + rc = lance_dataset_drop_columns(ds, NULL, 1); + ASSERT(rc == -1, "NULL columns must fail"); + ASSERT(lance_last_error_code() == LANCE_ERR_INVALID_ARGUMENT, + "expected INVALID_ARGUMENT"); + + rc = lance_dataset_drop_columns(ds, cols, 0); + ASSERT(rc == -1, "num_columns=0 must fail"); + ASSERT(lance_last_error_code() == LANCE_ERR_INVALID_ARGUMENT, + "expected INVALID_ARGUMENT"); + + /* Dropping the sole remaining column (`id`) must fail with + * INVALID_ARGUMENT — upstream refuses to leave a dataset with zero + * fields. */ + const char *last_col[] = {"id"}; + rc = lance_dataset_drop_columns(ds, last_col, 1); + ASSERT(rc == -1, "dropping last column must fail"); + ASSERT(lance_last_error_code() == LANCE_ERR_INVALID_ARGUMENT, + "expected INVALID_ARGUMENT"); + + lance_dataset_close(ds); + printf("OK\n"); +} + /* Re-opens the dataset just written by `test_dataset_write_roundtrip` and * exercises `lance_dataset_compact_files`. The smoke fixture is a single * fragment, so the default planner has nothing to compact — we expect @@ -423,6 +500,7 @@ int main(int argc, char **argv) { test_dataset_write_roundtrip(uri, write_uri); test_update(write_uri); test_merge_insert(write_uri); + test_drop_columns(write_uri); test_compact_files(write_uri); test_delete(write_uri); diff --git a/tests/cpp/test_cpp_api.cpp b/tests/cpp/test_cpp_api.cpp index 1a28446..ea1f693 100644 --- a/tests/cpp/test_cpp_api.cpp +++ b/tests/cpp/test_cpp_api.cpp @@ -397,6 +397,63 @@ static void test_merge_insert(const std::string& dst_uri) { PASS(); } +// Re-opens the dataset just written by `test_dataset_write_roundtrip` and +// exercises `Dataset::drop_columns`. Drops the `name` column so the dataset +// is left with `id` only; subsequent tests (`compact_files`, `delete_rows`) +// do not reference any dropped column. Must run after `test_update` / +// `test_merge_insert`. +static void test_drop_columns(const std::string& dst_uri) { + TEST(test_drop_columns); + + auto ds = lance::Dataset::open(dst_uri); + uint64_t before_rows = ds.count_rows(); + uint64_t v_before = ds.version(); + + ds.drop_columns({"name"}); + assert(ds.count_rows() == before_rows + && "metadata-only drop must preserve row count"); + assert(ds.version() > v_before + && "drop_columns must bump the version"); + uint64_t v_after_drop = ds.version(); + + // Dropping an unknown column must throw with INVALID_ARGUMENT and + // leave the dataset unchanged (no version bump on the error path). + bool caught_unknown = false; + try { + ds.drop_columns({"no_such_column"}); + } catch (const lance::Error& e) { + caught_unknown = true; + assert(e.code == LANCE_ERR_INVALID_ARGUMENT); + } + assert(caught_unknown); + assert(ds.version() == v_after_drop + && "failed drop must not bump the version"); + + // Empty column list must throw with INVALID_ARGUMENT. + bool caught_empty = false; + try { + ds.drop_columns({}); + } catch (const lance::Error& e) { + caught_empty = true; + assert(e.code == LANCE_ERR_INVALID_ARGUMENT); + } + assert(caught_empty); + + // Dropping the sole remaining column (`id`) must throw with + // INVALID_ARGUMENT — upstream refuses to leave a dataset with zero + // fields. + bool caught_last = false; + try { + ds.drop_columns({"id"}); + } catch (const lance::Error& e) { + caught_last = true; + assert(e.code == LANCE_ERR_INVALID_ARGUMENT); + } + assert(caught_last); + + PASS(); +} + // Re-opens the dataset just written by `test_dataset_write_roundtrip` and // exercises `Dataset::compact_files`. The smoke fixture is a single fragment // so the default planner has nothing to compact — we expect a no-op (zero @@ -468,6 +525,7 @@ int main(int argc, char** argv) { test_dataset_write_roundtrip(uri, write_uri); test_update(write_uri); test_merge_insert(write_uri); + test_drop_columns(write_uri); test_compact_files(write_uri); test_delete_rows(write_uri);