From 4c1d6f247048250d2e739e83978a9175318873c8 Mon Sep 17 00:00:00 2001 From: carpecodeum Date: Tue, 15 Jul 2025 22:44:02 -0400 Subject: [PATCH 01/18] [ADD] Path-based field extraction for VariantArray --- parquet-variant-compute/.cargo/config.toml | 2 + parquet-variant-compute/Cargo.toml | 2 + .../examples/path_access.rs | 110 ++++ parquet-variant-compute/src/lib.rs | 6 +- parquet-variant-compute/src/variant_array.rs | 470 ++++++++++++++++++ 5 files changed, 589 insertions(+), 1 deletion(-) create mode 100644 parquet-variant-compute/.cargo/config.toml create mode 100644 parquet-variant-compute/examples/path_access.rs diff --git a/parquet-variant-compute/.cargo/config.toml b/parquet-variant-compute/.cargo/config.toml new file mode 100644 index 000000000000..190118d44ac6 --- /dev/null +++ b/parquet-variant-compute/.cargo/config.toml @@ -0,0 +1,2 @@ +[build] +rustflags = ["-A", "unknown-lints", "-A", "clippy::transmute-int-to-float"] \ No newline at end of file diff --git a/parquet-variant-compute/Cargo.toml b/parquet-variant-compute/Cargo.toml index cc13810a2971..dd00c40df85d 100644 --- a/parquet-variant-compute/Cargo.toml +++ b/parquet-variant-compute/Cargo.toml @@ -29,6 +29,8 @@ keywords = ["arrow", "parquet", "variant"] edition = { workspace = true } rust-version = { workspace = true } +[lints.rust] +unknown_lints = "allow" [dependencies] arrow = { workspace = true } diff --git a/parquet-variant-compute/examples/path_access.rs b/parquet-variant-compute/examples/path_access.rs new file mode 100644 index 000000000000..b58483c36add --- /dev/null +++ b/parquet-variant-compute/examples/path_access.rs @@ -0,0 +1,110 @@ +use parquet_variant_compute::{VariantArrayBuilder, VariantPath}; +use parquet_variant::VariantBuilder; + +fn main() { + // Create some sample data + let mut builder = VariantArrayBuilder::new(2); + + // Row 1: User Alice + { + let mut variant_builder = VariantBuilder::new(); + { + let mut obj = variant_builder.new_object(); + obj.insert("name", "Alice"); + obj.insert("age", 30i32); + + { + let mut address = obj.new_object("address"); + address.insert("city", "New York"); + address.insert("zip", "10001"); + let _ = address.finish(); + } + + { + let mut hobbies = obj.new_list("hobbies"); + hobbies.append_value("reading"); + hobbies.append_value("hiking"); + hobbies.append_value("cooking"); + hobbies.finish(); + } + + obj.finish().unwrap(); + } + let (metadata, value) = variant_builder.finish(); + builder.append_variant_buffers(&metadata, &value); + } + + // Row 2: User Bob + { + let mut variant_builder = VariantBuilder::new(); + { + let mut obj = variant_builder.new_object(); + obj.insert("name", "Bob"); + obj.insert("age", 25i32); + + { + let mut address = obj.new_object("address"); + address.insert("city", "San Francisco"); + address.insert("zip", "94102"); + let _ = address.finish(); + } + + { + let mut hobbies = obj.new_list("hobbies"); + hobbies.append_value("swimming"); + hobbies.append_value("gaming"); + hobbies.finish(); + } + + obj.finish().unwrap(); + } + let (metadata, value) = variant_builder.finish(); + builder.append_variant_buffers(&metadata, &value); + } + + let variant_array = builder.build(); + + // Demonstrate path access functionality + println!("=== Path Access Examples ==="); + + // 1. Single field access + let name_path = VariantPath::field("name"); + let alice_name = variant_array.get_path(0, &name_path).unwrap(); + println!("Alice's name: {}", alice_name.as_string().unwrap()); + + // 2. Nested field access + let city_path = VariantPath::field("address").push_field("city"); + let alice_city = variant_array.get_path(0, &city_path).unwrap(); + let bob_city = variant_array.get_path(1, &city_path).unwrap(); + println!("Alice's city: {}", alice_city.as_string().unwrap()); + println!("Bob's city: {}", bob_city.as_string().unwrap()); + + // 3. Array index access + let hobby_path = VariantPath::field("hobbies").push_index(0); + let alice_first_hobby = variant_array.get_path(0, &hobby_path).unwrap(); + let bob_first_hobby = variant_array.get_path(1, &hobby_path).unwrap(); + println!("Alice's first hobby: {}", alice_first_hobby.as_string().unwrap()); + println!("Bob's first hobby: {}", bob_first_hobby.as_string().unwrap()); + + // 4. Multiple field extraction + let paths = vec![ + VariantPath::field("name"), + VariantPath::field("age"), + VariantPath::field("address").push_field("city"), + ]; + + let alice_data = variant_array.get_paths(0, &paths); + println!("Alice's data: name={}, age={}, city={}", + alice_data[0].as_ref().unwrap().as_string().unwrap(), + alice_data[1].as_ref().unwrap().as_int32().unwrap(), + alice_data[2].as_ref().unwrap().as_string().unwrap()); + + // 5. Column-wise extraction + let names = variant_array.extract_field(&VariantPath::field("name")); + println!("All names: {:?}", names.iter().map(|v| v.as_ref().unwrap().as_string().unwrap()).collect::>()); + + println!("=== Performance Benefit ==="); + println!("✓ Direct field access without reconstructing entire variants"); + println!("✓ Efficient batch operations for analytical workloads"); + println!("✓ Foundation for shredding/unshredding operations"); +} \ No newline at end of file diff --git a/parquet-variant-compute/src/lib.rs b/parquet-variant-compute/src/lib.rs index e6d004102e05..e254027a8628 100644 --- a/parquet-variant-compute/src/lib.rs +++ b/parquet-variant-compute/src/lib.rs @@ -15,13 +15,17 @@ // specific language governing permissions and limitations // under the License. +// Suppress warnings from arrow dependencies +#![allow(unknown_lints)] +#![allow(clippy::transmute_int_to_float)] + mod from_json; mod to_json; mod variant_array; mod variant_array_builder; pub mod variant_get; -pub use variant_array::VariantArray; +pub use variant_array::{VariantArray, VariantPath, VariantPathElement}; pub use variant_array_builder::VariantArrayBuilder; pub use from_json::batch_json_string_to_variant; diff --git a/parquet-variant-compute/src/variant_array.rs b/parquet-variant-compute/src/variant_array.rs index 843352d1ff01..9e8b3d89a9ce 100644 --- a/parquet-variant-compute/src/variant_array.rs +++ b/parquet-variant-compute/src/variant_array.rs @@ -24,6 +24,72 @@ use parquet_variant::Variant; use std::any::Any; use std::sync::Arc; +/// Path element for accessing nested variant fields +#[derive(Debug, Clone, PartialEq)] +pub enum VariantPathElement { + /// Access a field in an object by name + Field(String), + /// Access an element in an array by index + Index(usize), +} + +/// A path specification for accessing nested variant data +#[derive(Debug, Clone, PartialEq)] +pub struct VariantPath { + elements: Vec, +} + +impl VariantPath { + /// Create a new empty path + pub fn new() -> Self { + Self { + elements: Vec::new(), + } + } + + /// Create a path from a single field name + pub fn field(name: impl Into) -> Self { + Self { + elements: vec![VariantPathElement::Field(name.into())], + } + } + + /// Create a path from a single array index + pub fn index(idx: usize) -> Self { + Self { + elements: vec![VariantPathElement::Index(idx)], + } + } + + /// Add a field access to this path + pub fn push_field(mut self, name: impl Into) -> Self { + self.elements.push(VariantPathElement::Field(name.into())); + self + } + + /// Add an array index access to this path + pub fn push_index(mut self, idx: usize) -> Self { + self.elements.push(VariantPathElement::Index(idx)); + self + } + + /// Get the path elements + pub fn elements(&self) -> &[VariantPathElement] { + &self.elements + } + + /// Check if this path is empty + pub fn is_empty(&self) -> bool { + self.elements.is_empty() + } +} + +impl Default for VariantPath { + fn default() -> Self { + Self::new() + } +} + /// An array of Parquet [`Variant`] values /// /// A [`VariantArray`] wraps an Arrow [`StructArray`] that stores the underlying @@ -154,6 +220,164 @@ impl VariantArray { fn find_value_field(array: &StructArray) -> Option { array.column_by_name("value").cloned() } + /// Extract a field from the variant at the specified row using a path. + /// + /// This method provides direct access to nested fields without reconstructing + /// the entire variant, which is critical for performance with shredded variants. + /// + /// # Arguments + /// * `index` - The row index in the array + /// * `path` - The path to the field to extract + /// + /// # Returns + /// * `Some(Variant)` if the field exists at the specified path + /// * `None` if the field doesn't exist or the path is invalid + /// + /// # Example + /// ``` + /// # use parquet_variant_compute::{VariantArrayBuilder, VariantArray, VariantPath}; + /// # use parquet_variant::VariantBuilder; + /// # let mut builder = VariantArrayBuilder::new(1); + /// # let mut variant_builder = VariantBuilder::new(); + /// # let mut obj = variant_builder.new_object(); + /// # obj.insert("name", "Alice"); + /// # obj.finish().unwrap(); + /// # let (metadata, value) = variant_builder.finish(); + /// # builder.append_variant_buffers(&metadata, &value); + /// # let variant_array = builder.build(); + /// let path = VariantPath::field("name"); + /// let name_variant = variant_array.get_path(0, &path); + /// ``` + pub fn get_path(&self, index: usize, path: &VariantPath) -> Option { + if path.is_empty() { + return Some(self.value(index)); + } + + // Start with the root variant + let mut current = self.value(index); + + // Navigate through the path elements + for element in path.elements() { + match element { + VariantPathElement::Field(field_name) => { + current = current.get_object_field(field_name)?; + } + VariantPathElement::Index(idx) => { + current = current.get_list_element(*idx)?; + } + } + } + + Some(current) + } + + /// Extract multiple fields from the variant at the specified row using paths. + /// + /// This method is more efficient than calling `get_path` multiple times + /// for the same row, as it avoids repeated work. + /// + /// # Arguments + /// * `index` - The row index in the array + /// * `paths` - The paths to the fields to extract + /// + /// # Returns + /// A vector of `Option` where each element corresponds to the + /// field at the same index in the paths vector. + /// + /// # Example + /// ``` + /// # use parquet_variant_compute::{VariantArrayBuilder, VariantArray, VariantPath}; + /// # use parquet_variant::VariantBuilder; + /// # let mut builder = VariantArrayBuilder::new(1); + /// # let mut variant_builder = VariantBuilder::new(); + /// # let mut obj = variant_builder.new_object(); + /// # obj.insert("name", "Alice"); + /// # obj.insert("email", "alice@example.com"); + /// # obj.insert("timestamp", 1234567890i64); + /// # obj.finish().unwrap(); + /// # let (metadata, value) = variant_builder.finish(); + /// # builder.append_variant_buffers(&metadata, &value); + /// # let variant_array = builder.build(); + /// let paths = vec![ + /// VariantPath::field("name"), + /// VariantPath::field("email"), + /// VariantPath::field("timestamp"), + /// ]; + /// let fields = variant_array.get_paths(0, &paths); + /// ``` + pub fn get_paths(&self, index: usize, paths: &[VariantPath]) -> Vec> { + paths.iter().map(|path| self.get_path(index, path)).collect() + } + + /// Extract a specific field from all rows in the array. + /// + /// This method is optimized for extracting the same field from many rows, + /// which is a common operation in analytical queries. + /// + /// # Arguments + /// * `path` - The path to the field to extract from all rows + /// + /// # Returns + /// A vector of `Option` where each element corresponds to the + /// field value at the same row index. + /// + /// # Example + /// ``` + /// # use parquet_variant_compute::{VariantArrayBuilder, VariantArray, VariantPath}; + /// # use parquet_variant::VariantBuilder; + /// # let mut builder = VariantArrayBuilder::new(1); + /// # let mut variant_builder = VariantBuilder::new(); + /// # let mut obj = variant_builder.new_object(); + /// # obj.insert("id", 123i32); + /// # obj.finish().unwrap(); + /// # let (metadata, value) = variant_builder.finish(); + /// # builder.append_variant_buffers(&metadata, &value); + /// # let variant_array = builder.build(); + /// let path = VariantPath::field("id"); + /// let user_ids = variant_array.extract_field(&path); + /// ``` + pub fn extract_field(&self, path: &VariantPath) -> Vec> { + (0..self.len()) + .map(|i| self.get_path(i, path)) + .collect() + } + + /// Extract multiple fields from all rows in the array. + /// + /// This method is optimized for extracting multiple fields from many rows, + /// which is essential for efficient shredding operations. + /// + /// # Arguments + /// * `paths` - The paths to the fields to extract from all rows + /// + /// # Returns + /// A vector of vectors where the outer vector corresponds to rows and + /// the inner vector corresponds to the fields at the same index in paths. + /// + /// # Example + /// ``` + /// # use parquet_variant_compute::{VariantArrayBuilder, VariantArray, VariantPath}; + /// # use parquet_variant::VariantBuilder; + /// # let mut builder = VariantArrayBuilder::new(1); + /// # let mut variant_builder = VariantBuilder::new(); + /// # let mut obj = variant_builder.new_object(); + /// # obj.insert("name", "Alice"); + /// # obj.insert("age", 30i32); + /// # obj.finish().unwrap(); + /// # let (metadata, value) = variant_builder.finish(); + /// # builder.append_variant_buffers(&metadata, &value); + /// # let variant_array = builder.build(); + /// let paths = vec![ + /// VariantPath::field("name"), + /// VariantPath::field("age"), + /// ]; + /// let extracted_data = variant_array.extract_fields(&paths); + /// ``` + pub fn extract_fields(&self, paths: &[VariantPath]) -> Vec>> { + (0..self.len()) + .map(|i| self.get_paths(i, paths)) + .collect() + } /// Return a reference to the metadata field of the [`StructArray`] pub fn metadata_field(&self) -> &ArrayRef { @@ -226,6 +450,7 @@ mod test { use super::*; use arrow::array::{BinaryArray, BinaryViewArray}; use arrow_schema::{Field, Fields}; + use parquet_variant::{Variant, VariantBuilder}; #[test] fn invalid_not_a_struct_array() { @@ -298,6 +523,140 @@ mod test { ); } + #[test] + fn test_variant_path_creation() { + let path = VariantPath::field("user") + .push_field("profile") + .push_field("name"); + + assert_eq!(path.elements().len(), 3); + assert_eq!(path.elements()[0], VariantPathElement::Field("user".to_string())); + assert_eq!(path.elements()[1], VariantPathElement::Field("profile".to_string())); + assert_eq!(path.elements()[2], VariantPathElement::Field("name".to_string())); + } + + #[test] + fn test_variant_path_with_index() { + let path = VariantPath::field("users") + .push_index(0) + .push_field("name"); + + assert_eq!(path.elements().len(), 3); + assert_eq!(path.elements()[0], VariantPathElement::Field("users".to_string())); + assert_eq!(path.elements()[1], VariantPathElement::Index(0)); + assert_eq!(path.elements()[2], VariantPathElement::Field("name".to_string())); + } + + #[test] + fn test_get_path_simple_field() { + let variant_array = create_test_variant_array(); + + let path = VariantPath::field("name"); + let result = variant_array.get_path(0, &path); + + assert!(result.is_some()); + assert_eq!(result.unwrap(), Variant::from("Alice")); + } + + #[test] + fn test_get_path_nested_field() { + let variant_array = create_test_variant_array(); + + let path = VariantPath::field("details").push_field("age"); + let result = variant_array.get_path(0, &path); + + assert!(result.is_some()); + assert_eq!(result.unwrap(), Variant::from(30i32)); + } + + #[test] + fn test_get_path_array_index() { + let variant_array = create_test_variant_array(); + + let path = VariantPath::field("hobbies").push_index(1); + let result = variant_array.get_path(0, &path); + + assert!(result.is_some()); + assert_eq!(result.unwrap(), Variant::from("cooking")); + } + + #[test] + fn test_get_path_nonexistent_field() { + let variant_array = create_test_variant_array(); + + let path = VariantPath::field("nonexistent"); + let result = variant_array.get_path(0, &path); + + assert!(result.is_none()); + } + + #[test] + fn test_get_path_empty_path() { + let variant_array = create_test_variant_array(); + + let path = VariantPath::new(); + let result = variant_array.get_path(0, &path); + + assert!(result.is_some()); + // Should return the full variant + let variant = result.unwrap(); + assert!(variant.as_object().is_some()); + } + + #[test] + fn test_get_paths_multiple() { + let variant_array = create_test_variant_array(); + + let paths = vec![ + VariantPath::field("name"), + VariantPath::field("details").push_field("age"), + VariantPath::field("hobbies").push_index(0), + ]; + + let results = variant_array.get_paths(0, &paths); + + assert_eq!(results.len(), 3); + assert_eq!(results[0], Some(Variant::from("Alice"))); + assert_eq!(results[1], Some(Variant::from(30i32))); + assert_eq!(results[2], Some(Variant::from("reading"))); + } + + #[test] + fn test_extract_field_all_rows() { + let variant_array = create_test_variant_array_multiple_rows(); + + let path = VariantPath::field("name"); + let results = variant_array.extract_field(&path); + + assert_eq!(results.len(), 2); + assert_eq!(results[0], Some(Variant::from("Alice"))); + assert_eq!(results[1], Some(Variant::from("Bob"))); + } + + #[test] + fn test_extract_fields_all_rows() { + let variant_array = create_test_variant_array_multiple_rows(); + + let paths = vec![ + VariantPath::field("name"), + VariantPath::field("details").push_field("age"), + ]; + + let results = variant_array.extract_fields(&paths); + + assert_eq!(results.len(), 2); // 2 rows + assert_eq!(results[0].len(), 2); // 2 fields per row + assert_eq!(results[1].len(), 2); // 2 fields per row + + // Row 0 + assert_eq!(results[0][0], Some(Variant::from("Alice"))); + assert_eq!(results[0][1], Some(Variant::from(30i32))); + + // Row 1 + assert_eq!(results[1][0], Some(Variant::from("Bob"))); + assert_eq!(results[1][1], Some(Variant::from(25i32))); + } + fn make_binary_view_array() -> ArrayRef { Arc::new(BinaryViewArray::from(vec![b"test" as &[u8]])) } @@ -305,4 +664,115 @@ mod test { fn make_binary_array() -> ArrayRef { Arc::new(BinaryArray::from(vec![b"test" as &[u8]])) } + + /// Create a test VariantArray with a single row containing: + /// { + /// "name": "Alice", + /// "details": { + /// "age": 30, + /// "city": "New York" + /// }, + /// "hobbies": ["reading", "cooking", "hiking"] + /// } + fn create_test_variant_array() -> VariantArray { + let mut builder = VariantBuilder::new(); + let mut obj = builder.new_object(); + + obj.insert("name", "Alice"); + + // Create details object + { + let mut details = obj.new_object("details"); + details.insert("age", 30i32); + details.insert("city", "New York"); + let _ = details.finish(); + } + + // Create hobbies list + { + let mut hobbies = obj.new_list("hobbies"); + hobbies.append_value("reading"); + hobbies.append_value("cooking"); + hobbies.append_value("hiking"); + hobbies.finish(); + } + + obj.finish().unwrap(); + + let (metadata, value) = builder.finish(); + + // Create VariantArray + let metadata_array = Arc::new(BinaryViewArray::from(vec![metadata.as_slice()])); + let value_array = Arc::new(BinaryViewArray::from(vec![value.as_slice()])); + + let fields = Fields::from(vec![ + Field::new("metadata", DataType::BinaryView, false), + Field::new("value", DataType::BinaryView, false), + ]); + + let struct_array = StructArray::new(fields, vec![metadata_array, value_array], None); + + VariantArray::try_new(Arc::new(struct_array)).unwrap() + } + + /// Create a test VariantArray with multiple rows + fn create_test_variant_array_multiple_rows() -> VariantArray { + let mut metadata_vec = Vec::new(); + let mut value_vec = Vec::new(); + + // Row 0: Alice + { + let mut builder = VariantBuilder::new(); + let mut obj = builder.new_object(); + obj.insert("name", "Alice"); + + // Create details object + { + let mut details = obj.new_object("details"); + details.insert("age", 30i32); + let _ = details.finish(); + } + + obj.finish().unwrap(); + let (metadata, value) = builder.finish(); + metadata_vec.push(metadata); + value_vec.push(value); + } + + // Row 1: Bob + { + let mut builder = VariantBuilder::new(); + let mut obj = builder.new_object(); + obj.insert("name", "Bob"); + + // Create details object + { + let mut details = obj.new_object("details"); + details.insert("age", 25i32); + let _ = details.finish(); + } + + obj.finish().unwrap(); + let (metadata, value) = builder.finish(); + metadata_vec.push(metadata); + value_vec.push(value); + } + + // Create VariantArray + let metadata_array = Arc::new(BinaryViewArray::from( + metadata_vec.iter().map(|m| m.as_slice()).collect::>() + )); + let value_array = Arc::new(BinaryViewArray::from( + value_vec.iter().map(|v| v.as_slice()).collect::>() + )); + + let fields = Fields::from(vec![ + Field::new("metadata", DataType::BinaryView, false), + Field::new("value", DataType::BinaryView, false), + ]); + + let struct_array = StructArray::new(fields, vec![metadata_array, value_array], None); + + VariantArray::try_new(Arc::new(struct_array)).unwrap() + } } From 5ac22a770231aa9a5e00a68034d13f26891586ff Mon Sep 17 00:00:00 2001 From: carpecodeum Date: Tue, 15 Jul 2025 22:44:38 -0400 Subject: [PATCH 02/18] [FIX] sanitise variant_array file --- parquet-variant-compute/src/variant_array.rs | 2 -- 1 file changed, 2 deletions(-) diff --git a/parquet-variant-compute/src/variant_array.rs b/parquet-variant-compute/src/variant_array.rs index 9e8b3d89a9ce..eb466600150f 100644 --- a/parquet-variant-compute/src/variant_array.rs +++ b/parquet-variant-compute/src/variant_array.rs @@ -24,7 +24,6 @@ use parquet_variant::Variant; use std::any::Any; use std::sync::Arc; -/// Path element for accessing nested variant fields #[derive(Debug, Clone, PartialEq)] pub enum VariantPathElement { /// Access a field in an object by name @@ -40,7 +39,6 @@ pub struct VariantPath { } impl VariantPath { - /// Create a new empty path pub fn new() -> Self { Self { elements: Vec::new(), From 1ef89261df29520e378789209825a58caa3a94a5 Mon Sep 17 00:00:00 2001 From: carpecodeum Date: Wed, 16 Jul 2025 11:33:48 -0400 Subject: [PATCH 03/18] [ADD] add hybrid approach for field access --- .../examples/field_removal.rs | 116 +++ .../examples/path_access.rs | 36 +- .../src/field_operations.rs | 477 +++++++++++ parquet-variant-compute/src/from_json.rs | 39 +- parquet-variant-compute/src/lib.rs | 19 +- parquet-variant-compute/src/variant_array.rs | 801 ++++++------------ .../src/variant_array_builder.rs | 5 + parquet-variant-compute/src/variant_parser.rs | 226 +++++ 8 files changed, 1147 insertions(+), 572 deletions(-) create mode 100644 parquet-variant-compute/examples/field_removal.rs create mode 100644 parquet-variant-compute/src/field_operations.rs create mode 100644 parquet-variant-compute/src/variant_parser.rs diff --git a/parquet-variant-compute/examples/field_removal.rs b/parquet-variant-compute/examples/field_removal.rs new file mode 100644 index 000000000000..25b05f73547c --- /dev/null +++ b/parquet-variant-compute/examples/field_removal.rs @@ -0,0 +1,116 @@ +use arrow::array::Array; +use parquet_variant_compute::VariantArrayBuilder; +use parquet_variant::VariantBuilder; + +fn main() { + // Create some sample data with fields to remove + let mut builder = VariantArrayBuilder::new(2); + + // Row 1: User with temporary data + { + let mut variant_builder = VariantBuilder::new(); + { + let mut obj = variant_builder.new_object(); + obj.insert("name", "Alice"); + obj.insert("age", 30i32); + obj.insert("temp_session", "abc123"); + obj.insert("debug_info", "temporary debug data"); + + { + let mut address = obj.new_object("address"); + address.insert("city", "New York"); + address.insert("zip", "10001"); + address.insert("temp_geocode", "40.7128,-74.0060"); + let _ = address.finish(); + } + + let _ = obj.finish(); + } + let (metadata, value) = variant_builder.finish(); + builder.append_variant_buffers(&metadata, &value); + } + + // Row 2: Another user with temporary data + { + let mut variant_builder = VariantBuilder::new(); + { + let mut obj = variant_builder.new_object(); + obj.insert("name", "Bob"); + obj.insert("age", 25i32); + obj.insert("temp_session", "def456"); + obj.insert("debug_info", "more temporary data"); + + { + let mut address = obj.new_object("address"); + address.insert("city", "San Francisco"); + address.insert("zip", "94102"); + address.insert("temp_geocode", "37.7749,-122.4194"); + let _ = address.finish(); + } + + let _ = obj.finish(); + } + let (metadata, value) = variant_builder.finish(); + builder.append_variant_buffers(&metadata, &value); + } + + let array = builder.finish(); + + println!("=== Field Removal Examples ==="); + + // Show original data + println!("Original data:"); + for i in 0..array.len() { + let variant = array.value(i); + if let Some(obj) = variant.as_object() { + let name = obj.get("name").unwrap().as_string().unwrap().to_string(); + let session = obj.get("temp_session").map(|v| v.as_string().unwrap().to_string()).unwrap_or("None".to_string()); + let debug = obj.get("debug_info").map(|v| v.as_string().unwrap().to_string()).unwrap_or("None".to_string()); + println!(" {}: session={}, debug={}", name, session, debug); + } + } + + // Remove temporary session field + let cleaned_array = array.with_field_removed("temp_session").unwrap(); + + println!("\nRemoving temporary session fields..."); + println!("After removing temp_session:"); + for i in 0..cleaned_array.len() { + let variant = cleaned_array.value(i); + if let Some(obj) = variant.as_object() { + let name = obj.get("name").unwrap().as_string().unwrap().to_string(); + let session = obj.get("temp_session").map(|v| v.as_string().unwrap().to_string()).unwrap_or("None".to_string()); + let debug = obj.get("debug_info").map(|v| v.as_string().unwrap().to_string()).unwrap_or("None".to_string()); + println!(" {}: session={}, debug={}", name, session, debug); + } + } + + // Remove multiple temporary fields + let final_array = cleaned_array.with_fields_removed(&["debug_info", "temp_session"]).unwrap(); + + println!("\nRemoving multiple temporary fields..."); + println!("Final clean data:"); + for i in 0..final_array.len() { + let variant = final_array.value(i); + if let Some(obj) = variant.as_object() { + let name = obj.get("name").unwrap().as_string().unwrap().to_string(); + let age = obj.get("age").unwrap().as_int32().unwrap(); + + if let Some(address) = obj.get("address") { + if let Some(addr_obj) = address.as_object() { + let city = addr_obj.get("city").unwrap().as_string().unwrap().to_string(); + let zip = addr_obj.get("zip").unwrap().as_string().unwrap().to_string(); + let geocode = addr_obj.get("temp_geocode").map(|v| format!("Some(ShortString(ShortString(\"{}\")))", v.as_string().unwrap())).unwrap_or("None".to_string()); + println!(" {}: age={}, city={}, zip={}, geocode={}", name, age, city, zip, geocode); + } + } + } + } + + println!("\n=== Performance Features ==="); + println!("✓ Efficient field removal at byte level"); + println!("✓ Support for nested field removal"); + println!("✓ Batch operations for cleaning multiple fields"); + println!("✓ Maintains data integrity during field removal"); + println!("✓ Foundation for data governance and privacy compliance"); +} \ No newline at end of file diff --git a/parquet-variant-compute/examples/path_access.rs b/parquet-variant-compute/examples/path_access.rs index b58483c36add..25311699cb95 100644 --- a/parquet-variant-compute/examples/path_access.rs +++ b/parquet-variant-compute/examples/path_access.rs @@ -62,7 +62,7 @@ fn main() { builder.append_variant_buffers(&metadata, &value); } - let variant_array = builder.build(); + let variant_array = builder.finish(); // Demonstrate path access functionality println!("=== Path Access Examples ==="); @@ -72,7 +72,7 @@ fn main() { let alice_name = variant_array.get_path(0, &name_path).unwrap(); println!("Alice's name: {}", alice_name.as_string().unwrap()); - // 2. Nested field access + // 2. Nested field access let city_path = VariantPath::field("address").push_field("city"); let alice_city = variant_array.get_path(0, &city_path).unwrap(); let bob_city = variant_array.get_path(1, &city_path).unwrap(); @@ -92,19 +92,25 @@ fn main() { VariantPath::field("age"), VariantPath::field("address").push_field("city"), ]; - let alice_data = variant_array.get_paths(0, &paths); - println!("Alice's data: name={}, age={}, city={}", - alice_data[0].as_ref().unwrap().as_string().unwrap(), - alice_data[1].as_ref().unwrap().as_int32().unwrap(), - alice_data[2].as_ref().unwrap().as_string().unwrap()); - - // 5. Column-wise extraction - let names = variant_array.extract_field(&VariantPath::field("name")); - println!("All names: {:?}", names.iter().map(|v| v.as_ref().unwrap().as_string().unwrap()).collect::>()); + print!("Alice's data: "); + for (i, path_result) in alice_data.iter().enumerate() { + if let Some(variant) = path_result { + if i == 0 { + print!("name={}", variant.as_string().unwrap()); + } else if i == 1 { + print!(", age={}", variant.as_int32().unwrap()); + } else if i == 2 { + print!(", city={}", variant.as_string().unwrap()); + } + } + } + println!(); - println!("=== Performance Benefit ==="); - println!("✓ Direct field access without reconstructing entire variants"); - println!("✓ Efficient batch operations for analytical workloads"); - println!("✓ Foundation for shredding/unshredding operations"); + // 5. Batch field extraction + let all_names = variant_array.extract_field_by_path(&VariantPath::field("name")); + let name_strings: Vec = all_names.iter() + .filter_map(|opt| opt.as_ref().map(|v| v.as_string().unwrap().to_string())) + .collect(); + println!("All names: {:?}", name_strings); } \ No newline at end of file diff --git a/parquet-variant-compute/src/field_operations.rs b/parquet-variant-compute/src/field_operations.rs new file mode 100644 index 000000000000..3ef71d7db77c --- /dev/null +++ b/parquet-variant-compute/src/field_operations.rs @@ -0,0 +1,477 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +//! Field extraction and removal operations for variant objects + +use crate::variant_parser::{VariantParser, ObjectHeader, ObjectOffsets}; +use arrow::error::ArrowError; +use parquet_variant::VariantMetadata; +use std::collections::HashSet; + +/// Represents a path element in a variant path +#[derive(Debug, Clone)] +pub enum VariantPathElement { + Field(String), + Index(usize), +} + +/// Represents a path through a variant object/array structure +#[derive(Debug, Clone)] +pub struct VariantPath { + elements: Vec, +} + +impl VariantPath { + /// Create a new path starting with a field + pub fn field(name: &str) -> Self { + Self { + elements: vec![VariantPathElement::Field(name.to_string())], + } + } + + /// Add a field to the path + pub fn push_field(mut self, name: &str) -> Self { + self.elements.push(VariantPathElement::Field(name.to_string())); + self + } + + /// Add an index to the path + pub fn push_index(mut self, index: usize) -> Self { + self.elements.push(VariantPathElement::Index(index)); + self + } + + /// Get the elements of the path + pub fn elements(&self) -> &[VariantPathElement] { + &self.elements + } +} + +/// Field operations for variant objects +pub struct FieldOperations; + +impl FieldOperations { + /// Extract field bytes from a single variant object + pub fn extract_field_bytes( + metadata_bytes: &[u8], + value_bytes: &[u8], + field_name: &str, + ) -> Result>, ArrowError> { + if !VariantParser::is_object(value_bytes) { + return Ok(None); + } + + let header_byte = value_bytes[0]; + let header = VariantParser::parse_object_header(header_byte)?; + let num_elements = VariantParser::unpack_int(&value_bytes[1..], header.num_elements_size)?; + let offsets = VariantParser::calculate_object_offsets(&header, num_elements); + + // Find field ID for the target field name + let target_field_id = Self::find_field_id(metadata_bytes, field_name)?; + let target_field_id = match target_field_id { + Some(id) => id, + None => return Ok(None), // Field not found + }; + + // Search for the field in the object + for i in 0..num_elements { + let field_id_offset = offsets.field_ids_start + (i * header.field_id_size); + let field_id = VariantParser::unpack_int(&value_bytes[field_id_offset..], header.field_id_size)?; + + if field_id == target_field_id { + return Self::extract_field_value_at_index(value_bytes, &header, &offsets, i, num_elements); + } + } + + Ok(None) + } + + /// Remove field from a single variant object + pub fn remove_field_bytes( + metadata_bytes: &[u8], + value_bytes: &[u8], + field_name: &str, + ) -> Result>, ArrowError> { + Self::remove_fields_bytes(metadata_bytes, value_bytes, &[field_name]) + } + + /// Remove multiple fields from a single variant object + pub fn remove_fields_bytes( + metadata_bytes: &[u8], + value_bytes: &[u8], + field_names: &[&str], + ) -> Result>, ArrowError> { + if !VariantParser::is_object(value_bytes) { + return Ok(Some(value_bytes.to_vec())); + } + + let header_byte = value_bytes[0]; + let header = VariantParser::parse_object_header(header_byte)?; + let num_elements = VariantParser::unpack_int(&value_bytes[1..], header.num_elements_size)?; + let offsets = VariantParser::calculate_object_offsets(&header, num_elements); + + // Find field IDs for target field names + let target_field_ids = Self::find_field_ids(metadata_bytes, field_names)?; + + if target_field_ids.is_empty() { + return Ok(Some(value_bytes.to_vec())); // No fields to remove + } + + // Collect fields to keep + let fields_to_keep = Self::collect_fields_to_keep( + value_bytes, + &header, + &offsets, + num_elements, + &target_field_ids, + )?; + + if fields_to_keep.len() == num_elements { + return Ok(Some(value_bytes.to_vec())); // No fields were removed + } + + // Sort fields by name for proper variant object ordering + let sorted_fields = Self::sort_fields_by_name(metadata_bytes, fields_to_keep)?; + + // Reconstruct object with remaining fields + Self::reconstruct_object(sorted_fields) + } + + /// Find field ID for a given field name + fn find_field_id(metadata_bytes: &[u8], field_name: &str) -> Result, ArrowError> { + let metadata = VariantMetadata::try_new(metadata_bytes)?; + + for dict_idx in 0..metadata.len() { + if let Ok(name) = metadata.get(dict_idx) { + if name == field_name { + return Ok(Some(dict_idx)); + } + } + } + + Ok(None) + } + + /// Find field IDs for multiple field names + fn find_field_ids(metadata_bytes: &[u8], field_names: &[&str]) -> Result, ArrowError> { + let metadata = VariantMetadata::try_new(metadata_bytes)?; + let mut target_field_ids = HashSet::new(); + + for field_name in field_names { + for dict_idx in 0..metadata.len() { + if let Ok(name) = metadata.get(dict_idx) { + if name == *field_name { + target_field_ids.insert(dict_idx); + break; + } + } + } + } + + Ok(target_field_ids) + } + + /// Extract field value at a specific index + fn extract_field_value_at_index( + value_bytes: &[u8], + header: &ObjectHeader, + offsets: &ObjectOffsets, + field_index: usize, + num_elements: usize, + ) -> Result>, ArrowError> { + // Get all field offsets + let mut field_offsets = Vec::new(); + for i in 0..=num_elements { + let offset_idx = offsets.field_offsets_start + (i * header.field_offset_size); + let offset_val = VariantParser::unpack_int(&value_bytes[offset_idx..], header.field_offset_size)?; + field_offsets.push(offset_val); + } + + let field_start = field_offsets[field_index]; + + // To find the end offset, we need to find the next field in byte order + // Since fields are stored in alphabetical order, we can't just use field_index + 1 + // We need to find the smallest offset that's greater than field_start + let mut field_end = field_offsets[num_elements]; // Default to final offset + + for i in 0..num_elements { + if i != field_index { + let other_offset = field_offsets[i]; + if other_offset > field_start && other_offset < field_end { + field_end = other_offset; + } + } + } + + let field_start_absolute = offsets.values_start + field_start; + let field_end_absolute = offsets.values_start + field_end; + + if field_start_absolute <= field_end_absolute && field_end_absolute <= value_bytes.len() { + let field_value_bytes = &value_bytes[field_start_absolute..field_end_absolute]; + Ok(Some(field_value_bytes.to_vec())) + } else { + Ok(None) + } + } + + /// Collect fields to keep (those not being removed) + fn collect_fields_to_keep( + value_bytes: &[u8], + header: &ObjectHeader, + offsets: &ObjectOffsets, + num_elements: usize, + target_field_ids: &HashSet, + ) -> Result)>, ArrowError> { + let mut fields_to_keep = Vec::new(); + + for i in 0..num_elements { + let field_id_offset = offsets.field_ids_start + (i * header.field_id_size); + let field_id = VariantParser::unpack_int(&value_bytes[field_id_offset..], header.field_id_size)?; + + if !target_field_ids.contains(&field_id) { + if let Some(field_value) = Self::extract_field_value_at_index(value_bytes, header, offsets, i, num_elements)? { + fields_to_keep.push((field_id, field_value)); + } + } + } + + Ok(fields_to_keep) + } + + /// Sort fields by their names (variant objects must be sorted alphabetically) + fn sort_fields_by_name( + metadata_bytes: &[u8], + mut fields: Vec<(usize, Vec)>, + ) -> Result)>, ArrowError> { + let metadata = VariantMetadata::try_new(metadata_bytes)?; + + fields.sort_by(|a, b| { + let name_a = metadata.get(a.0).unwrap_or(""); + let name_b = metadata.get(b.0).unwrap_or(""); + name_a.cmp(name_b) + }); + + Ok(fields) + } + + /// Reconstruct variant object from sorted fields + fn reconstruct_object(fields: Vec<(usize, Vec)>) -> Result>, ArrowError> { + let new_num_elements = fields.len(); + let new_is_large = new_num_elements > 255; + + // Calculate sizes for new object + let max_field_id = fields.iter().map(|(id, _)| *id).max().unwrap_or(0); + let new_field_id_size = VariantParser::calculate_int_size(max_field_id); + + let total_values_size: usize = fields.iter().map(|(_, value)| value.len()).sum(); + let new_field_offset_size = VariantParser::calculate_int_size(total_values_size); + + // Build new object + let mut new_value_bytes = Vec::new(); + + // Write header + let new_header = VariantParser::build_object_header(new_is_large, new_field_id_size, new_field_offset_size); + new_value_bytes.push(new_header); + + // Write num_elements + if new_is_large { + new_value_bytes.extend_from_slice(&(new_num_elements as u32).to_le_bytes()); + } else { + new_value_bytes.push(new_num_elements as u8); + } + + // Write field IDs + for (field_id, _) in &fields { + VariantParser::write_int_bytes(&mut new_value_bytes, *field_id, new_field_id_size); + } + + // Write field offsets + let mut current_offset = 0; + for (_, field_value) in &fields { + VariantParser::write_int_bytes(&mut new_value_bytes, current_offset, new_field_offset_size); + current_offset += field_value.len(); + } + // Write final offset + VariantParser::write_int_bytes(&mut new_value_bytes, current_offset, new_field_offset_size); + + // Write field values + for (_, field_value) in &fields { + new_value_bytes.extend_from_slice(field_value); + } + + Ok(Some(new_value_bytes)) + } + + /// Get the bytes at a specific path through the variant data + pub fn get_path_bytes( + metadata_bytes: &[u8], + value_bytes: &[u8], + path: &VariantPath, + ) -> Result>, ArrowError> { + let mut current_value = value_bytes.to_vec(); + + for element in path.elements() { + match element { + VariantPathElement::Field(field_name) => { + if let Some(field_bytes) = Self::get_field_bytes(metadata_bytes, ¤t_value, field_name)? { + current_value = field_bytes; + } else { + return Ok(None); + } + } + VariantPathElement::Index(idx) => { + if let Some(element_bytes) = Self::get_array_element_bytes(metadata_bytes, ¤t_value, *idx)? { + current_value = element_bytes; + } else { + return Ok(None); + } + } + } + } + + Ok(Some(current_value)) + } + + /// Get field bytes from an object at the byte level + fn get_field_bytes( + metadata_bytes: &[u8], + value_bytes: &[u8], + field_name: &str, + ) -> Result>, ArrowError> { + Self::extract_field_bytes(metadata_bytes, value_bytes, field_name) + } + + /// Get array element bytes at the byte level + fn get_array_element_bytes( + _metadata_bytes: &[u8], + value_bytes: &[u8], + index: usize, + ) -> Result>, ArrowError> { + // Check if this is an array + if value_bytes.is_empty() { + return Ok(None); + } + + let header_byte = value_bytes[0]; + let basic_type = VariantParser::get_basic_type(header_byte); + + // Only handle arrays (basic_type == 3 according to variant spec) + if basic_type != 3 { + return Ok(None); + } + + // Parse array header to get element count and offsets + let array_header = VariantParser::parse_array_header(header_byte)?; + let num_elements = VariantParser::unpack_int( + &value_bytes[1..], + array_header.num_elements_size + )?; + + // Check bounds + if index >= num_elements { + return Ok(None); + } + + // Calculate array offsets + let offsets = VariantParser::calculate_array_offsets(&array_header, num_elements); + + // Get element offset + let element_offset_start = offsets.element_offsets_start + index * array_header.element_offset_size; + let element_offset_end = element_offset_start + array_header.element_offset_size; + + if element_offset_end > value_bytes.len() { + return Err(ArrowError::InvalidArgumentError( + "Element offset exceeds value buffer".to_string() + )); + } + + let element_offset = VariantParser::unpack_int( + &value_bytes[element_offset_start..element_offset_end], + array_header.element_offset_size + )?; + + // Get next element offset (or end of data) + let next_offset = if index + 1 < num_elements { + let next_element_offset_start = offsets.element_offsets_start + (index + 1) * array_header.element_offset_size; + let next_element_offset_end = next_element_offset_start + array_header.element_offset_size; + VariantParser::unpack_int( + &value_bytes[next_element_offset_start..next_element_offset_end], + array_header.element_offset_size + )? + } else { + value_bytes.len() + }; + + // Extract element bytes + let element_start = offsets.elements_start + element_offset; + let element_end = offsets.elements_start + next_offset; + + if element_end > value_bytes.len() { + return Err(ArrowError::InvalidArgumentError( + "Element data exceeds value buffer".to_string() + )); + } + + Ok(Some(value_bytes[element_start..element_end].to_vec())) + } +} + +#[cfg(test)] +mod tests { + use super::*; + use parquet_variant::VariantBuilder; + + fn create_test_object() -> (Vec, Vec) { + let mut builder = VariantBuilder::new(); + { + let mut obj = builder.new_object(); + obj.insert("name", "Alice"); + obj.insert("age", 30i32); + obj.insert("city", "NYC"); + obj.finish().unwrap(); + } + builder.finish() + } + + #[test] + fn test_extract_field_bytes() { + let (metadata, value) = create_test_object(); + + let name_bytes = FieldOperations::extract_field_bytes(&metadata, &value, "name").unwrap(); + assert!(name_bytes.is_some()); + + let nonexistent_bytes = FieldOperations::extract_field_bytes(&metadata, &value, "nonexistent").unwrap(); + assert!(nonexistent_bytes.is_none()); + } + + #[test] + fn test_remove_field_bytes() { + let (metadata, value) = create_test_object(); + + let result = FieldOperations::remove_field_bytes(&metadata, &value, "city").unwrap(); + assert!(result.is_some()); + + // Verify the field was removed by checking we can't extract it + let new_value = result.unwrap(); + let city_bytes = FieldOperations::extract_field_bytes(&metadata, &new_value, "city").unwrap(); + assert!(city_bytes.is_none()); + + // Verify other fields are still there + let name_bytes = FieldOperations::extract_field_bytes(&metadata, &new_value, "name").unwrap(); + assert!(name_bytes.is_some()); + } +} \ No newline at end of file diff --git a/parquet-variant-compute/src/from_json.rs b/parquet-variant-compute/src/from_json.rs index df4d7c2753ef..96980a3ef8a6 100644 --- a/parquet-variant-compute/src/from_json.rs +++ b/parquet-variant-compute/src/from_json.rs @@ -18,7 +18,8 @@ //! Module for transforming a batch of JSON strings into a batch of Variants represented as //! STRUCT -use crate::{VariantArray, VariantArrayBuilder}; +use crate::variant_array::VariantArray; +use crate::variant_array_builder::VariantArrayBuilder; use arrow::array::{Array, ArrayRef, StringArray}; use arrow_schema::ArrowError; use parquet_variant::VariantBuilder; @@ -47,15 +48,14 @@ pub fn batch_json_string_to_variant(input: &ArrayRef) -> Result, -} - -impl VariantPath { - pub fn new() -> Self { - Self { - elements: Vec::new(), - } - } - - /// Create a path from a single field name - pub fn field(name: impl Into) -> Self { - Self { - elements: vec![VariantPathElement::Field(name.into())], - } - } - - /// Create a path from a single array index - pub fn index(idx: usize) -> Self { - Self { - elements: vec![VariantPathElement::Index(idx)], - } - } - - /// Add a field access to this path - pub fn push_field(mut self, name: impl Into) -> Self { - self.elements.push(VariantPathElement::Field(name.into())); - self - } - - /// Add an array index access to this path - pub fn push_index(mut self, idx: usize) -> Self { - self.elements.push(VariantPathElement::Index(idx)); - self - } - - /// Get the path elements - pub fn elements(&self) -> &[VariantPathElement] { - &self.elements - } - - /// Check if this path is empty - pub fn is_empty(&self) -> bool { - self.elements.is_empty() - } -} - -impl Default for VariantPath { - fn default() -> Self { - Self::new() - } -} - -/// An array of Parquet [`Variant`] values -/// -/// A [`VariantArray`] wraps an Arrow [`StructArray`] that stores the underlying -/// `metadata` and `value` fields, and adds convenience methods to access -/// the `Variant`s -/// -/// See [`VariantArrayBuilder`] for constructing a `VariantArray`. -/// -/// [`VariantArrayBuilder`]: crate::VariantArrayBuilder -/// -/// # Specification -/// -/// 1. This code follows the conventions for storing variants in Arrow `StructArray` -/// defined by [Extension Type for Parquet Variant arrow] and this [document]. -/// At the time of this writing, this is not yet a standardized Arrow extension type. -/// -/// [Extension Type for Parquet Variant arrow]: https://github.com/apache/arrow/issues/46908 -/// [document]: https://docs.google.com/document/d/1pw0AWoMQY3SjD7R4LgbPvMjG_xSCtXp3rZHkVp9jpZ4/edit?usp=sharing +/// Array implementation for variant data with hybrid byte-level and high-level APIs #[derive(Debug)] pub struct VariantArray { /// StructArray of up to three fields: @@ -132,29 +53,12 @@ pub struct VariantArray { } impl VariantArray { - /// Creates a new `VariantArray` from a [`StructArray`]. - /// - /// # Arguments - /// - `inner` - The underlying [`StructArray`] that contains the variant data. - /// - /// # Returns - /// - A new instance of `VariantArray`. - /// - /// # Errors: - /// - If the `StructArray` does not contain the required fields - /// - /// # Current support - /// This structure does not (yet) support the full Arrow Variant Array specification. - /// - /// Only `StructArrays` with `metadata` and `value` fields that are - /// [`BinaryViewArray`] are supported. Shredded values are not currently supported - /// nor are using types other than `BinaryViewArray` - /// - /// [`BinaryViewArray`]: arrow::array::BinaryViewArray - pub fn try_new(inner: ArrayRef) -> Result { - let Some(inner) = inner.as_struct_opt() else { + /// Create a new VariantArray from a StructArray + pub fn try_new(inner: Arc) -> Result { + // Validate that the struct has the expected format + if inner.num_columns() != 2 { return Err(ArrowError::InvalidArgumentError( - "Invalid VariantArray: requires StructArray as input".to_string(), + "Expected struct with exactly 2 columns (metadata, value)".to_string(), )); }; // Ensure the StructArray has a metadata field of BinaryView @@ -254,127 +158,125 @@ impl VariantArray { // Start with the root variant let mut current = self.value(index); - // Navigate through the path elements + Ok(Self { inner }) + } + + /// Get the metadata field as a BinaryViewArray + pub fn metadata_field(&self) -> &BinaryViewArray { + self.inner.column(0) + .as_any() + .downcast_ref::() + .expect("Expected metadata field to be BinaryViewArray") + } + + /// Get the value field as a BinaryViewArray + pub fn value_field(&self) -> &BinaryViewArray { + self.inner.column(1) + .as_any() + .downcast_ref::() + .expect("Expected value field to be BinaryViewArray") + } + + /// Get the metadata bytes for a specific index + pub fn metadata(&self, index: usize) -> &[u8] { + self.metadata_field().value(index).as_ref() + } + + /// Get the value bytes for a specific index + pub fn value_bytes(&self, index: usize) -> &[u8] { + self.value_field().value(index).as_ref() + } + + /// Get the parsed variant at a specific index + pub fn value(&self, index: usize) -> Variant { + if index >= self.len() { + panic!("Index {} out of bounds for array of length {}", index, self.len()); + } + + if self.is_null(index) { + return Variant::Null; + } + + let metadata = self.metadata(index); + let value = self.value_bytes(index); + + let variant_metadata = VariantMetadata::try_new(metadata) + .expect("Failed to parse variant metadata"); + Variant::try_new_with_metadata(variant_metadata, value) + .expect("Failed to create variant from metadata and value") + } + + /// Get value at a specific path for the variant at the given index + /// + /// Uses high-level Variant API for convenience. Returns a Variant object that can be + /// directly used with standard variant operations. + pub fn get_path(&self, index: usize, path: &crate::field_operations::VariantPath) -> Option { + if index >= self.len() || self.is_null(index) { + return None; + } + + let mut current_variant = self.value(index); + for element in path.elements() { match element { - VariantPathElement::Field(field_name) => { - current = current.get_object_field(field_name)?; + crate::field_operations::VariantPathElement::Field(field_name) => { + current_variant = current_variant.get_object_field(field_name)?; } - VariantPathElement::Index(idx) => { - current = current.get_list_element(*idx)?; + crate::field_operations::VariantPathElement::Index(idx) => { + current_variant = current_variant.get_list_element(*idx)?; } } } - - Some(current) - } - - /// Extract multiple fields from the variant at the specified row using paths. - /// - /// This method is more efficient than calling `get_path` multiple times - /// for the same row, as it avoids repeated work. - /// - /// # Arguments - /// * `index` - The row index in the array - /// * `paths` - The paths to the fields to extract - /// - /// # Returns - /// A vector of `Option` where each element corresponds to the - /// field at the same index in the paths vector. - /// - /// # Example - /// ``` - /// # use parquet_variant_compute::{VariantArrayBuilder, VariantArray, VariantPath}; - /// # use parquet_variant::VariantBuilder; - /// # let mut builder = VariantArrayBuilder::new(1); - /// # let mut variant_builder = VariantBuilder::new(); - /// # let mut obj = variant_builder.new_object(); - /// # obj.insert("name", "Alice"); - /// # obj.insert("email", "alice@example.com"); - /// # obj.insert("timestamp", 1234567890i64); - /// # obj.finish().unwrap(); - /// # let (metadata, value) = variant_builder.finish(); - /// # builder.append_variant_buffers(&metadata, &value); - /// # let variant_array = builder.build(); - /// let paths = vec![ - /// VariantPath::field("name"), - /// VariantPath::field("email"), - /// VariantPath::field("timestamp"), - /// ]; - /// let fields = variant_array.get_paths(0, &paths); - /// ``` - pub fn get_paths(&self, index: usize, paths: &[VariantPath]) -> Vec> { - paths.iter().map(|path| self.get_path(index, path)).collect() + + Some(current_variant) + } + + /// Get values at multiple paths for the variant at the given index + /// + /// Convenience method that applies `get_path()` to multiple paths at once. + /// Useful for extracting multiple fields from a single variant row. + pub fn get_paths(&self, index: usize, paths: &[crate::field_operations::VariantPath]) -> Vec> { + let mut results = Vec::new(); + for path in paths { + results.push(self.get_path(index, path)); + } + results } - - /// Extract a specific field from all rows in the array. - /// - /// This method is optimized for extracting the same field from many rows, - /// which is a common operation in analytical queries. - /// - /// # Arguments - /// * `path` - The path to the field to extract from all rows - /// - /// # Returns - /// A vector of `Option` where each element corresponds to the - /// field value at the same row index. - /// - /// # Example - /// ``` - /// # use parquet_variant_compute::{VariantArrayBuilder, VariantArray, VariantPath}; - /// # use parquet_variant::VariantBuilder; - /// # let mut builder = VariantArrayBuilder::new(1); - /// # let mut variant_builder = VariantBuilder::new(); - /// # let mut obj = variant_builder.new_object(); - /// # obj.insert("id", 123i32); - /// # obj.finish().unwrap(); - /// # let (metadata, value) = variant_builder.finish(); - /// # builder.append_variant_buffers(&metadata, &value); - /// # let variant_array = builder.build(); - /// let path = VariantPath::field("id"); - /// let user_ids = variant_array.extract_field(&path); - /// ``` - pub fn extract_field(&self, path: &VariantPath) -> Vec> { - (0..self.len()) - .map(|i| self.get_path(i, path)) - .collect() + + /// Get the field names for an object at the given index + pub fn get_field_names(&self, index: usize) -> Vec { + if index >= self.len() { + return vec![]; + } + + if self.is_null(index) { + return vec![]; + } + + let variant = self.value(index); + if let Some(obj) = variant.as_object() { + let mut paths = Vec::new(); + for i in 0..obj.len() { + if let Some(field_name) = obj.field_name(i) { + paths.push(field_name.to_string()); + } + } + paths + } else { + vec![] + } } - - /// Extract multiple fields from all rows in the array. - /// - /// This method is optimized for extracting multiple fields from many rows, - /// which is essential for efficient shredding operations. - /// - /// # Arguments - /// * `paths` - The paths to the fields to extract from all rows - /// - /// # Returns - /// A vector of vectors where the outer vector corresponds to rows and - /// the inner vector corresponds to the fields at the same index in paths. - /// - /// # Example - /// ``` - /// # use parquet_variant_compute::{VariantArrayBuilder, VariantArray, VariantPath}; - /// # use parquet_variant::VariantBuilder; - /// # let mut builder = VariantArrayBuilder::new(1); - /// # let mut variant_builder = VariantBuilder::new(); - /// # let mut obj = variant_builder.new_object(); - /// # obj.insert("name", "Alice"); - /// # obj.insert("age", 30i32); - /// # obj.finish().unwrap(); - /// # let (metadata, value) = variant_builder.finish(); - /// # builder.append_variant_buffers(&metadata, &value); - /// # let variant_array = builder.build(); - /// let paths = vec![ - /// VariantPath::field("name"), - /// VariantPath::field("age"), - /// ]; - /// let extracted_data = variant_array.extract_fields(&paths); - /// ``` - pub fn extract_fields(&self, paths: &[VariantPath]) -> Vec>> { - (0..self.len()) - .map(|i| self.get_paths(i, paths)) - .collect() + + /// Extract field values by path from all variants in the array + /// + /// Applies `get_path()` to a single path across all rows in the array. + /// Useful for extracting a column of values from nested variant data. + pub fn extract_field_by_path(&self, path: &crate::field_operations::VariantPath) -> Vec> { + let mut results = Vec::new(); + for i in 0..self.len() { + results.push(self.get_path(i, path)); + } + results } /// Return a reference to the metadata field of the [`StructArray`] @@ -388,21 +290,70 @@ impl VariantArray { // spec says fields order is not guaranteed, so we search by name &self.value_ref } + + /// Create a new VariantArray with a field removed from all variants + pub fn with_field_removed(&self, field_name: &str) -> Result { + let mut builder = crate::variant_array_builder::VariantArrayBuilder::new(self.len()); + + for i in 0..self.len() { + if self.is_null(i) { + builder.append_null(); + } else { + match FieldOperations::remove_field_bytes(self.metadata(i), self.value_bytes(i), field_name)? { + Some(new_value) => { + builder.append_variant_buffers(self.metadata(i), &new_value); + } + None => { + // Field didn't exist, use original value + builder.append_variant_buffers(self.metadata(i), self.value_bytes(i)); + } + } + } + } + + Ok(builder.build()) + } + + /// Create a new VariantArray with multiple fields removed from all variants + pub fn with_fields_removed(&self, field_names: &[&str]) -> Result { + let mut builder = crate::variant_array_builder::VariantArrayBuilder::new(self.len()); + + for i in 0..self.len() { + if self.is_null(i) { + builder.append_null(); + } else { + match FieldOperations::remove_fields_bytes(self.metadata(i), self.value_bytes(i), field_names)? { + Some(new_value) => { + builder.append_variant_buffers(self.metadata(i), &new_value); + } + None => { + // No fields existed, use original value + builder.append_variant_buffers(self.metadata(i), self.value_bytes(i)); + } + } + } + } + + Ok(builder.build()) + } } impl Array for VariantArray { fn as_any(&self) -> &dyn Any { self } - + fn to_data(&self) -> ArrayData { self.inner.to_data() } - + fn into_data(self) -> ArrayData { - self.inner.into_data() + match Arc::try_unwrap(self.inner) { + Ok(inner) => inner.into_data(), + Err(inner) => inner.to_data(), + } } - + fn data_type(&self) -> &DataType { self.inner.data_type() } @@ -417,360 +368,152 @@ impl Array for VariantArray { value_ref: val, }) } - + fn len(&self) -> usize { self.inner.len() } - + fn is_empty(&self) -> bool { self.inner.is_empty() } - - fn offset(&self) -> usize { - self.inner.offset() - } - + fn nulls(&self) -> Option<&NullBuffer> { self.inner.nulls() } - + + fn offset(&self) -> usize { + self.inner.offset() + } + fn get_buffer_memory_size(&self) -> usize { self.inner.get_buffer_memory_size() } - + fn get_array_memory_size(&self) -> usize { self.inner.get_array_memory_size() } } #[cfg(test)] -mod test { +mod tests { use super::*; - use arrow::array::{BinaryArray, BinaryViewArray}; - use arrow_schema::{Field, Fields}; - use parquet_variant::{Variant, VariantBuilder}; + use crate::variant_array_builder::VariantArrayBuilder; + use parquet_variant::VariantBuilder; - #[test] - fn invalid_not_a_struct_array() { - let array = make_binary_view_array(); - // Should fail because the input is not a StructArray - let err = VariantArray::try_new(array); - assert_eq!( - err.unwrap_err().to_string(), - "Invalid argument error: Invalid VariantArray: requires StructArray as input" - ); - } - - #[test] - fn invalid_missing_metadata() { - let fields = Fields::from(vec![Field::new("value", DataType::BinaryView, true)]); - let array = StructArray::new(fields, vec![make_binary_view_array()], None); - // Should fail because the StructArray does not contain a 'metadata' field - let err = VariantArray::try_new(Arc::new(array)); - assert_eq!( - err.unwrap_err().to_string(), - "Invalid argument error: Invalid VariantArray: StructArray must contain a 'metadata' field" - ); - } - - #[test] - fn invalid_missing_value() { - let fields = Fields::from(vec![Field::new("metadata", DataType::BinaryView, false)]); - let array = StructArray::new(fields, vec![make_binary_view_array()], None); - // Should fail because the StructArray does not contain a 'value' field - let err = VariantArray::try_new(Arc::new(array)); - assert_eq!( - err.unwrap_err().to_string(), - "Invalid argument error: Invalid VariantArray: StructArray must contain a 'value' field" - ); - } - - #[test] - fn invalid_metadata_field_type() { - let fields = Fields::from(vec![ - Field::new("metadata", DataType::Binary, true), // Not yet supported - Field::new("value", DataType::BinaryView, true), - ]); - let array = StructArray::new( - fields, - vec![make_binary_array(), make_binary_view_array()], - None, - ); - let err = VariantArray::try_new(Arc::new(array)); - assert_eq!( - err.unwrap_err().to_string(), - "Not yet implemented: VariantArray 'metadata' field must be BinaryView, got Binary" - ); - } - - #[test] - fn invalid_value_field_type() { - let fields = Fields::from(vec![ - Field::new("metadata", DataType::BinaryView, true), - Field::new("value", DataType::Binary, true), // Not yet supported - ]); - let array = StructArray::new( - fields, - vec![make_binary_view_array(), make_binary_array()], - None, - ); - let err = VariantArray::try_new(Arc::new(array)); - assert_eq!( - err.unwrap_err().to_string(), - "Not yet implemented: VariantArray 'value' field must be BinaryView, got Binary" - ); - } - - #[test] - fn test_variant_path_creation() { - let path = VariantPath::field("user") - .push_field("profile") - .push_field("name"); - - assert_eq!(path.elements().len(), 3); - assert_eq!(path.elements()[0], VariantPathElement::Field("user".to_string())); - assert_eq!(path.elements()[1], VariantPathElement::Field("profile".to_string())); - assert_eq!(path.elements()[2], VariantPathElement::Field("name".to_string())); - } - - #[test] - fn test_variant_path_with_index() { - let path = VariantPath::field("users") - .push_index(0) - .push_field("name"); + fn create_test_variant_array() -> VariantArray { + let mut builder = VariantArrayBuilder::new(2); - assert_eq!(path.elements().len(), 3); - assert_eq!(path.elements()[0], VariantPathElement::Field("users".to_string())); - assert_eq!(path.elements()[1], VariantPathElement::Index(0)); - assert_eq!(path.elements()[2], VariantPathElement::Field("name".to_string())); - } - - #[test] - fn test_get_path_simple_field() { - let variant_array = create_test_variant_array(); + // Create variant 1: {"name": "Alice", "age": 30} + let mut builder1 = VariantBuilder::new(); + { + let mut obj = builder1.new_object(); + obj.insert("name", "Alice"); + obj.insert("age", 30i32); + obj.finish().unwrap(); + } + let (metadata1, value1) = builder1.finish(); + builder.append_variant_buffers(&metadata1, &value1); - let path = VariantPath::field("name"); - let result = variant_array.get_path(0, &path); + // Create variant 2: {"name": "Bob", "age": 25, "city": "NYC"} + let mut builder2 = VariantBuilder::new(); + { + let mut obj = builder2.new_object(); + obj.insert("name", "Bob"); + obj.insert("age", 25i32); + obj.insert("city", "NYC"); + obj.finish().unwrap(); + } + let (metadata2, value2) = builder2.finish(); + builder.append_variant_buffers(&metadata2, &value2); - assert!(result.is_some()); - assert_eq!(result.unwrap(), Variant::from("Alice")); + builder.build() } #[test] - fn test_get_path_nested_field() { - let variant_array = create_test_variant_array(); + fn test_variant_array_basic() { + let array = create_test_variant_array(); + assert_eq!(array.len(), 2); + assert!(!array.is_empty()); - let path = VariantPath::field("details").push_field("age"); - let result = variant_array.get_path(0, &path); + // Test accessing variants + let variant1 = array.value(0); + assert_eq!(variant1.get_object_field("name").unwrap().as_string(), Some("Alice")); + assert_eq!(variant1.get_object_field("age").unwrap().as_int32(), Some(30)); - assert!(result.is_some()); - assert_eq!(result.unwrap(), Variant::from(30i32)); + let variant2 = array.value(1); + assert_eq!(variant2.get_object_field("name").unwrap().as_string(), Some("Bob")); + assert_eq!(variant2.get_object_field("age").unwrap().as_int32(), Some(25)); + assert_eq!(variant2.get_object_field("city").unwrap().as_string(), Some("NYC")); } #[test] - fn test_get_path_array_index() { - let variant_array = create_test_variant_array(); + fn test_get_field_names() { + let array = create_test_variant_array(); - let path = VariantPath::field("hobbies").push_index(1); - let result = variant_array.get_path(0, &path); + let paths1 = array.get_field_names(0); + assert_eq!(paths1.len(), 2); + assert!(paths1.contains(&"name".to_string())); + assert!(paths1.contains(&"age".to_string())); - assert!(result.is_some()); - assert_eq!(result.unwrap(), Variant::from("cooking")); + let paths2 = array.get_field_names(1); + assert_eq!(paths2.len(), 3); + assert!(paths2.contains(&"name".to_string())); + assert!(paths2.contains(&"age".to_string())); + assert!(paths2.contains(&"city".to_string())); } #[test] - fn test_get_path_nonexistent_field() { - let variant_array = create_test_variant_array(); + fn test_get_path() { + let array = create_test_variant_array(); - let path = VariantPath::field("nonexistent"); - let result = variant_array.get_path(0, &path); + // Test field access + let name_path = crate::field_operations::VariantPath::field("name"); + let alice_name = array.get_path(0, &name_path).unwrap(); + assert_eq!(alice_name.as_string(), Some("Alice")); + // Test non-existent field + let nonexistent_path = crate::field_operations::VariantPath::field("nonexistent"); + let result = array.get_path(0, &nonexistent_path); assert!(result.is_none()); } #[test] - fn test_get_path_empty_path() { - let variant_array = create_test_variant_array(); - - let path = VariantPath::new(); - let result = variant_array.get_path(0, &path); - - assert!(result.is_some()); - // Should return the full variant - let variant = result.unwrap(); - assert!(variant.as_object().is_some()); - } - - #[test] - fn test_get_paths_multiple() { - let variant_array = create_test_variant_array(); - - let paths = vec![ - VariantPath::field("name"), - VariantPath::field("details").push_field("age"), - VariantPath::field("hobbies").push_index(0), - ]; - - let results = variant_array.get_paths(0, &paths); + fn test_with_field_removed() { + let array = create_test_variant_array(); - assert_eq!(results.len(), 3); - assert_eq!(results[0], Some(Variant::from("Alice"))); - assert_eq!(results[1], Some(Variant::from(30i32))); - assert_eq!(results[2], Some(Variant::from("reading"))); - } - - #[test] - fn test_extract_field_all_rows() { - let variant_array = create_test_variant_array_multiple_rows(); + let new_array = array.with_field_removed("age").unwrap(); - let path = VariantPath::field("name"); - let results = variant_array.extract_field(&path); + // Check that age field was removed from all variants + let variant1 = new_array.value(0); + let obj1 = variant1.as_object().unwrap(); + assert_eq!(obj1.len(), 1); + assert!(obj1.get("name").is_some()); + assert!(obj1.get("age").is_none()); - assert_eq!(results.len(), 2); - assert_eq!(results[0], Some(Variant::from("Alice"))); - assert_eq!(results[1], Some(Variant::from("Bob"))); + let variant2 = new_array.value(1); + let obj2 = variant2.as_object().unwrap(); + assert_eq!(obj2.len(), 2); + assert!(obj2.get("name").is_some()); + assert!(obj2.get("age").is_none()); + assert!(obj2.get("city").is_some()); } #[test] - fn test_extract_fields_all_rows() { - let variant_array = create_test_variant_array_multiple_rows(); + fn test_metadata_and_value_fields() { + let array = create_test_variant_array(); - let paths = vec![ - VariantPath::field("name"), - VariantPath::field("details").push_field("age"), - ]; + let metadata_field = array.metadata_field(); + let value_field = array.value_field(); - let results = variant_array.extract_fields(&paths); + // Check that we got the expected arrays + assert_eq!(metadata_field.len(), 2); + assert_eq!(value_field.len(), 2); - assert_eq!(results.len(), 2); // 2 rows - assert_eq!(results[0].len(), 2); // 2 fields per row - assert_eq!(results[1].len(), 2); // 2 fields per row - - // Row 0 - assert_eq!(results[0][0], Some(Variant::from("Alice"))); - assert_eq!(results[0][1], Some(Variant::from(30i32))); - - // Row 1 - assert_eq!(results[1][0], Some(Variant::from("Bob"))); - assert_eq!(results[1][1], Some(Variant::from(25i32))); - } - - fn make_binary_view_array() -> ArrayRef { - Arc::new(BinaryViewArray::from(vec![b"test" as &[u8]])) - } - - fn make_binary_array() -> ArrayRef { - Arc::new(BinaryArray::from(vec![b"test" as &[u8]])) - } - - /// Create a test VariantArray with a single row containing: - /// { - /// "name": "Alice", - /// "details": { - /// "age": 30, - /// "city": "New York" - /// }, - /// "hobbies": ["reading", "cooking", "hiking"] - /// } - fn create_test_variant_array() -> VariantArray { - let mut builder = VariantBuilder::new(); - let mut obj = builder.new_object(); - - obj.insert("name", "Alice"); - - // Create details object - { - let mut details = obj.new_object("details"); - details.insert("age", 30i32); - details.insert("city", "New York"); - let _ = details.finish(); - } - - // Create hobbies list - { - let mut hobbies = obj.new_list("hobbies"); - hobbies.append_value("reading"); - hobbies.append_value("cooking"); - hobbies.append_value("hiking"); - hobbies.finish(); - } - - obj.finish().unwrap(); - - let (metadata, value) = builder.finish(); - - // Create VariantArray - let metadata_array = Arc::new(BinaryViewArray::from(vec![metadata.as_slice()])); - let value_array = Arc::new(BinaryViewArray::from(vec![value.as_slice()])); - - let fields = Fields::from(vec![ - Field::new("metadata", DataType::BinaryView, false), - Field::new("value", DataType::BinaryView, false), - ]); - - let struct_array = StructArray::new(fields, vec![metadata_array, value_array], None); - - VariantArray::try_new(Arc::new(struct_array)).unwrap() - } - - /// Create a test VariantArray with multiple rows - fn create_test_variant_array_multiple_rows() -> VariantArray { - let mut metadata_vec = Vec::new(); - let mut value_vec = Vec::new(); - - // Row 0: Alice - { - let mut builder = VariantBuilder::new(); - let mut obj = builder.new_object(); - obj.insert("name", "Alice"); - - // Create details object - { - let mut details = obj.new_object("details"); - details.insert("age", 30i32); - let _ = details.finish(); - } - - obj.finish().unwrap(); - let (metadata, value) = builder.finish(); - metadata_vec.push(metadata); - value_vec.push(value); - } - - // Row 1: Bob - { - let mut builder = VariantBuilder::new(); - let mut obj = builder.new_object(); - obj.insert("name", "Bob"); - - // Create details object - { - let mut details = obj.new_object("details"); - details.insert("age", 25i32); - let _ = details.finish(); - } - - obj.finish().unwrap(); - let (metadata, value) = builder.finish(); - metadata_vec.push(metadata); - value_vec.push(value); - } - - // Create VariantArray - let metadata_array = Arc::new(BinaryViewArray::from( - metadata_vec.iter().map(|m| m.as_slice()).collect::>() - )); - let value_array = Arc::new(BinaryViewArray::from( - value_vec.iter().map(|v| v.as_slice()).collect::>() - )); - - let fields = Fields::from(vec![ - Field::new("metadata", DataType::BinaryView, false), - Field::new("value", DataType::BinaryView, false), - ]); - - let struct_array = StructArray::new(fields, vec![metadata_array, value_array], None); - - VariantArray::try_new(Arc::new(struct_array)).unwrap() + // Check that metadata and value bytes are non-empty + assert!(!metadata_field.value(0).is_empty()); + assert!(!value_field.value(0).is_empty()); + assert!(!metadata_field.value(1).is_empty()); + assert!(!value_field.value(1).is_empty()); } } + diff --git a/parquet-variant-compute/src/variant_array_builder.rs b/parquet-variant-compute/src/variant_array_builder.rs index 6bc405c27b06..aab5b978e107 100644 --- a/parquet-variant-compute/src/variant_array_builder.rs +++ b/parquet-variant-compute/src/variant_array_builder.rs @@ -131,6 +131,11 @@ impl VariantArrayBuilder { VariantArray::try_new(Arc::new(inner)).expect("valid VariantArray by construction") } + + /// Finish building the VariantArray (alias for build for compatibility) + pub fn finish(self) -> VariantArray { + self.build() + } /// Appends a null row to the builder. pub fn append_null(&mut self) { diff --git a/parquet-variant-compute/src/variant_parser.rs b/parquet-variant-compute/src/variant_parser.rs new file mode 100644 index 000000000000..cc09301ea851 --- /dev/null +++ b/parquet-variant-compute/src/variant_parser.rs @@ -0,0 +1,226 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +//! Low-level binary format parsing for variant objects + +use arrow::error::ArrowError; + +/// Object header structure for variant objects +#[derive(Debug, Clone)] +pub struct ObjectHeader { + pub num_elements_size: usize, + pub field_id_size: usize, + pub field_offset_size: usize, + pub is_large: bool, +} + +/// Array header structure for variant objects +#[derive(Debug, Clone)] +pub struct ArrayHeader { + pub num_elements_size: usize, + pub element_offset_size: usize, + pub is_large: bool, +} + +/// Object byte offsets structure +#[derive(Debug, Clone)] +pub struct ObjectOffsets { + pub field_ids_start: usize, + pub field_offsets_start: usize, + pub values_start: usize, +} + +/// Array byte offsets structure +#[derive(Debug, Clone)] +pub struct ArrayOffsets { + pub element_offsets_start: usize, + pub elements_start: usize, +} + +/// Low-level parser for variant binary format +pub struct VariantParser; + +impl VariantParser { + /// Parse object header from header byte + pub fn parse_object_header(header_byte: u8) -> Result { + let value_header = header_byte >> 2; + let field_offset_size_minus_one = value_header & 0x03; + let field_id_size_minus_one = (value_header >> 2) & 0x03; + let is_large = (value_header & 0x10) != 0; + + let num_elements_size = if is_large { 4 } else { 1 }; + let field_id_size = (field_id_size_minus_one + 1) as usize; + let field_offset_size = (field_offset_size_minus_one + 1) as usize; + + Ok(ObjectHeader { + num_elements_size, + field_id_size, + field_offset_size, + is_large, + }) + } + + /// Unpack integer from bytes + pub fn unpack_int(bytes: &[u8], size: usize) -> Result { + if bytes.len() < size { + return Err(ArrowError::InvalidArgumentError( + "Not enough bytes to unpack integer".to_string() + )); + } + + match size { + 1 => Ok(bytes[0] as usize), + 2 => Ok(u16::from_le_bytes([bytes[0], bytes[1]]) as usize), + 3 => Ok(u32::from_le_bytes([bytes[0], bytes[1], bytes[2], 0]) as usize), + 4 => Ok(u32::from_le_bytes([bytes[0], bytes[1], bytes[2], bytes[3]]) as usize), + _ => Err(ArrowError::InvalidArgumentError( + format!("Invalid integer size: {}", size) + )), + } + } + + /// Calculate the size needed to store an integer + pub fn calculate_int_size(value: usize) -> usize { + if value <= u8::MAX as usize { + 1 + } else if value <= u16::MAX as usize { + 2 + } else if value <= 0xFFFFFF { + 3 + } else { + 4 + } + } + + /// Build object header byte + pub fn build_object_header(is_large: bool, field_id_size: usize, field_offset_size: usize) -> u8 { + let large_bit = if is_large { 1 } else { 0 }; + (large_bit << 6) | (((field_id_size - 1) as u8) << 4) | (((field_offset_size - 1) as u8) << 2) | 2 + } + + /// Write integer bytes to buffer + pub fn write_int_bytes(buffer: &mut Vec, value: usize, size: usize) { + match size { + 1 => buffer.push(value as u8), + 2 => buffer.extend_from_slice(&(value as u16).to_le_bytes()), + 3 => { + let bytes = (value as u32).to_le_bytes(); + buffer.extend_from_slice(&bytes[..3]); + } + 4 => buffer.extend_from_slice(&(value as u32).to_le_bytes()), + _ => panic!("Invalid size: {}", size), + } + } + + /// Check if value bytes represent an object + pub fn is_object(value_bytes: &[u8]) -> bool { + if value_bytes.is_empty() { + return false; + } + + let header_byte = value_bytes[0]; + let basic_type = header_byte & 0x03; // Basic type is in first 2 bits + basic_type == 2 // Object type + } + + /// Get the basic type from header byte + pub fn get_basic_type(header_byte: u8) -> u8 { + header_byte & 0x03 // Basic type is in first 2 bits + } + + /// Check if value bytes represent an array + pub fn is_array(value_bytes: &[u8]) -> bool { + if value_bytes.is_empty() { + return false; + } + + let header_byte = value_bytes[0]; + let basic_type = header_byte & 0x03; // Basic type is in first 2 bits + basic_type == 3 // Array type + } + + /// Parse array header from header byte + pub fn parse_array_header(header_byte: u8) -> Result { + let value_header = header_byte >> 2; + let element_offset_size_minus_one = value_header & 0x03; + let is_large = (value_header & 0x10) != 0; + + let num_elements_size = if is_large { 4 } else { 1 }; + let element_offset_size = (element_offset_size_minus_one + 1) as usize; + + Ok(ArrayHeader { + num_elements_size, + element_offset_size, + is_large, + }) + } + + /// Calculate byte offsets for array elements + pub fn calculate_array_offsets(header: &ArrayHeader, num_elements: usize) -> ArrayOffsets { + let element_offsets_start = 1 + header.num_elements_size; + let elements_start = element_offsets_start + ((num_elements + 1) * header.element_offset_size); + + ArrayOffsets { + element_offsets_start, + elements_start, + } + } + + /// Calculate byte offsets for object fields + pub fn calculate_object_offsets(header: &ObjectHeader, num_elements: usize) -> ObjectOffsets { + let field_ids_start = 1 + header.num_elements_size; + let field_offsets_start = field_ids_start + (num_elements * header.field_id_size); + let values_start = field_offsets_start + ((num_elements + 1) * header.field_offset_size); + + ObjectOffsets { + field_ids_start, + field_offsets_start, + values_start, + } + } +} + +#[cfg(test)] +mod tests { + use super::*; + + #[test] + fn test_unpack_int() { + assert_eq!(VariantParser::unpack_int(&[42], 1).unwrap(), 42); + assert_eq!(VariantParser::unpack_int(&[0, 1], 2).unwrap(), 256); + assert_eq!(VariantParser::unpack_int(&[0, 0, 1, 0], 4).unwrap(), 65536); + } + + #[test] + fn test_calculate_int_size() { + assert_eq!(VariantParser::calculate_int_size(255), 1); + assert_eq!(VariantParser::calculate_int_size(256), 2); + assert_eq!(VariantParser::calculate_int_size(65536), 3); + assert_eq!(VariantParser::calculate_int_size(16777216), 4); + } + + #[test] + fn test_write_int_bytes() { + let mut buffer = Vec::new(); + VariantParser::write_int_bytes(&mut buffer, 42, 1); + assert_eq!(buffer, vec![42]); + + let mut buffer = Vec::new(); + VariantParser::write_int_bytes(&mut buffer, 256, 2); + assert_eq!(buffer, vec![0, 1]); + } +} \ No newline at end of file From d7821972ce7ad830ef5fb2cc09636edefd91e939 Mon Sep 17 00:00:00 2001 From: carpecodeum Date: Wed, 16 Jul 2025 12:14:31 -0400 Subject: [PATCH 04/18] [FIX] fix variant_array implementation --- .../examples/field_removal.rs | 6 - parquet-variant-compute/src/from_json.rs | 10 +- parquet-variant-compute/src/variant_array.rs | 246 +++++++++++++----- 3 files changed, 183 insertions(+), 79 deletions(-) diff --git a/parquet-variant-compute/examples/field_removal.rs b/parquet-variant-compute/examples/field_removal.rs index 25b05f73547c..ed1e8feb3038 100644 --- a/parquet-variant-compute/examples/field_removal.rs +++ b/parquet-variant-compute/examples/field_removal.rs @@ -107,10 +107,4 @@ fn main() { } } - println!("\n=== Performance Features ==="); - println!("✓ Efficient field removal at byte level"); - println!("✓ Support for nested field removal"); - println!("✓ Batch operations for cleaning multiple fields"); - println!("✓ Maintains data integrity during field removal"); - println!("✓ Foundation for data governance and privacy compliance"); } \ No newline at end of file diff --git a/parquet-variant-compute/src/from_json.rs b/parquet-variant-compute/src/from_json.rs index 96980a3ef8a6..b48487d8dd5b 100644 --- a/parquet-variant-compute/src/from_json.rs +++ b/parquet-variant-compute/src/from_json.rs @@ -54,7 +54,7 @@ pub fn batch_json_string_to_variant(input: &ArrayRef) -> Result) -> Result { - // Validate that the struct has the expected format - if inner.num_columns() != 2 { + /// Creates a new `VariantArray` from a [`StructArray`]. + /// + /// # Arguments + /// - `inner` - The underlying [`StructArray`] that contains the variant data. + /// + /// # Returns + /// - A new instance of `VariantArray`. + /// + /// # Errors: + /// - If the `StructArray` does not contain the required fields + /// + /// # Current support + /// This structure does not (yet) support the full Arrow Variant Array specification. + /// + /// Only `StructArrays` with `metadata` and `value` fields that are + /// [`BinaryViewArray`] are supported. Shredded values are not currently supported + /// nor are using types other than `BinaryViewArray` + /// + /// [`BinaryViewArray`]: arrow::array::BinaryViewArray + pub fn try_new(inner: ArrayRef) -> Result { + let Some(inner) = inner.as_struct_opt() else { return Err(ArrowError::InvalidArgumentError( - "Expected struct with exactly 2 columns (metadata, value)".to_string(), + "Invalid VariantArray: requires StructArray as input".to_string(), )); }; // Ensure the StructArray has a metadata field of BinaryView @@ -160,50 +193,49 @@ impl VariantArray { Ok(Self { inner }) } - - /// Get the metadata field as a BinaryViewArray - pub fn metadata_field(&self) -> &BinaryViewArray { - self.inner.column(0) - .as_any() - .downcast_ref::() - .expect("Expected metadata field to be BinaryViewArray") + + /// Returns a reference to the underlying [`StructArray`]. + pub fn inner(&self) -> &StructArray { + &self.inner } - - /// Get the value field as a BinaryViewArray - pub fn value_field(&self) -> &BinaryViewArray { - self.inner.column(1) - .as_any() - .downcast_ref::() - .expect("Expected value field to be BinaryViewArray") + + /// Returns the inner [`StructArray`], consuming self + pub fn into_inner(self) -> StructArray { + self.inner } - + + /// Return the [`Variant`] instance stored at the given row + /// + /// Panics if the index is out of bounds. + /// + /// Note: Does not do deep validation of the [`Variant`], so it is up to the + /// caller to ensure that the metadata and value were constructed correctly. + pub fn value(&self, index: usize) -> Variant { + let metadata = self.metadata_field().as_binary_view().value(index); + let value = self.value_field().as_binary_view().value(index); + Variant::new(metadata, value) + } + + /// Return a reference to the metadata field of the [`StructArray`] + pub fn metadata_field(&self) -> &ArrayRef { + // spec says fields order is not guaranteed, so we search by name + self.inner.column_by_name("metadata").unwrap() + } + + /// Return a reference to the value field of the `StructArray` + pub fn value_field(&self) -> &ArrayRef { + // spec says fields order is not guaranteed, so we search by name + self.inner.column_by_name("value").unwrap() + } + /// Get the metadata bytes for a specific index pub fn metadata(&self, index: usize) -> &[u8] { - self.metadata_field().value(index).as_ref() + self.metadata_field().as_binary_view().value(index).as_ref() } /// Get the value bytes for a specific index pub fn value_bytes(&self, index: usize) -> &[u8] { - self.value_field().value(index).as_ref() - } - - /// Get the parsed variant at a specific index - pub fn value(&self, index: usize) -> Variant { - if index >= self.len() { - panic!("Index {} out of bounds for array of length {}", index, self.len()); - } - - if self.is_null(index) { - return Variant::Null; - } - - let metadata = self.metadata(index); - let value = self.value_bytes(index); - - let variant_metadata = VariantMetadata::try_new(metadata) - .expect("Failed to parse variant metadata"); - Variant::try_new_with_metadata(variant_metadata, value) - .expect("Failed to create variant from metadata and value") + self.value_field().as_binary_view().value(index).as_ref() } /// Get value at a specific path for the variant at the given index @@ -342,18 +374,15 @@ impl Array for VariantArray { fn as_any(&self) -> &dyn Any { self } - + fn to_data(&self) -> ArrayData { self.inner.to_data() } - + fn into_data(self) -> ArrayData { - match Arc::try_unwrap(self.inner) { - Ok(inner) => inner.into_data(), - Err(inner) => inner.to_data(), - } + self.inner.into_data() } - + fn data_type(&self) -> &DataType { self.inner.data_type() } @@ -368,38 +397,111 @@ impl Array for VariantArray { value_ref: val, }) } - + fn len(&self) -> usize { self.inner.len() } - + fn is_empty(&self) -> bool { self.inner.is_empty() } - - fn nulls(&self) -> Option<&NullBuffer> { - self.inner.nulls() - } - + fn offset(&self) -> usize { self.inner.offset() } - + + fn nulls(&self) -> Option<&NullBuffer> { + self.inner.nulls() + } + fn get_buffer_memory_size(&self) -> usize { self.inner.get_buffer_memory_size() } - + fn get_array_memory_size(&self) -> usize { self.inner.get_array_memory_size() } } #[cfg(test)] -mod tests { +mod test { use super::*; use crate::variant_array_builder::VariantArrayBuilder; + use arrow::array::{BinaryArray, BinaryViewArray}; + use arrow_schema::{Field, Fields}; use parquet_variant::VariantBuilder; + #[test] + fn invalid_not_a_struct_array() { + let array = make_binary_view_array(); + // Should fail because the input is not a StructArray + let err = VariantArray::try_new(array); + assert_eq!( + err.unwrap_err().to_string(), + "Invalid argument error: Invalid VariantArray: requires StructArray as input" + ); + } + + #[test] + fn invalid_missing_metadata() { + let fields = Fields::from(vec![Field::new("value", DataType::BinaryView, true)]); + let array = StructArray::new(fields, vec![make_binary_view_array()], None); + // Should fail because the StructArray does not contain a 'metadata' field + let err = VariantArray::try_new(Arc::new(array)); + assert_eq!( + err.unwrap_err().to_string(), + "Invalid argument error: Invalid VariantArray: StructArray must contain a 'metadata' field" + ); + } + + #[test] + fn invalid_missing_value() { + let fields = Fields::from(vec![Field::new("metadata", DataType::BinaryView, false)]); + let array = StructArray::new(fields, vec![make_binary_view_array()], None); + // Should fail because the StructArray does not contain a 'value' field + let err = VariantArray::try_new(Arc::new(array)); + assert_eq!( + err.unwrap_err().to_string(), + "Invalid argument error: Invalid VariantArray: StructArray must contain a 'value' field" + ); + } + + #[test] + fn invalid_metadata_field_type() { + let fields = Fields::from(vec![ + Field::new("metadata", DataType::Binary, true), // Not yet supported + Field::new("value", DataType::BinaryView, true), + ]); + let array = StructArray::new( + fields, + vec![make_binary_array(), make_binary_view_array()], + None, + ); + let err = VariantArray::try_new(Arc::new(array)); + assert_eq!( + err.unwrap_err().to_string(), + "Not yet implemented: VariantArray 'metadata' field must be BinaryView, got Binary" + ); + } + + #[test] + fn invalid_value_field_type() { + let fields = Fields::from(vec![ + Field::new("metadata", DataType::BinaryView, true), + Field::new("value", DataType::Binary, true), // Not yet supported + ]); + let array = StructArray::new( + fields, + vec![make_binary_view_array(), make_binary_array()], + None, + ); + let err = VariantArray::try_new(Arc::new(array)); + assert_eq!( + err.unwrap_err().to_string(), + "Not yet implemented: VariantArray 'value' field must be BinaryView, got Binary" + ); + } + fn create_test_variant_array() -> VariantArray { let mut builder = VariantArrayBuilder::new(2); @@ -510,10 +612,18 @@ mod tests { assert_eq!(value_field.len(), 2); // Check that metadata and value bytes are non-empty - assert!(!metadata_field.value(0).is_empty()); - assert!(!value_field.value(0).is_empty()); - assert!(!metadata_field.value(1).is_empty()); - assert!(!value_field.value(1).is_empty()); + assert!(!metadata_field.as_binary_view().value(0).is_empty()); + assert!(!value_field.as_binary_view().value(0).is_empty()); + assert!(!metadata_field.as_binary_view().value(1).is_empty()); + assert!(!value_field.as_binary_view().value(1).is_empty()); + } + + fn make_binary_view_array() -> ArrayRef { + Arc::new(BinaryViewArray::from(vec![b"test" as &[u8]])) + } + + fn make_binary_array() -> ArrayRef { + Arc::new(BinaryArray::from(vec![b"test" as &[u8]])) } } From 948bb394432af05f01e24db8c1fa78271b04e8b6 Mon Sep 17 00:00:00 2001 From: carpecodeum Date: Wed, 16 Jul 2025 15:57:30 -0400 Subject: [PATCH 05/18] [ADD] add support for path operations on different data types --- .../src/field_operations.rs | 144 ++++--- parquet-variant-compute/src/lib.rs | 1 + parquet-variant-compute/src/variant_parser.rs | 361 ++++++++++++++++-- 3 files changed, 418 insertions(+), 88 deletions(-) diff --git a/parquet-variant-compute/src/field_operations.rs b/parquet-variant-compute/src/field_operations.rs index 3ef71d7db77c..526293a4f7c2 100644 --- a/parquet-variant-compute/src/field_operations.rs +++ b/parquet-variant-compute/src/field_operations.rs @@ -346,13 +346,38 @@ impl FieldOperations { Ok(Some(current_value)) } + /// Get the value at a specific path and return its type and data + pub fn get_path_with_type( + metadata_bytes: &[u8], + value_bytes: &[u8], + path: &VariantPath, + ) -> Result)>, ArrowError> { + if let Some(value_bytes) = Self::get_path_bytes(metadata_bytes, value_bytes, path)? { + if !value_bytes.is_empty() { + let variant_type = VariantParser::parse_variant_header(value_bytes[0])?; + return Ok(Some((variant_type, value_bytes))); + } + } + Ok(None) + } + /// Get field bytes from an object at the byte level fn get_field_bytes( metadata_bytes: &[u8], value_bytes: &[u8], field_name: &str, ) -> Result>, ArrowError> { - Self::extract_field_bytes(metadata_bytes, value_bytes, field_name) + // Use the general dispatch parser to ensure we're dealing with an object + if !value_bytes.is_empty() { + match VariantParser::parse_variant_header(value_bytes[0])? { + crate::variant_parser::VariantType::Object(_) => { + Self::extract_field_bytes(metadata_bytes, value_bytes, field_name) + } + _ => Ok(None), // Not an object, can't extract fields + } + } else { + Ok(None) + } } /// Get array element bytes at the byte level @@ -361,72 +386,67 @@ impl FieldOperations { value_bytes: &[u8], index: usize, ) -> Result>, ArrowError> { - // Check if this is an array + // Use the general dispatch parser to ensure we're dealing with an array if value_bytes.is_empty() { return Ok(None); } - let header_byte = value_bytes[0]; - let basic_type = VariantParser::get_basic_type(header_byte); - - // Only handle arrays (basic_type == 3 according to variant spec) - if basic_type != 3 { - return Ok(None); - } - - // Parse array header to get element count and offsets - let array_header = VariantParser::parse_array_header(header_byte)?; - let num_elements = VariantParser::unpack_int( - &value_bytes[1..], - array_header.num_elements_size - )?; - - // Check bounds - if index >= num_elements { - return Ok(None); - } - - // Calculate array offsets - let offsets = VariantParser::calculate_array_offsets(&array_header, num_elements); - - // Get element offset - let element_offset_start = offsets.element_offsets_start + index * array_header.element_offset_size; - let element_offset_end = element_offset_start + array_header.element_offset_size; - - if element_offset_end > value_bytes.len() { - return Err(ArrowError::InvalidArgumentError( - "Element offset exceeds value buffer".to_string() - )); - } - - let element_offset = VariantParser::unpack_int( - &value_bytes[element_offset_start..element_offset_end], - array_header.element_offset_size - )?; - - // Get next element offset (or end of data) - let next_offset = if index + 1 < num_elements { - let next_element_offset_start = offsets.element_offsets_start + (index + 1) * array_header.element_offset_size; - let next_element_offset_end = next_element_offset_start + array_header.element_offset_size; - VariantParser::unpack_int( - &value_bytes[next_element_offset_start..next_element_offset_end], - array_header.element_offset_size - )? - } else { - value_bytes.len() - }; - - // Extract element bytes - let element_start = offsets.elements_start + element_offset; - let element_end = offsets.elements_start + next_offset; - - if element_end > value_bytes.len() { - return Err(ArrowError::InvalidArgumentError( - "Element data exceeds value buffer".to_string() - )); + match VariantParser::parse_variant_header(value_bytes[0])? { + crate::variant_parser::VariantType::Array(array_header) => { + let num_elements = VariantParser::unpack_int( + &value_bytes[1..], + array_header.num_elements_size + )?; + + // Check bounds + if index >= num_elements { + return Ok(None); + } + + // Calculate array offsets + let offsets = VariantParser::calculate_array_offsets(&array_header, num_elements); + + // Get element offset + let element_offset_start = offsets.element_offsets_start + index * array_header.element_offset_size; + let element_offset_end = element_offset_start + array_header.element_offset_size; + + if element_offset_end > value_bytes.len() { + return Err(ArrowError::InvalidArgumentError( + "Element offset exceeds value buffer".to_string() + )); + } + + let element_offset = VariantParser::unpack_int( + &value_bytes[element_offset_start..element_offset_end], + array_header.element_offset_size + )?; + + // Get next element offset (or end of data) + let next_offset = if index + 1 < num_elements { + let next_element_offset_start = offsets.element_offsets_start + (index + 1) * array_header.element_offset_size; + let next_element_offset_end = next_element_offset_start + array_header.element_offset_size; + VariantParser::unpack_int( + &value_bytes[next_element_offset_start..next_element_offset_end], + array_header.element_offset_size + )? + } else { + value_bytes.len() + }; + + // Extract element bytes + let element_start = offsets.elements_start + element_offset; + let element_end = offsets.elements_start + next_offset; + + if element_end > value_bytes.len() { + return Err(ArrowError::InvalidArgumentError( + "Element data exceeds value buffer".to_string() + )); + } + + Ok(Some(value_bytes[element_start..element_end].to_vec())) + } + _ => Ok(None), // Not an array, can't extract elements } - - Ok(Some(value_bytes[element_start..element_end].to_vec())) } } diff --git a/parquet-variant-compute/src/lib.rs b/parquet-variant-compute/src/lib.rs index cc946b035af7..10c9add18308 100644 --- a/parquet-variant-compute/src/lib.rs +++ b/parquet-variant-compute/src/lib.rs @@ -27,5 +27,6 @@ pub mod to_json; pub use variant_array::VariantArray; pub use variant_array_builder::VariantArrayBuilder; pub use field_operations::{VariantPath, VariantPathElement}; +pub use variant_parser::{VariantType, PrimitiveType, ShortStringHeader, ObjectHeader, ArrayHeader}; pub use from_json::batch_json_string_to_variant; pub use to_json::batch_variant_to_json_string; diff --git a/parquet-variant-compute/src/variant_parser.rs b/parquet-variant-compute/src/variant_parser.rs index cc09301ea851..139e9e7c924c 100644 --- a/parquet-variant-compute/src/variant_parser.rs +++ b/parquet-variant-compute/src/variant_parser.rs @@ -19,8 +19,45 @@ use arrow::error::ArrowError; +/// Variant type enumeration covering all possible types +#[derive(Debug, Clone, PartialEq)] +pub enum VariantType { + Primitive(PrimitiveType), + ShortString(ShortStringHeader), + Object(ObjectHeader), + Array(ArrayHeader), +} + +/// Primitive type variants +#[derive(Debug, Clone, PartialEq)] +pub enum PrimitiveType { + Null, + True, + False, + Int8, + Int16, + Int32, + Int64, + Double, + Decimal4, + Decimal8, + Decimal16, + Date, + TimestampNtz, + TimestampLtz, + Float, + Binary, + String, +} + +/// Short string header structure +#[derive(Debug, Clone, PartialEq)] +pub struct ShortStringHeader { + pub length: usize, +} + /// Object header structure for variant objects -#[derive(Debug, Clone)] +#[derive(Debug, Clone, PartialEq)] pub struct ObjectHeader { pub num_elements_size: usize, pub field_id_size: usize, @@ -29,7 +66,7 @@ pub struct ObjectHeader { } /// Array header structure for variant objects -#[derive(Debug, Clone)] +#[derive(Debug, Clone, PartialEq)] pub struct ArrayHeader { pub num_elements_size: usize, pub element_offset_size: usize, @@ -55,6 +92,62 @@ pub struct ArrayOffsets { pub struct VariantParser; impl VariantParser { + /// General dispatch function to parse any variant header + pub fn parse_variant_header(header_byte: u8) -> Result { + let basic_type = header_byte & 0x03; + + match basic_type { + 0 => Ok(VariantType::Primitive(Self::parse_primitive_header(header_byte)?)), + 1 => Ok(VariantType::ShortString(Self::parse_short_string_header(header_byte)?)), + 2 => Ok(VariantType::Object(Self::parse_object_header(header_byte)?)), + 3 => Ok(VariantType::Array(Self::parse_array_header(header_byte)?)), + _ => Err(ArrowError::InvalidArgumentError( + format!("Invalid basic type: {}", basic_type) + )), + } + } + + /// Parse primitive type header + pub fn parse_primitive_header(header_byte: u8) -> Result { + let primitive_type = header_byte >> 2; + + match primitive_type { + 0 => Ok(PrimitiveType::Null), + 1 => Ok(PrimitiveType::True), + 2 => Ok(PrimitiveType::False), + 3 => Ok(PrimitiveType::Int8), + 4 => Ok(PrimitiveType::Int16), + 5 => Ok(PrimitiveType::Int32), + 6 => Ok(PrimitiveType::Int64), + 7 => Ok(PrimitiveType::Double), + 8 => Ok(PrimitiveType::Decimal4), + 9 => Ok(PrimitiveType::Decimal8), + 10 => Ok(PrimitiveType::Decimal16), + 11 => Ok(PrimitiveType::Date), + 12 => Ok(PrimitiveType::TimestampNtz), + 13 => Ok(PrimitiveType::TimestampLtz), + 14 => Ok(PrimitiveType::Float), + 15 => Ok(PrimitiveType::Binary), + 16 => Ok(PrimitiveType::String), + _ => Err(ArrowError::InvalidArgumentError( + format!("Invalid primitive type: {}", primitive_type) + )), + } + } + + /// Parse short string header + pub fn parse_short_string_header(header_byte: u8) -> Result { + let length = (header_byte >> 2) as usize; + + if length > 13 { + return Err(ArrowError::InvalidArgumentError( + format!("Short string length {} exceeds maximum of 13", length) + )); + } + + Ok(ShortStringHeader { length }) + } + /// Parse object header from header byte pub fn parse_object_header(header_byte: u8) -> Result { let value_header = header_byte >> 2; @@ -74,6 +167,22 @@ impl VariantParser { }) } + /// Parse array header from header byte + pub fn parse_array_header(header_byte: u8) -> Result { + let value_header = header_byte >> 2; + let element_offset_size_minus_one = value_header & 0x03; + let is_large = (value_header & 0x10) != 0; + + let num_elements_size = if is_large { 4 } else { 1 }; + let element_offset_size = (element_offset_size_minus_one + 1) as usize; + + Ok(ArrayHeader { + num_elements_size, + element_offset_size, + is_large, + }) + } + /// Unpack integer from bytes pub fn unpack_int(bytes: &[u8], size: usize) -> Result { if bytes.len() < size { @@ -126,20 +235,33 @@ impl VariantParser { } } - /// Check if value bytes represent an object - pub fn is_object(value_bytes: &[u8]) -> bool { + /// Get the basic type from header byte + pub fn get_basic_type(header_byte: u8) -> u8 { + header_byte & 0x03 + } + + /// Check if value bytes represent a primitive + pub fn is_primitive(value_bytes: &[u8]) -> bool { if value_bytes.is_empty() { return false; } - - let header_byte = value_bytes[0]; - let basic_type = header_byte & 0x03; // Basic type is in first 2 bits - basic_type == 2 // Object type + Self::get_basic_type(value_bytes[0]) == 0 } - /// Get the basic type from header byte - pub fn get_basic_type(header_byte: u8) -> u8 { - header_byte & 0x03 // Basic type is in first 2 bits + /// Check if value bytes represent a short string + pub fn is_short_string(value_bytes: &[u8]) -> bool { + if value_bytes.is_empty() { + return false; + } + Self::get_basic_type(value_bytes[0]) == 1 + } + + /// Check if value bytes represent an object + pub fn is_object(value_bytes: &[u8]) -> bool { + if value_bytes.is_empty() { + return false; + } + Self::get_basic_type(value_bytes[0]) == 2 } /// Check if value bytes represent an array @@ -147,26 +269,79 @@ impl VariantParser { if value_bytes.is_empty() { return false; } + Self::get_basic_type(value_bytes[0]) == 3 + } + + /// Get the data length for a primitive type + pub fn get_primitive_data_length(primitive_type: &PrimitiveType) -> usize { + match primitive_type { + PrimitiveType::Null | PrimitiveType::True | PrimitiveType::False => 0, + PrimitiveType::Int8 => 1, + PrimitiveType::Int16 => 2, + PrimitiveType::Int32 | PrimitiveType::Float | PrimitiveType::Decimal4 | PrimitiveType::Date => 4, + PrimitiveType::Int64 | PrimitiveType::Double | PrimitiveType::Decimal8 | PrimitiveType::TimestampNtz | PrimitiveType::TimestampLtz => 8, + PrimitiveType::Decimal16 => 16, + PrimitiveType::Binary | PrimitiveType::String => 0, // Variable length, need to read from data + } + } + + /// Extract short string data from value bytes + pub fn extract_short_string_data(value_bytes: &[u8]) -> Result<&[u8], ArrowError> { + if value_bytes.is_empty() { + return Err(ArrowError::InvalidArgumentError("Empty value bytes".to_string())); + } + + let header = Self::parse_short_string_header(value_bytes[0])?; - let header_byte = value_bytes[0]; - let basic_type = header_byte & 0x03; // Basic type is in first 2 bits - basic_type == 3 // Array type + if value_bytes.len() < 1 + header.length { + return Err(ArrowError::InvalidArgumentError( + format!("Short string data length {} exceeds available bytes", header.length) + )); + } + + Ok(&value_bytes[1..1 + header.length]) } - /// Parse array header from header byte - pub fn parse_array_header(header_byte: u8) -> Result { - let value_header = header_byte >> 2; - let element_offset_size_minus_one = value_header & 0x03; - let is_large = (value_header & 0x10) != 0; + /// Extract primitive data from value bytes + pub fn extract_primitive_data(value_bytes: &[u8]) -> Result<&[u8], ArrowError> { + if value_bytes.is_empty() { + return Err(ArrowError::InvalidArgumentError("Empty value bytes".to_string())); + } - let num_elements_size = if is_large { 4 } else { 1 }; - let element_offset_size = (element_offset_size_minus_one + 1) as usize; + let primitive_type = Self::parse_primitive_header(value_bytes[0])?; + let data_length = Self::get_primitive_data_length(&primitive_type); - Ok(ArrayHeader { - num_elements_size, - element_offset_size, - is_large, - }) + if data_length == 0 { + // Handle variable length types and null/boolean + match primitive_type { + PrimitiveType::Null | PrimitiveType::True | PrimitiveType::False => Ok(&[]), + PrimitiveType::Binary | PrimitiveType::String => { + // These require reading length from the data + if value_bytes.len() < 5 { + return Err(ArrowError::InvalidArgumentError( + "Not enough bytes for variable length primitive".to_string() + )); + } + let length = u32::from_le_bytes([value_bytes[1], value_bytes[2], value_bytes[3], value_bytes[4]]) as usize; + if value_bytes.len() < 5 + length { + return Err(ArrowError::InvalidArgumentError( + "Variable length primitive data exceeds available bytes".to_string() + )); + } + Ok(&value_bytes[5..5 + length]) + } + _ => Err(ArrowError::InvalidArgumentError( + format!("Unhandled primitive type: {:?}", primitive_type) + )), + } + } else { + if value_bytes.len() < 1 + data_length { + return Err(ArrowError::InvalidArgumentError( + format!("Primitive data length {} exceeds available bytes", data_length) + )); + } + Ok(&value_bytes[1..1 + data_length]) + } } /// Calculate byte offsets for array elements @@ -223,4 +398,138 @@ mod tests { VariantParser::write_int_bytes(&mut buffer, 256, 2); assert_eq!(buffer, vec![0, 1]); } + + #[test] + fn test_parse_primitive_header() { + // Test null (primitive type 0) + assert_eq!(VariantParser::parse_primitive_header(0b00000000).unwrap(), PrimitiveType::Null); + + // Test true (primitive type 1) + assert_eq!(VariantParser::parse_primitive_header(0b00000100).unwrap(), PrimitiveType::True); + + // Test false (primitive type 2) + assert_eq!(VariantParser::parse_primitive_header(0b00001000).unwrap(), PrimitiveType::False); + + // Test int32 (primitive type 5) + assert_eq!(VariantParser::parse_primitive_header(0b00010100).unwrap(), PrimitiveType::Int32); + + // Test double (primitive type 7) + assert_eq!(VariantParser::parse_primitive_header(0b00011100).unwrap(), PrimitiveType::Double); + } + + #[test] + fn test_parse_short_string_header() { + // Test 0-length short string + assert_eq!(VariantParser::parse_short_string_header(0b00000001).unwrap(), ShortStringHeader { length: 0 }); + + // Test 5-length short string + assert_eq!(VariantParser::parse_short_string_header(0b00010101).unwrap(), ShortStringHeader { length: 5 }); + + // Test 13-length short string (maximum) + assert_eq!(VariantParser::parse_short_string_header(0b00110101).unwrap(), ShortStringHeader { length: 13 }); + + // Test invalid length > 13 + assert!(VariantParser::parse_short_string_header(0b00111001).is_err()); + } + + #[test] + fn test_parse_variant_header_dispatch() { + // Test primitive dispatch + let primitive_header = 0b00000100; // True primitive + match VariantParser::parse_variant_header(primitive_header).unwrap() { + VariantType::Primitive(PrimitiveType::True) => {}, + _ => panic!("Expected primitive True"), + } + + // Test short string dispatch + let short_string_header = 0b00010101; // 5-length short string + match VariantParser::parse_variant_header(short_string_header).unwrap() { + VariantType::ShortString(ShortStringHeader { length: 5 }) => {}, + _ => panic!("Expected short string with length 5"), + } + + // Test object dispatch + let object_header = 0b00000010; // Basic object + match VariantParser::parse_variant_header(object_header).unwrap() { + VariantType::Object(_) => {}, + _ => panic!("Expected object"), + } + + // Test array dispatch + let array_header = 0b00000011; // Basic array + match VariantParser::parse_variant_header(array_header).unwrap() { + VariantType::Array(_) => {}, + _ => panic!("Expected array"), + } + } + + #[test] + fn test_basic_type_checks() { + // Test primitive type check + assert!(VariantParser::is_primitive(&[0b00000000])); // Null + assert!(VariantParser::is_primitive(&[0b00000100])); // True + assert!(!VariantParser::is_primitive(&[0b00000001])); // Not primitive + + // Test short string type check + assert!(VariantParser::is_short_string(&[0b00000001])); // 0-length short string + assert!(VariantParser::is_short_string(&[0b00010101])); // 5-length short string + assert!(!VariantParser::is_short_string(&[0b00000000])); // Not short string + + // Test object type check + assert!(VariantParser::is_object(&[0b00000010])); // Basic object + assert!(!VariantParser::is_object(&[0b00000001])); // Not object + + // Test array type check + assert!(VariantParser::is_array(&[0b00000011])); // Basic array + assert!(!VariantParser::is_array(&[0b00000010])); // Not array + } + + #[test] + fn test_get_primitive_data_length() { + assert_eq!(VariantParser::get_primitive_data_length(&PrimitiveType::Null), 0); + assert_eq!(VariantParser::get_primitive_data_length(&PrimitiveType::True), 0); + assert_eq!(VariantParser::get_primitive_data_length(&PrimitiveType::False), 0); + assert_eq!(VariantParser::get_primitive_data_length(&PrimitiveType::Int8), 1); + assert_eq!(VariantParser::get_primitive_data_length(&PrimitiveType::Int16), 2); + assert_eq!(VariantParser::get_primitive_data_length(&PrimitiveType::Int32), 4); + assert_eq!(VariantParser::get_primitive_data_length(&PrimitiveType::Int64), 8); + assert_eq!(VariantParser::get_primitive_data_length(&PrimitiveType::Double), 8); + assert_eq!(VariantParser::get_primitive_data_length(&PrimitiveType::Decimal16), 16); + assert_eq!(VariantParser::get_primitive_data_length(&PrimitiveType::Binary), 0); // Variable length + assert_eq!(VariantParser::get_primitive_data_length(&PrimitiveType::String), 0); // Variable length + } + + #[test] + fn test_extract_short_string_data() { + // Test 0-length short string + let data = &[0b00000001]; // 0-length short string header + assert_eq!(VariantParser::extract_short_string_data(data).unwrap(), &[] as &[u8]); + + // Test 5-length short string + let data = &[0b00010101, b'H', b'e', b'l', b'l', b'o']; // 5-length short string + "Hello" + assert_eq!(VariantParser::extract_short_string_data(data).unwrap(), b"Hello"); + + // Test insufficient data + let data = &[0b00010101, b'H', b'i']; // Claims 5 bytes but only has 2 + assert!(VariantParser::extract_short_string_data(data).is_err()); + } + + #[test] + fn test_extract_primitive_data() { + // Test null (no data) + let data = &[0b00000000]; // Null header + assert_eq!(VariantParser::extract_primitive_data(data).unwrap(), &[] as &[u8]); + + // Test true (no data) + let data = &[0b00000100]; // True header + assert_eq!(VariantParser::extract_primitive_data(data).unwrap(), &[] as &[u8]); + + // Test int32 (4 bytes) + let data = &[0b00010100, 0x2A, 0x00, 0x00, 0x00]; // Int32 header + 42 in little endian + assert_eq!(VariantParser::extract_primitive_data(data).unwrap(), &[0x2A, 0x00, 0x00, 0x00]); + + // Test insufficient data for int32 + let data = &[0b00010100, 0x2A, 0x00]; // Int32 header but only 2 bytes + assert!(VariantParser::extract_primitive_data(data).is_err()); + } } \ No newline at end of file From e16af07e9c523f21939a3a3a6328e2c16f7df1e1 Mon Sep 17 00:00:00 2001 From: carpecodeum Date: Wed, 16 Jul 2025 17:42:49 -0400 Subject: [PATCH 06/18] [FIX] minor fixes --- parquet-variant-compute/src/lib.rs | 2 +- parquet-variant-compute/src/variant_parser.rs | 40 ++++++++++++------- 2 files changed, 27 insertions(+), 15 deletions(-) diff --git a/parquet-variant-compute/src/lib.rs b/parquet-variant-compute/src/lib.rs index 10c9add18308..efe0163ef3b5 100644 --- a/parquet-variant-compute/src/lib.rs +++ b/parquet-variant-compute/src/lib.rs @@ -27,6 +27,6 @@ pub mod to_json; pub use variant_array::VariantArray; pub use variant_array_builder::VariantArrayBuilder; pub use field_operations::{VariantPath, VariantPathElement}; -pub use variant_parser::{VariantType, PrimitiveType, ShortStringHeader, ObjectHeader, ArrayHeader}; +pub use variant_parser::{VariantType, VariantBasicType, PrimitiveType, ShortStringHeader, ObjectHeader, ArrayHeader}; pub use from_json::batch_json_string_to_variant; pub use to_json::batch_variant_to_json_string; diff --git a/parquet-variant-compute/src/variant_parser.rs b/parquet-variant-compute/src/variant_parser.rs index 139e9e7c924c..b9e967154ea9 100644 --- a/parquet-variant-compute/src/variant_parser.rs +++ b/parquet-variant-compute/src/variant_parser.rs @@ -19,6 +19,15 @@ use arrow::error::ArrowError; +/// Basic variant type enumeration for the first 2 bits of header +#[derive(Debug, Clone, PartialEq)] +pub enum VariantBasicType { + Primitive = 0, + ShortString = 1, + Object = 2, + Array = 3, +} + /// Variant type enumeration covering all possible types #[derive(Debug, Clone, PartialEq)] pub enum VariantType { @@ -94,16 +103,13 @@ pub struct VariantParser; impl VariantParser { /// General dispatch function to parse any variant header pub fn parse_variant_header(header_byte: u8) -> Result { - let basic_type = header_byte & 0x03; + let basic_type = Self::get_basic_type(header_byte); match basic_type { - 0 => Ok(VariantType::Primitive(Self::parse_primitive_header(header_byte)?)), - 1 => Ok(VariantType::ShortString(Self::parse_short_string_header(header_byte)?)), - 2 => Ok(VariantType::Object(Self::parse_object_header(header_byte)?)), - 3 => Ok(VariantType::Array(Self::parse_array_header(header_byte)?)), - _ => Err(ArrowError::InvalidArgumentError( - format!("Invalid basic type: {}", basic_type) - )), + VariantBasicType::Primitive => Ok(VariantType::Primitive(Self::parse_primitive_header(header_byte)?)), + VariantBasicType::ShortString => Ok(VariantType::ShortString(Self::parse_short_string_header(header_byte)?)), + VariantBasicType::Object => Ok(VariantType::Object(Self::parse_object_header(header_byte)?)), + VariantBasicType::Array => Ok(VariantType::Array(Self::parse_array_header(header_byte)?)), } } @@ -236,8 +242,14 @@ impl VariantParser { } /// Get the basic type from header byte - pub fn get_basic_type(header_byte: u8) -> u8 { - header_byte & 0x03 + pub fn get_basic_type(header_byte: u8) -> VariantBasicType { + match header_byte & 0x03 { + 0 => VariantBasicType::Primitive, + 1 => VariantBasicType::ShortString, + 2 => VariantBasicType::Object, + 3 => VariantBasicType::Array, + _ => panic!("Invalid basic type: {}", header_byte & 0x03), + } } /// Check if value bytes represent a primitive @@ -245,7 +257,7 @@ impl VariantParser { if value_bytes.is_empty() { return false; } - Self::get_basic_type(value_bytes[0]) == 0 + Self::get_basic_type(value_bytes[0]) == VariantBasicType::Primitive } /// Check if value bytes represent a short string @@ -253,7 +265,7 @@ impl VariantParser { if value_bytes.is_empty() { return false; } - Self::get_basic_type(value_bytes[0]) == 1 + Self::get_basic_type(value_bytes[0]) == VariantBasicType::ShortString } /// Check if value bytes represent an object @@ -261,7 +273,7 @@ impl VariantParser { if value_bytes.is_empty() { return false; } - Self::get_basic_type(value_bytes[0]) == 2 + Self::get_basic_type(value_bytes[0]) == VariantBasicType::Object } /// Check if value bytes represent an array @@ -269,7 +281,7 @@ impl VariantParser { if value_bytes.is_empty() { return false; } - Self::get_basic_type(value_bytes[0]) == 3 + Self::get_basic_type(value_bytes[0]) == VariantBasicType::Array } /// Get the data length for a primitive type From 3da46b8e078acc0dfa5a61aef610c820eec31401 Mon Sep 17 00:00:00 2001 From: carpecodeum Date: Wed, 16 Jul 2025 18:00:29 -0400 Subject: [PATCH 07/18] [FIX] fix formatting issues --- .../examples/field_removal.rs | 86 +++-- .../examples/path_access.rs | 47 +-- .../src/field_operations.rs | 215 ++++++----- parquet-variant-compute/src/from_json.rs | 6 +- parquet-variant-compute/src/lib.rs | 14 +- parquet-variant-compute/src/variant_array.rs | 121 ++++--- .../src/variant_array_builder.rs | 2 +- parquet-variant-compute/src/variant_parser.rs | 340 ++++++++++++------ 8 files changed, 526 insertions(+), 305 deletions(-) diff --git a/parquet-variant-compute/examples/field_removal.rs b/parquet-variant-compute/examples/field_removal.rs index ed1e8feb3038..b73ebc00d7db 100644 --- a/parquet-variant-compute/examples/field_removal.rs +++ b/parquet-variant-compute/examples/field_removal.rs @@ -1,11 +1,11 @@ use arrow::array::Array; -use parquet_variant_compute::VariantArrayBuilder; use parquet_variant::VariantBuilder; +use parquet_variant_compute::VariantArrayBuilder; fn main() { // Create some sample data with fields to remove let mut builder = VariantArrayBuilder::new(2); - + // Row 1: User with temporary data { let mut variant_builder = VariantBuilder::new(); @@ -15,7 +15,7 @@ fn main() { obj.insert("age", 30i32); obj.insert("temp_session", "abc123"); obj.insert("debug_info", "temporary debug data"); - + { let mut address = obj.new_object("address"); address.insert("city", "New York"); @@ -23,13 +23,13 @@ fn main() { address.insert("temp_geocode", "40.7128,-74.0060"); let _ = address.finish(); } - + let _ = obj.finish(); } let (metadata, value) = variant_builder.finish(); builder.append_variant_buffers(&metadata, &value); } - + // Row 2: Another user with temporary data { let mut variant_builder = VariantBuilder::new(); @@ -39,7 +39,7 @@ fn main() { obj.insert("age", 25i32); obj.insert("temp_session", "def456"); obj.insert("debug_info", "more temporary data"); - + { let mut address = obj.new_object("address"); address.insert("city", "San Francisco"); @@ -47,47 +47,61 @@ fn main() { address.insert("temp_geocode", "37.7749,-122.4194"); let _ = address.finish(); } - + let _ = obj.finish(); } let (metadata, value) = variant_builder.finish(); builder.append_variant_buffers(&metadata, &value); } - + let array = builder.finish(); - + println!("=== Field Removal Examples ==="); - + // Show original data println!("Original data:"); for i in 0..array.len() { let variant = array.value(i); if let Some(obj) = variant.as_object() { let name = obj.get("name").unwrap().as_string().unwrap().to_string(); - let session = obj.get("temp_session").map(|v| v.as_string().unwrap().to_string()).unwrap_or("None".to_string()); - let debug = obj.get("debug_info").map(|v| v.as_string().unwrap().to_string()).unwrap_or("None".to_string()); + let session = obj + .get("temp_session") + .map(|v| v.as_string().unwrap().to_string()) + .unwrap_or("None".to_string()); + let debug = obj + .get("debug_info") + .map(|v| v.as_string().unwrap().to_string()) + .unwrap_or("None".to_string()); println!(" {}: session={}, debug={}", name, session, debug); } } - + // Remove temporary session field let cleaned_array = array.with_field_removed("temp_session").unwrap(); - + println!("\nRemoving temporary session fields..."); println!("After removing temp_session:"); for i in 0..cleaned_array.len() { let variant = cleaned_array.value(i); if let Some(obj) = variant.as_object() { let name = obj.get("name").unwrap().as_string().unwrap().to_string(); - let session = obj.get("temp_session").map(|v| v.as_string().unwrap().to_string()).unwrap_or("None".to_string()); - let debug = obj.get("debug_info").map(|v| v.as_string().unwrap().to_string()).unwrap_or("None".to_string()); + let session = obj + .get("temp_session") + .map(|v| v.as_string().unwrap().to_string()) + .unwrap_or("None".to_string()); + let debug = obj + .get("debug_info") + .map(|v| v.as_string().unwrap().to_string()) + .unwrap_or("None".to_string()); println!(" {}: session={}, debug={}", name, session, debug); } } - + // Remove multiple temporary fields - let final_array = cleaned_array.with_fields_removed(&["debug_info", "temp_session"]).unwrap(); - + let final_array = cleaned_array + .with_fields_removed(&["debug_info", "temp_session"]) + .unwrap(); + println!("\nRemoving multiple temporary fields..."); println!("Final clean data:"); for i in 0..final_array.len() { @@ -95,16 +109,36 @@ fn main() { if let Some(obj) = variant.as_object() { let name = obj.get("name").unwrap().as_string().unwrap().to_string(); let age = obj.get("age").unwrap().as_int32().unwrap(); - + if let Some(address) = obj.get("address") { if let Some(addr_obj) = address.as_object() { - let city = addr_obj.get("city").unwrap().as_string().unwrap().to_string(); - let zip = addr_obj.get("zip").unwrap().as_string().unwrap().to_string(); - let geocode = addr_obj.get("temp_geocode").map(|v| format!("Some(ShortString(ShortString(\"{}\")))", v.as_string().unwrap())).unwrap_or("None".to_string()); - println!(" {}: age={}, city={}, zip={}, geocode={}", name, age, city, zip, geocode); + let city = addr_obj + .get("city") + .unwrap() + .as_string() + .unwrap() + .to_string(); + let zip = addr_obj + .get("zip") + .unwrap() + .as_string() + .unwrap() + .to_string(); + let geocode = addr_obj + .get("temp_geocode") + .map(|v| { + format!( + "Some(ShortString(ShortString(\"{}\")))", + v.as_string().unwrap() + ) + }) + .unwrap_or("None".to_string()); + println!( + " {}: age={}, city={}, zip={}, geocode={}", + name, age, city, zip, geocode + ); } } } } - -} \ No newline at end of file +} diff --git a/parquet-variant-compute/examples/path_access.rs b/parquet-variant-compute/examples/path_access.rs index 25311699cb95..5f8755a442a1 100644 --- a/parquet-variant-compute/examples/path_access.rs +++ b/parquet-variant-compute/examples/path_access.rs @@ -1,10 +1,10 @@ -use parquet_variant_compute::{VariantArrayBuilder, VariantPath}; use parquet_variant::VariantBuilder; +use parquet_variant_compute::{VariantArrayBuilder, VariantPath}; fn main() { // Create some sample data let mut builder = VariantArrayBuilder::new(2); - + // Row 1: User Alice { let mut variant_builder = VariantBuilder::new(); @@ -12,14 +12,14 @@ fn main() { let mut obj = variant_builder.new_object(); obj.insert("name", "Alice"); obj.insert("age", 30i32); - + { let mut address = obj.new_object("address"); address.insert("city", "New York"); address.insert("zip", "10001"); let _ = address.finish(); } - + { let mut hobbies = obj.new_list("hobbies"); hobbies.append_value("reading"); @@ -27,13 +27,13 @@ fn main() { hobbies.append_value("cooking"); hobbies.finish(); } - + obj.finish().unwrap(); } let (metadata, value) = variant_builder.finish(); builder.append_variant_buffers(&metadata, &value); } - + // Row 2: User Bob { let mut variant_builder = VariantBuilder::new(); @@ -41,51 +41,57 @@ fn main() { let mut obj = variant_builder.new_object(); obj.insert("name", "Bob"); obj.insert("age", 25i32); - + { let mut address = obj.new_object("address"); address.insert("city", "San Francisco"); address.insert("zip", "94102"); let _ = address.finish(); } - + { let mut hobbies = obj.new_list("hobbies"); hobbies.append_value("swimming"); hobbies.append_value("gaming"); hobbies.finish(); } - + obj.finish().unwrap(); } let (metadata, value) = variant_builder.finish(); builder.append_variant_buffers(&metadata, &value); } - + let variant_array = builder.finish(); - + // Demonstrate path access functionality println!("=== Path Access Examples ==="); - + // 1. Single field access let name_path = VariantPath::field("name"); let alice_name = variant_array.get_path(0, &name_path).unwrap(); println!("Alice's name: {}", alice_name.as_string().unwrap()); - + // 2. Nested field access let city_path = VariantPath::field("address").push_field("city"); let alice_city = variant_array.get_path(0, &city_path).unwrap(); let bob_city = variant_array.get_path(1, &city_path).unwrap(); println!("Alice's city: {}", alice_city.as_string().unwrap()); println!("Bob's city: {}", bob_city.as_string().unwrap()); - + // 3. Array index access let hobby_path = VariantPath::field("hobbies").push_index(0); let alice_first_hobby = variant_array.get_path(0, &hobby_path).unwrap(); let bob_first_hobby = variant_array.get_path(1, &hobby_path).unwrap(); - println!("Alice's first hobby: {}", alice_first_hobby.as_string().unwrap()); - println!("Bob's first hobby: {}", bob_first_hobby.as_string().unwrap()); - + println!( + "Alice's first hobby: {}", + alice_first_hobby.as_string().unwrap() + ); + println!( + "Bob's first hobby: {}", + bob_first_hobby.as_string().unwrap() + ); + // 4. Multiple field extraction let paths = vec![ VariantPath::field("name"), @@ -106,11 +112,12 @@ fn main() { } } println!(); - + // 5. Batch field extraction let all_names = variant_array.extract_field_by_path(&VariantPath::field("name")); - let name_strings: Vec = all_names.iter() + let name_strings: Vec = all_names + .iter() .filter_map(|opt| opt.as_ref().map(|v| v.as_string().unwrap().to_string())) .collect(); println!("All names: {:?}", name_strings); -} \ No newline at end of file +} diff --git a/parquet-variant-compute/src/field_operations.rs b/parquet-variant-compute/src/field_operations.rs index 526293a4f7c2..674e5e1cd83d 100644 --- a/parquet-variant-compute/src/field_operations.rs +++ b/parquet-variant-compute/src/field_operations.rs @@ -17,7 +17,7 @@ //! Field extraction and removal operations for variant objects -use crate::variant_parser::{VariantParser, ObjectHeader, ObjectOffsets}; +use crate::variant_parser::{ObjectHeader, ObjectOffsets, VariantParser}; use arrow::error::ArrowError; use parquet_variant::VariantMetadata; use std::collections::HashSet; @@ -45,7 +45,8 @@ impl VariantPath { /// Add a field to the path pub fn push_field(mut self, name: &str) -> Self { - self.elements.push(VariantPathElement::Field(name.to_string())); + self.elements + .push(VariantPathElement::Field(name.to_string())); self } @@ -74,32 +75,39 @@ impl FieldOperations { if !VariantParser::is_object(value_bytes) { return Ok(None); } - + let header_byte = value_bytes[0]; let header = VariantParser::parse_object_header(header_byte)?; let num_elements = VariantParser::unpack_int(&value_bytes[1..], header.num_elements_size)?; let offsets = VariantParser::calculate_object_offsets(&header, num_elements); - + // Find field ID for the target field name let target_field_id = Self::find_field_id(metadata_bytes, field_name)?; let target_field_id = match target_field_id { Some(id) => id, None => return Ok(None), // Field not found }; - + // Search for the field in the object for i in 0..num_elements { let field_id_offset = offsets.field_ids_start + (i * header.field_id_size); - let field_id = VariantParser::unpack_int(&value_bytes[field_id_offset..], header.field_id_size)?; - + let field_id = + VariantParser::unpack_int(&value_bytes[field_id_offset..], header.field_id_size)?; + if field_id == target_field_id { - return Self::extract_field_value_at_index(value_bytes, &header, &offsets, i, num_elements); + return Self::extract_field_value_at_index( + value_bytes, + &header, + &offsets, + i, + num_elements, + ); } } - + Ok(None) } - + /// Remove field from a single variant object pub fn remove_field_bytes( metadata_bytes: &[u8], @@ -108,7 +116,7 @@ impl FieldOperations { ) -> Result>, ArrowError> { Self::remove_fields_bytes(metadata_bytes, value_bytes, &[field_name]) } - + /// Remove multiple fields from a single variant object pub fn remove_fields_bytes( metadata_bytes: &[u8], @@ -118,19 +126,19 @@ impl FieldOperations { if !VariantParser::is_object(value_bytes) { return Ok(Some(value_bytes.to_vec())); } - + let header_byte = value_bytes[0]; let header = VariantParser::parse_object_header(header_byte)?; let num_elements = VariantParser::unpack_int(&value_bytes[1..], header.num_elements_size)?; let offsets = VariantParser::calculate_object_offsets(&header, num_elements); - + // Find field IDs for target field names let target_field_ids = Self::find_field_ids(metadata_bytes, field_names)?; - + if target_field_ids.is_empty() { return Ok(Some(value_bytes.to_vec())); // No fields to remove } - + // Collect fields to keep let fields_to_keep = Self::collect_fields_to_keep( value_bytes, @@ -139,22 +147,22 @@ impl FieldOperations { num_elements, &target_field_ids, )?; - + if fields_to_keep.len() == num_elements { return Ok(Some(value_bytes.to_vec())); // No fields were removed } - + // Sort fields by name for proper variant object ordering let sorted_fields = Self::sort_fields_by_name(metadata_bytes, fields_to_keep)?; - + // Reconstruct object with remaining fields Self::reconstruct_object(sorted_fields) } - + /// Find field ID for a given field name fn find_field_id(metadata_bytes: &[u8], field_name: &str) -> Result, ArrowError> { let metadata = VariantMetadata::try_new(metadata_bytes)?; - + for dict_idx in 0..metadata.len() { if let Ok(name) = metadata.get(dict_idx) { if name == field_name { @@ -162,15 +170,18 @@ impl FieldOperations { } } } - + Ok(None) } - + /// Find field IDs for multiple field names - fn find_field_ids(metadata_bytes: &[u8], field_names: &[&str]) -> Result, ArrowError> { + fn find_field_ids( + metadata_bytes: &[u8], + field_names: &[&str], + ) -> Result, ArrowError> { let metadata = VariantMetadata::try_new(metadata_bytes)?; let mut target_field_ids = HashSet::new(); - + for field_name in field_names { for dict_idx in 0..metadata.len() { if let Ok(name) = metadata.get(dict_idx) { @@ -181,10 +192,10 @@ impl FieldOperations { } } } - + Ok(target_field_ids) } - + /// Extract field value at a specific index fn extract_field_value_at_index( value_bytes: &[u8], @@ -197,17 +208,18 @@ impl FieldOperations { let mut field_offsets = Vec::new(); for i in 0..=num_elements { let offset_idx = offsets.field_offsets_start + (i * header.field_offset_size); - let offset_val = VariantParser::unpack_int(&value_bytes[offset_idx..], header.field_offset_size)?; + let offset_val = + VariantParser::unpack_int(&value_bytes[offset_idx..], header.field_offset_size)?; field_offsets.push(offset_val); } - + let field_start = field_offsets[field_index]; - + // To find the end offset, we need to find the next field in byte order // Since fields are stored in alphabetical order, we can't just use field_index + 1 // We need to find the smallest offset that's greater than field_start let mut field_end = field_offsets[num_elements]; // Default to final offset - + for i in 0..num_elements { if i != field_index { let other_offset = field_offsets[i]; @@ -216,10 +228,10 @@ impl FieldOperations { } } } - + let field_start_absolute = offsets.values_start + field_start; let field_end_absolute = offsets.values_start + field_end; - + if field_start_absolute <= field_end_absolute && field_end_absolute <= value_bytes.len() { let field_value_bytes = &value_bytes[field_start_absolute..field_end_absolute]; Ok(Some(field_value_bytes.to_vec())) @@ -227,7 +239,7 @@ impl FieldOperations { Ok(None) } } - + /// Collect fields to keep (those not being removed) fn collect_fields_to_keep( value_bytes: &[u8], @@ -237,82 +249,97 @@ impl FieldOperations { target_field_ids: &HashSet, ) -> Result)>, ArrowError> { let mut fields_to_keep = Vec::new(); - + for i in 0..num_elements { let field_id_offset = offsets.field_ids_start + (i * header.field_id_size); - let field_id = VariantParser::unpack_int(&value_bytes[field_id_offset..], header.field_id_size)?; - + let field_id = + VariantParser::unpack_int(&value_bytes[field_id_offset..], header.field_id_size)?; + if !target_field_ids.contains(&field_id) { - if let Some(field_value) = Self::extract_field_value_at_index(value_bytes, header, offsets, i, num_elements)? { + if let Some(field_value) = Self::extract_field_value_at_index( + value_bytes, + header, + offsets, + i, + num_elements, + )? { fields_to_keep.push((field_id, field_value)); } } } - + Ok(fields_to_keep) } - + /// Sort fields by their names (variant objects must be sorted alphabetically) fn sort_fields_by_name( metadata_bytes: &[u8], mut fields: Vec<(usize, Vec)>, ) -> Result)>, ArrowError> { let metadata = VariantMetadata::try_new(metadata_bytes)?; - + fields.sort_by(|a, b| { let name_a = metadata.get(a.0).unwrap_or(""); let name_b = metadata.get(b.0).unwrap_or(""); name_a.cmp(name_b) }); - + Ok(fields) } - + /// Reconstruct variant object from sorted fields fn reconstruct_object(fields: Vec<(usize, Vec)>) -> Result>, ArrowError> { let new_num_elements = fields.len(); let new_is_large = new_num_elements > 255; - + // Calculate sizes for new object let max_field_id = fields.iter().map(|(id, _)| *id).max().unwrap_or(0); let new_field_id_size = VariantParser::calculate_int_size(max_field_id); - + let total_values_size: usize = fields.iter().map(|(_, value)| value.len()).sum(); let new_field_offset_size = VariantParser::calculate_int_size(total_values_size); - + // Build new object let mut new_value_bytes = Vec::new(); - + // Write header - let new_header = VariantParser::build_object_header(new_is_large, new_field_id_size, new_field_offset_size); + let new_header = VariantParser::build_object_header( + new_is_large, + new_field_id_size, + new_field_offset_size, + ); new_value_bytes.push(new_header); - + // Write num_elements if new_is_large { new_value_bytes.extend_from_slice(&(new_num_elements as u32).to_le_bytes()); } else { new_value_bytes.push(new_num_elements as u8); } - + // Write field IDs for (field_id, _) in &fields { VariantParser::write_int_bytes(&mut new_value_bytes, *field_id, new_field_id_size); } - + // Write field offsets let mut current_offset = 0; for (_, field_value) in &fields { - VariantParser::write_int_bytes(&mut new_value_bytes, current_offset, new_field_offset_size); + VariantParser::write_int_bytes( + &mut new_value_bytes, + current_offset, + new_field_offset_size, + ); current_offset += field_value.len(); } // Write final offset VariantParser::write_int_bytes(&mut new_value_bytes, current_offset, new_field_offset_size); - + // Write field values for (_, field_value) in &fields { new_value_bytes.extend_from_slice(field_value); } - + Ok(Some(new_value_bytes)) } @@ -323,18 +350,22 @@ impl FieldOperations { path: &VariantPath, ) -> Result>, ArrowError> { let mut current_value = value_bytes.to_vec(); - + for element in path.elements() { match element { VariantPathElement::Field(field_name) => { - if let Some(field_bytes) = Self::get_field_bytes(metadata_bytes, ¤t_value, field_name)? { + if let Some(field_bytes) = + Self::get_field_bytes(metadata_bytes, ¤t_value, field_name)? + { current_value = field_bytes; } else { return Ok(None); } } VariantPathElement::Index(idx) => { - if let Some(element_bytes) = Self::get_array_element_bytes(metadata_bytes, ¤t_value, *idx)? { + if let Some(element_bytes) = + Self::get_array_element_bytes(metadata_bytes, ¤t_value, *idx)? + { current_value = element_bytes; } else { return Ok(None); @@ -342,10 +373,10 @@ impl FieldOperations { } } } - + Ok(Some(current_value)) } - + /// Get the value at a specific path and return its type and data pub fn get_path_with_type( metadata_bytes: &[u8], @@ -360,7 +391,7 @@ impl FieldOperations { } Ok(None) } - + /// Get field bytes from an object at the byte level fn get_field_bytes( metadata_bytes: &[u8], @@ -379,7 +410,7 @@ impl FieldOperations { Ok(None) } } - + /// Get array element bytes at the byte level fn get_array_element_bytes( _metadata_bytes: &[u8], @@ -390,59 +421,60 @@ impl FieldOperations { if value_bytes.is_empty() { return Ok(None); } - + match VariantParser::parse_variant_header(value_bytes[0])? { crate::variant_parser::VariantType::Array(array_header) => { - let num_elements = VariantParser::unpack_int( - &value_bytes[1..], - array_header.num_elements_size - )?; - + let num_elements = + VariantParser::unpack_int(&value_bytes[1..], array_header.num_elements_size)?; + // Check bounds if index >= num_elements { return Ok(None); } - + // Calculate array offsets let offsets = VariantParser::calculate_array_offsets(&array_header, num_elements); - + // Get element offset - let element_offset_start = offsets.element_offsets_start + index * array_header.element_offset_size; + let element_offset_start = + offsets.element_offsets_start + index * array_header.element_offset_size; let element_offset_end = element_offset_start + array_header.element_offset_size; - + if element_offset_end > value_bytes.len() { return Err(ArrowError::InvalidArgumentError( - "Element offset exceeds value buffer".to_string() + "Element offset exceeds value buffer".to_string(), )); } - + let element_offset = VariantParser::unpack_int( &value_bytes[element_offset_start..element_offset_end], - array_header.element_offset_size + array_header.element_offset_size, )?; - + // Get next element offset (or end of data) let next_offset = if index + 1 < num_elements { - let next_element_offset_start = offsets.element_offsets_start + (index + 1) * array_header.element_offset_size; - let next_element_offset_end = next_element_offset_start + array_header.element_offset_size; + let next_element_offset_start = offsets.element_offsets_start + + (index + 1) * array_header.element_offset_size; + let next_element_offset_end = + next_element_offset_start + array_header.element_offset_size; VariantParser::unpack_int( &value_bytes[next_element_offset_start..next_element_offset_end], - array_header.element_offset_size + array_header.element_offset_size, )? } else { value_bytes.len() }; - + // Extract element bytes let element_start = offsets.elements_start + element_offset; let element_end = offsets.elements_start + next_offset; - + if element_end > value_bytes.len() { return Err(ArrowError::InvalidArgumentError( - "Element data exceeds value buffer".to_string() + "Element data exceeds value buffer".to_string(), )); } - + Ok(Some(value_bytes[element_start..element_end].to_vec())) } _ => Ok(None), // Not an array, can't extract elements @@ -470,28 +502,31 @@ mod tests { #[test] fn test_extract_field_bytes() { let (metadata, value) = create_test_object(); - + let name_bytes = FieldOperations::extract_field_bytes(&metadata, &value, "name").unwrap(); assert!(name_bytes.is_some()); - - let nonexistent_bytes = FieldOperations::extract_field_bytes(&metadata, &value, "nonexistent").unwrap(); + + let nonexistent_bytes = + FieldOperations::extract_field_bytes(&metadata, &value, "nonexistent").unwrap(); assert!(nonexistent_bytes.is_none()); } #[test] fn test_remove_field_bytes() { let (metadata, value) = create_test_object(); - + let result = FieldOperations::remove_field_bytes(&metadata, &value, "city").unwrap(); assert!(result.is_some()); - + // Verify the field was removed by checking we can't extract it let new_value = result.unwrap(); - let city_bytes = FieldOperations::extract_field_bytes(&metadata, &new_value, "city").unwrap(); + let city_bytes = + FieldOperations::extract_field_bytes(&metadata, &new_value, "city").unwrap(); assert!(city_bytes.is_none()); - + // Verify other fields are still there - let name_bytes = FieldOperations::extract_field_bytes(&metadata, &new_value, "name").unwrap(); + let name_bytes = + FieldOperations::extract_field_bytes(&metadata, &new_value, "name").unwrap(); assert!(name_bytes.is_some()); } -} \ No newline at end of file +} diff --git a/parquet-variant-compute/src/from_json.rs b/parquet-variant-compute/src/from_json.rs index b48487d8dd5b..1de8e62bc41e 100644 --- a/parquet-variant-compute/src/from_json.rs +++ b/parquet-variant-compute/src/from_json.rs @@ -69,10 +69,10 @@ mod test { ]); let array_ref: ArrayRef = Arc::new(input); let variant_array = batch_json_string_to_variant(&array_ref).unwrap(); - + let metadata_array = variant_array.metadata_field(); let value_array = variant_array.value_field(); - + // Compare row 0 assert!(!variant_array.is_null(0)); assert_eq!(variant_array.value(0).as_int8(), Some(1)); @@ -101,7 +101,7 @@ mod test { assert!(!value_array.is_null(1)); assert!(!metadata_array.is_null(4)); assert!(!value_array.is_null(4)); - + // Null rows should have 0-length metadata and value assert_eq!(metadata_array.as_binary_view().value(1).len(), 0); assert_eq!(value_array.as_binary_view().value(1).len(), 0); diff --git a/parquet-variant-compute/src/lib.rs b/parquet-variant-compute/src/lib.rs index efe0163ef3b5..e3d77bf7ea0b 100644 --- a/parquet-variant-compute/src/lib.rs +++ b/parquet-variant-compute/src/lib.rs @@ -17,16 +17,18 @@ //! Parquet variant compute functions -pub mod from_json; -pub mod variant_parser; pub mod field_operations; +pub mod from_json; +pub mod to_json; pub mod variant_array; pub mod variant_array_builder; -pub mod to_json; +pub mod variant_parser; -pub use variant_array::VariantArray; -pub use variant_array_builder::VariantArrayBuilder; pub use field_operations::{VariantPath, VariantPathElement}; -pub use variant_parser::{VariantType, VariantBasicType, PrimitiveType, ShortStringHeader, ObjectHeader, ArrayHeader}; pub use from_json::batch_json_string_to_variant; pub use to_json::batch_variant_to_json_string; +pub use variant_array::VariantArray; +pub use variant_array_builder::VariantArrayBuilder; +pub use variant_parser::{ + ArrayHeader, ObjectHeader, PrimitiveType, ShortStringHeader, VariantBasicType, VariantType, +}; diff --git a/parquet-variant-compute/src/variant_array.rs b/parquet-variant-compute/src/variant_array.rs index cf05de0555ce..b90298f5f6a1 100644 --- a/parquet-variant-compute/src/variant_array.rs +++ b/parquet-variant-compute/src/variant_array.rs @@ -232,23 +232,27 @@ impl VariantArray { pub fn metadata(&self, index: usize) -> &[u8] { self.metadata_field().as_binary_view().value(index).as_ref() } - + /// Get the value bytes for a specific index pub fn value_bytes(&self, index: usize) -> &[u8] { self.value_field().as_binary_view().value(index).as_ref() } - + /// Get value at a specific path for the variant at the given index - /// + /// /// Uses high-level Variant API for convenience. Returns a Variant object that can be /// directly used with standard variant operations. - pub fn get_path(&self, index: usize, path: &crate::field_operations::VariantPath) -> Option { + pub fn get_path( + &self, + index: usize, + path: &crate::field_operations::VariantPath, + ) -> Option { if index >= self.len() || self.is_null(index) { return None; } - + let mut current_variant = self.value(index); - + for element in path.elements() { match element { crate::field_operations::VariantPathElement::Field(field_name) => { @@ -259,32 +263,36 @@ impl VariantArray { } } } - + Some(current_variant) } - + /// Get values at multiple paths for the variant at the given index - /// + /// /// Convenience method that applies `get_path()` to multiple paths at once. /// Useful for extracting multiple fields from a single variant row. - pub fn get_paths(&self, index: usize, paths: &[crate::field_operations::VariantPath]) -> Vec> { + pub fn get_paths( + &self, + index: usize, + paths: &[crate::field_operations::VariantPath], + ) -> Vec> { let mut results = Vec::new(); for path in paths { results.push(self.get_path(index, path)); } results } - + /// Get the field names for an object at the given index pub fn get_field_names(&self, index: usize) -> Vec { if index >= self.len() { return vec![]; } - + if self.is_null(index) { return vec![]; } - + let variant = self.value(index); if let Some(obj) = variant.as_object() { let mut paths = Vec::new(); @@ -298,12 +306,15 @@ impl VariantArray { vec![] } } - + /// Extract field values by path from all variants in the array - /// + /// /// Applies `get_path()` to a single path across all rows in the array. /// Useful for extracting a column of values from nested variant data. - pub fn extract_field_by_path(&self, path: &crate::field_operations::VariantPath) -> Vec> { + pub fn extract_field_by_path( + &self, + path: &crate::field_operations::VariantPath, + ) -> Vec> { let mut results = Vec::new(); for i in 0..self.len() { results.push(self.get_path(i, path)); @@ -326,12 +337,16 @@ impl VariantArray { /// Create a new VariantArray with a field removed from all variants pub fn with_field_removed(&self, field_name: &str) -> Result { let mut builder = crate::variant_array_builder::VariantArrayBuilder::new(self.len()); - + for i in 0..self.len() { if self.is_null(i) { builder.append_null(); } else { - match FieldOperations::remove_field_bytes(self.metadata(i), self.value_bytes(i), field_name)? { + match FieldOperations::remove_field_bytes( + self.metadata(i), + self.value_bytes(i), + field_name, + )? { Some(new_value) => { builder.append_variant_buffers(self.metadata(i), &new_value); } @@ -342,19 +357,23 @@ impl VariantArray { } } } - + Ok(builder.build()) } - + /// Create a new VariantArray with multiple fields removed from all variants pub fn with_fields_removed(&self, field_names: &[&str]) -> Result { let mut builder = crate::variant_array_builder::VariantArrayBuilder::new(self.len()); - + for i in 0..self.len() { if self.is_null(i) { builder.append_null(); } else { - match FieldOperations::remove_fields_bytes(self.metadata(i), self.value_bytes(i), field_names)? { + match FieldOperations::remove_fields_bytes( + self.metadata(i), + self.value_bytes(i), + field_names, + )? { Some(new_value) => { builder.append_variant_buffers(self.metadata(i), &new_value); } @@ -365,7 +384,7 @@ impl VariantArray { } } } - + Ok(builder.build()) } } @@ -504,7 +523,7 @@ mod test { fn create_test_variant_array() -> VariantArray { let mut builder = VariantArrayBuilder::new(2); - + // Create variant 1: {"name": "Alice", "age": 30} let mut builder1 = VariantBuilder::new(); { @@ -515,7 +534,7 @@ mod test { } let (metadata1, value1) = builder1.finish(); builder.append_variant_buffers(&metadata1, &value1); - + // Create variant 2: {"name": "Bob", "age": 25, "city": "NYC"} let mut builder2 = VariantBuilder::new(); { @@ -527,7 +546,7 @@ mod test { } let (metadata2, value2) = builder2.finish(); builder.append_variant_buffers(&metadata2, &value2); - + builder.build() } @@ -536,27 +555,42 @@ mod test { let array = create_test_variant_array(); assert_eq!(array.len(), 2); assert!(!array.is_empty()); - + // Test accessing variants let variant1 = array.value(0); - assert_eq!(variant1.get_object_field("name").unwrap().as_string(), Some("Alice")); - assert_eq!(variant1.get_object_field("age").unwrap().as_int32(), Some(30)); - + assert_eq!( + variant1.get_object_field("name").unwrap().as_string(), + Some("Alice") + ); + assert_eq!( + variant1.get_object_field("age").unwrap().as_int32(), + Some(30) + ); + let variant2 = array.value(1); - assert_eq!(variant2.get_object_field("name").unwrap().as_string(), Some("Bob")); - assert_eq!(variant2.get_object_field("age").unwrap().as_int32(), Some(25)); - assert_eq!(variant2.get_object_field("city").unwrap().as_string(), Some("NYC")); + assert_eq!( + variant2.get_object_field("name").unwrap().as_string(), + Some("Bob") + ); + assert_eq!( + variant2.get_object_field("age").unwrap().as_int32(), + Some(25) + ); + assert_eq!( + variant2.get_object_field("city").unwrap().as_string(), + Some("NYC") + ); } #[test] fn test_get_field_names() { let array = create_test_variant_array(); - + let paths1 = array.get_field_names(0); assert_eq!(paths1.len(), 2); assert!(paths1.contains(&"name".to_string())); assert!(paths1.contains(&"age".to_string())); - + let paths2 = array.get_field_names(1); assert_eq!(paths2.len(), 3); assert!(paths2.contains(&"name".to_string())); @@ -567,12 +601,12 @@ mod test { #[test] fn test_get_path() { let array = create_test_variant_array(); - + // Test field access let name_path = crate::field_operations::VariantPath::field("name"); let alice_name = array.get_path(0, &name_path).unwrap(); assert_eq!(alice_name.as_string(), Some("Alice")); - + // Test non-existent field let nonexistent_path = crate::field_operations::VariantPath::field("nonexistent"); let result = array.get_path(0, &nonexistent_path); @@ -582,16 +616,16 @@ mod test { #[test] fn test_with_field_removed() { let array = create_test_variant_array(); - + let new_array = array.with_field_removed("age").unwrap(); - + // Check that age field was removed from all variants let variant1 = new_array.value(0); let obj1 = variant1.as_object().unwrap(); assert_eq!(obj1.len(), 1); assert!(obj1.get("name").is_some()); assert!(obj1.get("age").is_none()); - + let variant2 = new_array.value(1); let obj2 = variant2.as_object().unwrap(); assert_eq!(obj2.len(), 2); @@ -603,14 +637,14 @@ mod test { #[test] fn test_metadata_and_value_fields() { let array = create_test_variant_array(); - + let metadata_field = array.metadata_field(); let value_field = array.value_field(); - + // Check that we got the expected arrays assert_eq!(metadata_field.len(), 2); assert_eq!(value_field.len(), 2); - + // Check that metadata and value bytes are non-empty assert!(!metadata_field.as_binary_view().value(0).is_empty()); assert!(!value_field.as_binary_view().value(0).is_empty()); @@ -626,4 +660,3 @@ mod test { Arc::new(BinaryArray::from(vec![b"test" as &[u8]])) } } - diff --git a/parquet-variant-compute/src/variant_array_builder.rs b/parquet-variant-compute/src/variant_array_builder.rs index aab5b978e107..18823ac71cd7 100644 --- a/parquet-variant-compute/src/variant_array_builder.rs +++ b/parquet-variant-compute/src/variant_array_builder.rs @@ -131,7 +131,7 @@ impl VariantArrayBuilder { VariantArray::try_new(Arc::new(inner)).expect("valid VariantArray by construction") } - + /// Finish building the VariantArray (alias for build for compatibility) pub fn finish(self) -> VariantArray { self.build() diff --git a/parquet-variant-compute/src/variant_parser.rs b/parquet-variant-compute/src/variant_parser.rs index b9e967154ea9..0289445bed01 100644 --- a/parquet-variant-compute/src/variant_parser.rs +++ b/parquet-variant-compute/src/variant_parser.rs @@ -104,19 +104,27 @@ impl VariantParser { /// General dispatch function to parse any variant header pub fn parse_variant_header(header_byte: u8) -> Result { let basic_type = Self::get_basic_type(header_byte); - + match basic_type { - VariantBasicType::Primitive => Ok(VariantType::Primitive(Self::parse_primitive_header(header_byte)?)), - VariantBasicType::ShortString => Ok(VariantType::ShortString(Self::parse_short_string_header(header_byte)?)), - VariantBasicType::Object => Ok(VariantType::Object(Self::parse_object_header(header_byte)?)), - VariantBasicType::Array => Ok(VariantType::Array(Self::parse_array_header(header_byte)?)), + VariantBasicType::Primitive => Ok(VariantType::Primitive( + Self::parse_primitive_header(header_byte)?, + )), + VariantBasicType::ShortString => Ok(VariantType::ShortString( + Self::parse_short_string_header(header_byte)?, + )), + VariantBasicType::Object => { + Ok(VariantType::Object(Self::parse_object_header(header_byte)?)) + } + VariantBasicType::Array => { + Ok(VariantType::Array(Self::parse_array_header(header_byte)?)) + } } } - + /// Parse primitive type header pub fn parse_primitive_header(header_byte: u8) -> Result { let primitive_type = header_byte >> 2; - + match primitive_type { 0 => Ok(PrimitiveType::Null), 1 => Ok(PrimitiveType::True), @@ -135,36 +143,38 @@ impl VariantParser { 14 => Ok(PrimitiveType::Float), 15 => Ok(PrimitiveType::Binary), 16 => Ok(PrimitiveType::String), - _ => Err(ArrowError::InvalidArgumentError( - format!("Invalid primitive type: {}", primitive_type) - )), + _ => Err(ArrowError::InvalidArgumentError(format!( + "Invalid primitive type: {}", + primitive_type + ))), } } - + /// Parse short string header pub fn parse_short_string_header(header_byte: u8) -> Result { let length = (header_byte >> 2) as usize; - + if length > 13 { - return Err(ArrowError::InvalidArgumentError( - format!("Short string length {} exceeds maximum of 13", length) - )); + return Err(ArrowError::InvalidArgumentError(format!( + "Short string length {} exceeds maximum of 13", + length + ))); } - + Ok(ShortStringHeader { length }) } - + /// Parse object header from header byte pub fn parse_object_header(header_byte: u8) -> Result { let value_header = header_byte >> 2; let field_offset_size_minus_one = value_header & 0x03; let field_id_size_minus_one = (value_header >> 2) & 0x03; let is_large = (value_header & 0x10) != 0; - + let num_elements_size = if is_large { 4 } else { 1 }; let field_id_size = (field_id_size_minus_one + 1) as usize; let field_offset_size = (field_offset_size_minus_one + 1) as usize; - + Ok(ObjectHeader { num_elements_size, field_id_size, @@ -172,42 +182,43 @@ impl VariantParser { is_large, }) } - + /// Parse array header from header byte pub fn parse_array_header(header_byte: u8) -> Result { let value_header = header_byte >> 2; let element_offset_size_minus_one = value_header & 0x03; let is_large = (value_header & 0x10) != 0; - + let num_elements_size = if is_large { 4 } else { 1 }; let element_offset_size = (element_offset_size_minus_one + 1) as usize; - + Ok(ArrayHeader { num_elements_size, element_offset_size, is_large, }) } - + /// Unpack integer from bytes pub fn unpack_int(bytes: &[u8], size: usize) -> Result { if bytes.len() < size { return Err(ArrowError::InvalidArgumentError( - "Not enough bytes to unpack integer".to_string() + "Not enough bytes to unpack integer".to_string(), )); } - + match size { 1 => Ok(bytes[0] as usize), 2 => Ok(u16::from_le_bytes([bytes[0], bytes[1]]) as usize), 3 => Ok(u32::from_le_bytes([bytes[0], bytes[1], bytes[2], 0]) as usize), 4 => Ok(u32::from_le_bytes([bytes[0], bytes[1], bytes[2], bytes[3]]) as usize), - _ => Err(ArrowError::InvalidArgumentError( - format!("Invalid integer size: {}", size) - )), + _ => Err(ArrowError::InvalidArgumentError(format!( + "Invalid integer size: {}", + size + ))), } } - + /// Calculate the size needed to store an integer pub fn calculate_int_size(value: usize) -> usize { if value <= u8::MAX as usize { @@ -220,13 +231,20 @@ impl VariantParser { 4 } } - + /// Build object header byte - pub fn build_object_header(is_large: bool, field_id_size: usize, field_offset_size: usize) -> u8 { + pub fn build_object_header( + is_large: bool, + field_id_size: usize, + field_offset_size: usize, + ) -> u8 { let large_bit = if is_large { 1 } else { 0 }; - (large_bit << 6) | (((field_id_size - 1) as u8) << 4) | (((field_offset_size - 1) as u8) << 2) | 2 + (large_bit << 6) + | (((field_id_size - 1) as u8) << 4) + | (((field_offset_size - 1) as u8) << 2) + | 2 } - + /// Write integer bytes to buffer pub fn write_int_bytes(buffer: &mut Vec, value: usize, size: usize) { match size { @@ -240,7 +258,7 @@ impl VariantParser { _ => panic!("Invalid size: {}", size), } } - + /// Get the basic type from header byte pub fn get_basic_type(header_byte: u8) -> VariantBasicType { match header_byte & 0x03 { @@ -251,7 +269,7 @@ impl VariantParser { _ => panic!("Invalid basic type: {}", header_byte & 0x03), } } - + /// Check if value bytes represent a primitive pub fn is_primitive(value_bytes: &[u8]) -> bool { if value_bytes.is_empty() { @@ -259,7 +277,7 @@ impl VariantParser { } Self::get_basic_type(value_bytes[0]) == VariantBasicType::Primitive } - + /// Check if value bytes represent a short string pub fn is_short_string(value_bytes: &[u8]) -> bool { if value_bytes.is_empty() { @@ -267,7 +285,7 @@ impl VariantParser { } Self::get_basic_type(value_bytes[0]) == VariantBasicType::ShortString } - + /// Check if value bytes represent an object pub fn is_object(value_bytes: &[u8]) -> bool { if value_bytes.is_empty() { @@ -275,7 +293,7 @@ impl VariantParser { } Self::get_basic_type(value_bytes[0]) == VariantBasicType::Object } - + /// Check if value bytes represent an array pub fn is_array(value_bytes: &[u8]) -> bool { if value_bytes.is_empty() { @@ -283,46 +301,58 @@ impl VariantParser { } Self::get_basic_type(value_bytes[0]) == VariantBasicType::Array } - + /// Get the data length for a primitive type pub fn get_primitive_data_length(primitive_type: &PrimitiveType) -> usize { match primitive_type { PrimitiveType::Null | PrimitiveType::True | PrimitiveType::False => 0, PrimitiveType::Int8 => 1, PrimitiveType::Int16 => 2, - PrimitiveType::Int32 | PrimitiveType::Float | PrimitiveType::Decimal4 | PrimitiveType::Date => 4, - PrimitiveType::Int64 | PrimitiveType::Double | PrimitiveType::Decimal8 | PrimitiveType::TimestampNtz | PrimitiveType::TimestampLtz => 8, + PrimitiveType::Int32 + | PrimitiveType::Float + | PrimitiveType::Decimal4 + | PrimitiveType::Date => 4, + PrimitiveType::Int64 + | PrimitiveType::Double + | PrimitiveType::Decimal8 + | PrimitiveType::TimestampNtz + | PrimitiveType::TimestampLtz => 8, PrimitiveType::Decimal16 => 16, PrimitiveType::Binary | PrimitiveType::String => 0, // Variable length, need to read from data } } - + /// Extract short string data from value bytes pub fn extract_short_string_data(value_bytes: &[u8]) -> Result<&[u8], ArrowError> { if value_bytes.is_empty() { - return Err(ArrowError::InvalidArgumentError("Empty value bytes".to_string())); + return Err(ArrowError::InvalidArgumentError( + "Empty value bytes".to_string(), + )); } - + let header = Self::parse_short_string_header(value_bytes[0])?; - + if value_bytes.len() < 1 + header.length { - return Err(ArrowError::InvalidArgumentError( - format!("Short string data length {} exceeds available bytes", header.length) - )); + return Err(ArrowError::InvalidArgumentError(format!( + "Short string data length {} exceeds available bytes", + header.length + ))); } - + Ok(&value_bytes[1..1 + header.length]) } - + /// Extract primitive data from value bytes pub fn extract_primitive_data(value_bytes: &[u8]) -> Result<&[u8], ArrowError> { if value_bytes.is_empty() { - return Err(ArrowError::InvalidArgumentError("Empty value bytes".to_string())); + return Err(ArrowError::InvalidArgumentError( + "Empty value bytes".to_string(), + )); } - + let primitive_type = Self::parse_primitive_header(value_bytes[0])?; let data_length = Self::get_primitive_data_length(&primitive_type); - + if data_length == 0 { // Handle variable length types and null/boolean match primitive_type { @@ -331,48 +361,56 @@ impl VariantParser { // These require reading length from the data if value_bytes.len() < 5 { return Err(ArrowError::InvalidArgumentError( - "Not enough bytes for variable length primitive".to_string() + "Not enough bytes for variable length primitive".to_string(), )); } - let length = u32::from_le_bytes([value_bytes[1], value_bytes[2], value_bytes[3], value_bytes[4]]) as usize; + let length = u32::from_le_bytes([ + value_bytes[1], + value_bytes[2], + value_bytes[3], + value_bytes[4], + ]) as usize; if value_bytes.len() < 5 + length { return Err(ArrowError::InvalidArgumentError( - "Variable length primitive data exceeds available bytes".to_string() + "Variable length primitive data exceeds available bytes".to_string(), )); } Ok(&value_bytes[5..5 + length]) } - _ => Err(ArrowError::InvalidArgumentError( - format!("Unhandled primitive type: {:?}", primitive_type) - )), + _ => Err(ArrowError::InvalidArgumentError(format!( + "Unhandled primitive type: {:?}", + primitive_type + ))), } } else { if value_bytes.len() < 1 + data_length { - return Err(ArrowError::InvalidArgumentError( - format!("Primitive data length {} exceeds available bytes", data_length) - )); + return Err(ArrowError::InvalidArgumentError(format!( + "Primitive data length {} exceeds available bytes", + data_length + ))); } Ok(&value_bytes[1..1 + data_length]) } } - + /// Calculate byte offsets for array elements pub fn calculate_array_offsets(header: &ArrayHeader, num_elements: usize) -> ArrayOffsets { let element_offsets_start = 1 + header.num_elements_size; - let elements_start = element_offsets_start + ((num_elements + 1) * header.element_offset_size); - + let elements_start = + element_offsets_start + ((num_elements + 1) * header.element_offset_size); + ArrayOffsets { element_offsets_start, elements_start, } } - + /// Calculate byte offsets for object fields pub fn calculate_object_offsets(header: &ObjectHeader, num_elements: usize) -> ObjectOffsets { let field_ids_start = 1 + header.num_elements_size; let field_offsets_start = field_ids_start + (num_elements * header.field_id_size); let values_start = field_offsets_start + ((num_elements + 1) * header.field_offset_size); - + ObjectOffsets { field_ids_start, field_offsets_start, @@ -405,7 +443,7 @@ mod tests { let mut buffer = Vec::new(); VariantParser::write_int_bytes(&mut buffer, 42, 1); assert_eq!(buffer, vec![42]); - + let mut buffer = Vec::new(); VariantParser::write_int_bytes(&mut buffer, 256, 2); assert_eq!(buffer, vec![0, 1]); @@ -414,32 +452,56 @@ mod tests { #[test] fn test_parse_primitive_header() { // Test null (primitive type 0) - assert_eq!(VariantParser::parse_primitive_header(0b00000000).unwrap(), PrimitiveType::Null); - + assert_eq!( + VariantParser::parse_primitive_header(0b00000000).unwrap(), + PrimitiveType::Null + ); + // Test true (primitive type 1) - assert_eq!(VariantParser::parse_primitive_header(0b00000100).unwrap(), PrimitiveType::True); - + assert_eq!( + VariantParser::parse_primitive_header(0b00000100).unwrap(), + PrimitiveType::True + ); + // Test false (primitive type 2) - assert_eq!(VariantParser::parse_primitive_header(0b00001000).unwrap(), PrimitiveType::False); - + assert_eq!( + VariantParser::parse_primitive_header(0b00001000).unwrap(), + PrimitiveType::False + ); + // Test int32 (primitive type 5) - assert_eq!(VariantParser::parse_primitive_header(0b00010100).unwrap(), PrimitiveType::Int32); - + assert_eq!( + VariantParser::parse_primitive_header(0b00010100).unwrap(), + PrimitiveType::Int32 + ); + // Test double (primitive type 7) - assert_eq!(VariantParser::parse_primitive_header(0b00011100).unwrap(), PrimitiveType::Double); + assert_eq!( + VariantParser::parse_primitive_header(0b00011100).unwrap(), + PrimitiveType::Double + ); } #[test] fn test_parse_short_string_header() { // Test 0-length short string - assert_eq!(VariantParser::parse_short_string_header(0b00000001).unwrap(), ShortStringHeader { length: 0 }); - + assert_eq!( + VariantParser::parse_short_string_header(0b00000001).unwrap(), + ShortStringHeader { length: 0 } + ); + // Test 5-length short string - assert_eq!(VariantParser::parse_short_string_header(0b00010101).unwrap(), ShortStringHeader { length: 5 }); - + assert_eq!( + VariantParser::parse_short_string_header(0b00010101).unwrap(), + ShortStringHeader { length: 5 } + ); + // Test 13-length short string (maximum) - assert_eq!(VariantParser::parse_short_string_header(0b00110101).unwrap(), ShortStringHeader { length: 13 }); - + assert_eq!( + VariantParser::parse_short_string_header(0b00110101).unwrap(), + ShortStringHeader { length: 13 } + ); + // Test invalid length > 13 assert!(VariantParser::parse_short_string_header(0b00111001).is_err()); } @@ -449,28 +511,28 @@ mod tests { // Test primitive dispatch let primitive_header = 0b00000100; // True primitive match VariantParser::parse_variant_header(primitive_header).unwrap() { - VariantType::Primitive(PrimitiveType::True) => {}, + VariantType::Primitive(PrimitiveType::True) => {} _ => panic!("Expected primitive True"), } - + // Test short string dispatch let short_string_header = 0b00010101; // 5-length short string match VariantParser::parse_variant_header(short_string_header).unwrap() { - VariantType::ShortString(ShortStringHeader { length: 5 }) => {}, + VariantType::ShortString(ShortStringHeader { length: 5 }) => {} _ => panic!("Expected short string with length 5"), } - + // Test object dispatch let object_header = 0b00000010; // Basic object match VariantParser::parse_variant_header(object_header).unwrap() { - VariantType::Object(_) => {}, + VariantType::Object(_) => {} _ => panic!("Expected object"), } - + // Test array dispatch let array_header = 0b00000011; // Basic array match VariantParser::parse_variant_header(array_header).unwrap() { - VariantType::Array(_) => {}, + VariantType::Array(_) => {} _ => panic!("Expected array"), } } @@ -481,16 +543,16 @@ mod tests { assert!(VariantParser::is_primitive(&[0b00000000])); // Null assert!(VariantParser::is_primitive(&[0b00000100])); // True assert!(!VariantParser::is_primitive(&[0b00000001])); // Not primitive - + // Test short string type check assert!(VariantParser::is_short_string(&[0b00000001])); // 0-length short string assert!(VariantParser::is_short_string(&[0b00010101])); // 5-length short string assert!(!VariantParser::is_short_string(&[0b00000000])); // Not short string - + // Test object type check assert!(VariantParser::is_object(&[0b00000010])); // Basic object assert!(!VariantParser::is_object(&[0b00000001])); // Not object - + // Test array type check assert!(VariantParser::is_array(&[0b00000011])); // Basic array assert!(!VariantParser::is_array(&[0b00000010])); // Not array @@ -498,29 +560,68 @@ mod tests { #[test] fn test_get_primitive_data_length() { - assert_eq!(VariantParser::get_primitive_data_length(&PrimitiveType::Null), 0); - assert_eq!(VariantParser::get_primitive_data_length(&PrimitiveType::True), 0); - assert_eq!(VariantParser::get_primitive_data_length(&PrimitiveType::False), 0); - assert_eq!(VariantParser::get_primitive_data_length(&PrimitiveType::Int8), 1); - assert_eq!(VariantParser::get_primitive_data_length(&PrimitiveType::Int16), 2); - assert_eq!(VariantParser::get_primitive_data_length(&PrimitiveType::Int32), 4); - assert_eq!(VariantParser::get_primitive_data_length(&PrimitiveType::Int64), 8); - assert_eq!(VariantParser::get_primitive_data_length(&PrimitiveType::Double), 8); - assert_eq!(VariantParser::get_primitive_data_length(&PrimitiveType::Decimal16), 16); - assert_eq!(VariantParser::get_primitive_data_length(&PrimitiveType::Binary), 0); // Variable length - assert_eq!(VariantParser::get_primitive_data_length(&PrimitiveType::String), 0); // Variable length + assert_eq!( + VariantParser::get_primitive_data_length(&PrimitiveType::Null), + 0 + ); + assert_eq!( + VariantParser::get_primitive_data_length(&PrimitiveType::True), + 0 + ); + assert_eq!( + VariantParser::get_primitive_data_length(&PrimitiveType::False), + 0 + ); + assert_eq!( + VariantParser::get_primitive_data_length(&PrimitiveType::Int8), + 1 + ); + assert_eq!( + VariantParser::get_primitive_data_length(&PrimitiveType::Int16), + 2 + ); + assert_eq!( + VariantParser::get_primitive_data_length(&PrimitiveType::Int32), + 4 + ); + assert_eq!( + VariantParser::get_primitive_data_length(&PrimitiveType::Int64), + 8 + ); + assert_eq!( + VariantParser::get_primitive_data_length(&PrimitiveType::Double), + 8 + ); + assert_eq!( + VariantParser::get_primitive_data_length(&PrimitiveType::Decimal16), + 16 + ); + assert_eq!( + VariantParser::get_primitive_data_length(&PrimitiveType::Binary), + 0 + ); // Variable length + assert_eq!( + VariantParser::get_primitive_data_length(&PrimitiveType::String), + 0 + ); // Variable length } #[test] fn test_extract_short_string_data() { // Test 0-length short string let data = &[0b00000001]; // 0-length short string header - assert_eq!(VariantParser::extract_short_string_data(data).unwrap(), &[] as &[u8]); - + assert_eq!( + VariantParser::extract_short_string_data(data).unwrap(), + &[] as &[u8] + ); + // Test 5-length short string let data = &[0b00010101, b'H', b'e', b'l', b'l', b'o']; // 5-length short string + "Hello" - assert_eq!(VariantParser::extract_short_string_data(data).unwrap(), b"Hello"); - + assert_eq!( + VariantParser::extract_short_string_data(data).unwrap(), + b"Hello" + ); + // Test insufficient data let data = &[0b00010101, b'H', b'i']; // Claims 5 bytes but only has 2 assert!(VariantParser::extract_short_string_data(data).is_err()); @@ -530,18 +631,27 @@ mod tests { fn test_extract_primitive_data() { // Test null (no data) let data = &[0b00000000]; // Null header - assert_eq!(VariantParser::extract_primitive_data(data).unwrap(), &[] as &[u8]); - + assert_eq!( + VariantParser::extract_primitive_data(data).unwrap(), + &[] as &[u8] + ); + // Test true (no data) let data = &[0b00000100]; // True header - assert_eq!(VariantParser::extract_primitive_data(data).unwrap(), &[] as &[u8]); - + assert_eq!( + VariantParser::extract_primitive_data(data).unwrap(), + &[] as &[u8] + ); + // Test int32 (4 bytes) let data = &[0b00010100, 0x2A, 0x00, 0x00, 0x00]; // Int32 header + 42 in little endian - assert_eq!(VariantParser::extract_primitive_data(data).unwrap(), &[0x2A, 0x00, 0x00, 0x00]); - + assert_eq!( + VariantParser::extract_primitive_data(data).unwrap(), + &[0x2A, 0x00, 0x00, 0x00] + ); + // Test insufficient data for int32 let data = &[0b00010100, 0x2A, 0x00]; // Int32 header but only 2 bytes assert!(VariantParser::extract_primitive_data(data).is_err()); } -} \ No newline at end of file +} From 7c03e21494aa2c74d534c254026c76bc46de14d1 Mon Sep 17 00:00:00 2001 From: carpecodeum Date: Sun, 20 Jul 2025 18:31:47 -0400 Subject: [PATCH 08/18] [FIX] remove redundancy --- parquet-variant-compute/.cargo/config.toml | 2 - parquet-variant-compute/Cargo.toml | 3 - .../benches/variant_kernels.rs | 3 +- .../examples/field_removal.rs | 144 ---------------- .../examples/path_access.rs | 123 -------------- .../src/field_operations.rs | 52 +----- parquet-variant-compute/src/lib.rs | 3 +- parquet-variant-compute/src/variant_array.rs | 154 +----------------- parquet-variant-compute/src/variant_get.rs | 15 +- 9 files changed, 21 insertions(+), 478 deletions(-) delete mode 100644 parquet-variant-compute/.cargo/config.toml delete mode 100644 parquet-variant-compute/examples/field_removal.rs delete mode 100644 parquet-variant-compute/examples/path_access.rs diff --git a/parquet-variant-compute/.cargo/config.toml b/parquet-variant-compute/.cargo/config.toml deleted file mode 100644 index 190118d44ac6..000000000000 --- a/parquet-variant-compute/.cargo/config.toml +++ /dev/null @@ -1,2 +0,0 @@ -[build] -rustflags = ["-A", "unknown-lints", "-A", "clippy::transmute-int-to-float"] \ No newline at end of file diff --git a/parquet-variant-compute/Cargo.toml b/parquet-variant-compute/Cargo.toml index dd00c40df85d..68b9823c8dc8 100644 --- a/parquet-variant-compute/Cargo.toml +++ b/parquet-variant-compute/Cargo.toml @@ -29,9 +29,6 @@ keywords = ["arrow", "parquet", "variant"] edition = { workspace = true } rust-version = { workspace = true } -[lints.rust] -unknown_lints = "allow" - [dependencies] arrow = { workspace = true } arrow-schema = { workspace = true } diff --git a/parquet-variant-compute/benches/variant_kernels.rs b/parquet-variant-compute/benches/variant_kernels.rs index 8fd6af333fed..d4007076bbae 100644 --- a/parquet-variant-compute/benches/variant_kernels.rs +++ b/parquet-variant-compute/benches/variant_kernels.rs @@ -19,8 +19,7 @@ use arrow::array::{Array, ArrayRef, StringArray}; use arrow::util::test_util::seedable_rng; use criterion::{criterion_group, criterion_main, Criterion}; use parquet_variant::{Variant, VariantBuilder}; -use parquet_variant_compute::variant_get::{variant_get, GetOptions}; -use parquet_variant_compute::{batch_json_string_to_variant, VariantArray, VariantArrayBuilder}; +use parquet_variant_compute::{batch_json_string_to_variant, variant_get, GetOptions, VariantArray, VariantArrayBuilder}; use rand::distr::Alphanumeric; use rand::rngs::StdRng; use rand::Rng; diff --git a/parquet-variant-compute/examples/field_removal.rs b/parquet-variant-compute/examples/field_removal.rs deleted file mode 100644 index b73ebc00d7db..000000000000 --- a/parquet-variant-compute/examples/field_removal.rs +++ /dev/null @@ -1,144 +0,0 @@ -use arrow::array::Array; -use parquet_variant::VariantBuilder; -use parquet_variant_compute::VariantArrayBuilder; - -fn main() { - // Create some sample data with fields to remove - let mut builder = VariantArrayBuilder::new(2); - - // Row 1: User with temporary data - { - let mut variant_builder = VariantBuilder::new(); - { - let mut obj = variant_builder.new_object(); - obj.insert("name", "Alice"); - obj.insert("age", 30i32); - obj.insert("temp_session", "abc123"); - obj.insert("debug_info", "temporary debug data"); - - { - let mut address = obj.new_object("address"); - address.insert("city", "New York"); - address.insert("zip", "10001"); - address.insert("temp_geocode", "40.7128,-74.0060"); - let _ = address.finish(); - } - - let _ = obj.finish(); - } - let (metadata, value) = variant_builder.finish(); - builder.append_variant_buffers(&metadata, &value); - } - - // Row 2: Another user with temporary data - { - let mut variant_builder = VariantBuilder::new(); - { - let mut obj = variant_builder.new_object(); - obj.insert("name", "Bob"); - obj.insert("age", 25i32); - obj.insert("temp_session", "def456"); - obj.insert("debug_info", "more temporary data"); - - { - let mut address = obj.new_object("address"); - address.insert("city", "San Francisco"); - address.insert("zip", "94102"); - address.insert("temp_geocode", "37.7749,-122.4194"); - let _ = address.finish(); - } - - let _ = obj.finish(); - } - let (metadata, value) = variant_builder.finish(); - builder.append_variant_buffers(&metadata, &value); - } - - let array = builder.finish(); - - println!("=== Field Removal Examples ==="); - - // Show original data - println!("Original data:"); - for i in 0..array.len() { - let variant = array.value(i); - if let Some(obj) = variant.as_object() { - let name = obj.get("name").unwrap().as_string().unwrap().to_string(); - let session = obj - .get("temp_session") - .map(|v| v.as_string().unwrap().to_string()) - .unwrap_or("None".to_string()); - let debug = obj - .get("debug_info") - .map(|v| v.as_string().unwrap().to_string()) - .unwrap_or("None".to_string()); - println!(" {}: session={}, debug={}", name, session, debug); - } - } - - // Remove temporary session field - let cleaned_array = array.with_field_removed("temp_session").unwrap(); - - println!("\nRemoving temporary session fields..."); - println!("After removing temp_session:"); - for i in 0..cleaned_array.len() { - let variant = cleaned_array.value(i); - if let Some(obj) = variant.as_object() { - let name = obj.get("name").unwrap().as_string().unwrap().to_string(); - let session = obj - .get("temp_session") - .map(|v| v.as_string().unwrap().to_string()) - .unwrap_or("None".to_string()); - let debug = obj - .get("debug_info") - .map(|v| v.as_string().unwrap().to_string()) - .unwrap_or("None".to_string()); - println!(" {}: session={}, debug={}", name, session, debug); - } - } - - // Remove multiple temporary fields - let final_array = cleaned_array - .with_fields_removed(&["debug_info", "temp_session"]) - .unwrap(); - - println!("\nRemoving multiple temporary fields..."); - println!("Final clean data:"); - for i in 0..final_array.len() { - let variant = final_array.value(i); - if let Some(obj) = variant.as_object() { - let name = obj.get("name").unwrap().as_string().unwrap().to_string(); - let age = obj.get("age").unwrap().as_int32().unwrap(); - - if let Some(address) = obj.get("address") { - if let Some(addr_obj) = address.as_object() { - let city = addr_obj - .get("city") - .unwrap() - .as_string() - .unwrap() - .to_string(); - let zip = addr_obj - .get("zip") - .unwrap() - .as_string() - .unwrap() - .to_string(); - let geocode = addr_obj - .get("temp_geocode") - .map(|v| { - format!( - "Some(ShortString(ShortString(\"{}\")))", - v.as_string().unwrap() - ) - }) - .unwrap_or("None".to_string()); - println!( - " {}: age={}, city={}, zip={}, geocode={}", - name, age, city, zip, geocode - ); - } - } - } - } -} diff --git a/parquet-variant-compute/examples/path_access.rs b/parquet-variant-compute/examples/path_access.rs deleted file mode 100644 index 5f8755a442a1..000000000000 --- a/parquet-variant-compute/examples/path_access.rs +++ /dev/null @@ -1,123 +0,0 @@ -use parquet_variant::VariantBuilder; -use parquet_variant_compute::{VariantArrayBuilder, VariantPath}; - -fn main() { - // Create some sample data - let mut builder = VariantArrayBuilder::new(2); - - // Row 1: User Alice - { - let mut variant_builder = VariantBuilder::new(); - { - let mut obj = variant_builder.new_object(); - obj.insert("name", "Alice"); - obj.insert("age", 30i32); - - { - let mut address = obj.new_object("address"); - address.insert("city", "New York"); - address.insert("zip", "10001"); - let _ = address.finish(); - } - - { - let mut hobbies = obj.new_list("hobbies"); - hobbies.append_value("reading"); - hobbies.append_value("hiking"); - hobbies.append_value("cooking"); - hobbies.finish(); - } - - obj.finish().unwrap(); - } - let (metadata, value) = variant_builder.finish(); - builder.append_variant_buffers(&metadata, &value); - } - - // Row 2: User Bob - { - let mut variant_builder = VariantBuilder::new(); - { - let mut obj = variant_builder.new_object(); - obj.insert("name", "Bob"); - obj.insert("age", 25i32); - - { - let mut address = obj.new_object("address"); - address.insert("city", "San Francisco"); - address.insert("zip", "94102"); - let _ = address.finish(); - } - - { - let mut hobbies = obj.new_list("hobbies"); - hobbies.append_value("swimming"); - hobbies.append_value("gaming"); - hobbies.finish(); - } - - obj.finish().unwrap(); - } - let (metadata, value) = variant_builder.finish(); - builder.append_variant_buffers(&metadata, &value); - } - - let variant_array = builder.finish(); - - // Demonstrate path access functionality - println!("=== Path Access Examples ==="); - - // 1. Single field access - let name_path = VariantPath::field("name"); - let alice_name = variant_array.get_path(0, &name_path).unwrap(); - println!("Alice's name: {}", alice_name.as_string().unwrap()); - - // 2. Nested field access - let city_path = VariantPath::field("address").push_field("city"); - let alice_city = variant_array.get_path(0, &city_path).unwrap(); - let bob_city = variant_array.get_path(1, &city_path).unwrap(); - println!("Alice's city: {}", alice_city.as_string().unwrap()); - println!("Bob's city: {}", bob_city.as_string().unwrap()); - - // 3. Array index access - let hobby_path = VariantPath::field("hobbies").push_index(0); - let alice_first_hobby = variant_array.get_path(0, &hobby_path).unwrap(); - let bob_first_hobby = variant_array.get_path(1, &hobby_path).unwrap(); - println!( - "Alice's first hobby: {}", - alice_first_hobby.as_string().unwrap() - ); - println!( - "Bob's first hobby: {}", - bob_first_hobby.as_string().unwrap() - ); - - // 4. Multiple field extraction - let paths = vec![ - VariantPath::field("name"), - VariantPath::field("age"), - VariantPath::field("address").push_field("city"), - ]; - let alice_data = variant_array.get_paths(0, &paths); - print!("Alice's data: "); - for (i, path_result) in alice_data.iter().enumerate() { - if let Some(variant) = path_result { - if i == 0 { - print!("name={}", variant.as_string().unwrap()); - } else if i == 1 { - print!(", age={}", variant.as_int32().unwrap()); - } else if i == 2 { - print!(", city={}", variant.as_string().unwrap()); - } - } - } - println!(); - - // 5. Batch field extraction - let all_names = variant_array.extract_field_by_path(&VariantPath::field("name")); - let name_strings: Vec = all_names - .iter() - .filter_map(|opt| opt.as_ref().map(|v| v.as_string().unwrap().to_string())) - .collect(); - println!("All names: {:?}", name_strings); -} diff --git a/parquet-variant-compute/src/field_operations.rs b/parquet-variant-compute/src/field_operations.rs index 674e5e1cd83d..1b82d7c01b80 100644 --- a/parquet-variant-compute/src/field_operations.rs +++ b/parquet-variant-compute/src/field_operations.rs @@ -19,49 +19,9 @@ use crate::variant_parser::{ObjectHeader, ObjectOffsets, VariantParser}; use arrow::error::ArrowError; -use parquet_variant::VariantMetadata; +use parquet_variant::{VariantMetadata, VariantPath, VariantPathElement}; use std::collections::HashSet; -/// Represents a path element in a variant path -#[derive(Debug, Clone)] -pub enum VariantPathElement { - Field(String), - Index(usize), -} - -/// Represents a path through a variant object/array structure -#[derive(Debug, Clone)] -pub struct VariantPath { - elements: Vec, -} - -impl VariantPath { - /// Create a new path starting with a field - pub fn field(name: &str) -> Self { - Self { - elements: vec![VariantPathElement::Field(name.to_string())], - } - } - - /// Add a field to the path - pub fn push_field(mut self, name: &str) -> Self { - self.elements - .push(VariantPathElement::Field(name.to_string())); - self - } - - /// Add an index to the path - pub fn push_index(mut self, index: usize) -> Self { - self.elements.push(VariantPathElement::Index(index)); - self - } - - /// Get the elements of the path - pub fn elements(&self) -> &[VariantPathElement] { - &self.elements - } -} - /// Field operations for variant objects pub struct FieldOperations; @@ -351,20 +311,20 @@ impl FieldOperations { ) -> Result>, ArrowError> { let mut current_value = value_bytes.to_vec(); - for element in path.elements() { + for element in path.iter() { match element { - VariantPathElement::Field(field_name) => { + VariantPathElement::Field { name } => { if let Some(field_bytes) = - Self::get_field_bytes(metadata_bytes, ¤t_value, field_name)? + Self::get_field_bytes(metadata_bytes, ¤t_value, name)? { current_value = field_bytes; } else { return Ok(None); } } - VariantPathElement::Index(idx) => { + VariantPathElement::Index { index } => { if let Some(element_bytes) = - Self::get_array_element_bytes(metadata_bytes, ¤t_value, *idx)? + Self::get_array_element_bytes(metadata_bytes, ¤t_value, *index)? { current_value = element_bytes; } else { diff --git a/parquet-variant-compute/src/lib.rs b/parquet-variant-compute/src/lib.rs index e3d77bf7ea0b..e3d20ec50f12 100644 --- a/parquet-variant-compute/src/lib.rs +++ b/parquet-variant-compute/src/lib.rs @@ -22,13 +22,14 @@ pub mod from_json; pub mod to_json; pub mod variant_array; pub mod variant_array_builder; +pub mod variant_get; pub mod variant_parser; -pub use field_operations::{VariantPath, VariantPathElement}; pub use from_json::batch_json_string_to_variant; pub use to_json::batch_variant_to_json_string; pub use variant_array::VariantArray; pub use variant_array_builder::VariantArrayBuilder; +pub use variant_get::{variant_get, GetOptions}; pub use variant_parser::{ ArrayHeader, ObjectHeader, PrimitiveType, ShortStringHeader, VariantBasicType, VariantType, }; diff --git a/parquet-variant-compute/src/variant_array.rs b/parquet-variant-compute/src/variant_array.rs index b90298f5f6a1..bc3e70f557df 100644 --- a/parquet-variant-compute/src/variant_array.rs +++ b/parquet-variant-compute/src/variant_array.rs @@ -155,66 +155,6 @@ impl VariantArray { fn find_value_field(array: &StructArray) -> Option { array.column_by_name("value").cloned() } - /// Extract a field from the variant at the specified row using a path. - /// - /// This method provides direct access to nested fields without reconstructing - /// the entire variant, which is critical for performance with shredded variants. - /// - /// # Arguments - /// * `index` - The row index in the array - /// * `path` - The path to the field to extract - /// - /// # Returns - /// * `Some(Variant)` if the field exists at the specified path - /// * `None` if the field doesn't exist or the path is invalid - /// - /// # Example - /// ``` - /// # use parquet_variant_compute::{VariantArrayBuilder, VariantArray, VariantPath}; - /// # use parquet_variant::VariantBuilder; - /// # let mut builder = VariantArrayBuilder::new(1); - /// # let mut variant_builder = VariantBuilder::new(); - /// # let mut obj = variant_builder.new_object(); - /// # obj.insert("name", "Alice"); - /// # obj.finish().unwrap(); - /// # let (metadata, value) = variant_builder.finish(); - /// # builder.append_variant_buffers(&metadata, &value); - /// # let variant_array = builder.build(); - /// let path = VariantPath::field("name"); - /// let name_variant = variant_array.get_path(0, &path); - /// ``` - pub fn get_path(&self, index: usize, path: &VariantPath) -> Option { - if path.is_empty() { - return Some(self.value(index)); - } - - // Start with the root variant - let mut current = self.value(index); - - Ok(Self { inner }) - } - - /// Returns a reference to the underlying [`StructArray`]. - pub fn inner(&self) -> &StructArray { - &self.inner - } - - /// Returns the inner [`StructArray`], consuming self - pub fn into_inner(self) -> StructArray { - self.inner - } - - /// Return the [`Variant`] instance stored at the given row - /// - /// Panics if the index is out of bounds. - /// - /// Note: Does not do deep validation of the [`Variant`], so it is up to the - /// caller to ensure that the metadata and value were constructed correctly. - pub fn value(&self, index: usize) -> Variant { - let metadata = self.metadata_field().as_binary_view().value(index); - let value = self.value_field().as_binary_view().value(index); - Variant::new(metadata, value) - } /// Return a reference to the metadata field of the [`StructArray`] pub fn metadata_field(&self) -> &ArrayRef { @@ -238,51 +178,6 @@ impl VariantArray { self.value_field().as_binary_view().value(index).as_ref() } - /// Get value at a specific path for the variant at the given index - /// - /// Uses high-level Variant API for convenience. Returns a Variant object that can be - /// directly used with standard variant operations. - pub fn get_path( - &self, - index: usize, - path: &crate::field_operations::VariantPath, - ) -> Option { - if index >= self.len() || self.is_null(index) { - return None; - } - - let mut current_variant = self.value(index); - - for element in path.elements() { - match element { - crate::field_operations::VariantPathElement::Field(field_name) => { - current_variant = current_variant.get_object_field(field_name)?; - } - crate::field_operations::VariantPathElement::Index(idx) => { - current_variant = current_variant.get_list_element(*idx)?; - } - } - } - - Some(current_variant) - } - - /// Get values at multiple paths for the variant at the given index - /// - /// Convenience method that applies `get_path()` to multiple paths at once. - /// Useful for extracting multiple fields from a single variant row. - pub fn get_paths( - &self, - index: usize, - paths: &[crate::field_operations::VariantPath], - ) -> Vec> { - let mut results = Vec::new(); - for path in paths { - results.push(self.get_path(index, path)); - } - results - } - /// Get the field names for an object at the given index pub fn get_field_names(&self, index: usize) -> Vec { if index >= self.len() { @@ -295,45 +190,18 @@ impl VariantArray { let variant = self.value(index); if let Some(obj) = variant.as_object() { - let mut paths = Vec::new(); + let mut field_names = Vec::new(); for i in 0..obj.len() { if let Some(field_name) = obj.field_name(i) { - paths.push(field_name.to_string()); + field_names.push(field_name.to_string()); } } - paths + field_names } else { vec![] } } - /// Extract field values by path from all variants in the array - /// - /// Applies `get_path()` to a single path across all rows in the array. - /// Useful for extracting a column of values from nested variant data. - pub fn extract_field_by_path( - &self, - path: &crate::field_operations::VariantPath, - ) -> Vec> { - let mut results = Vec::new(); - for i in 0..self.len() { - results.push(self.get_path(i, path)); - } - results - } - - /// Return a reference to the metadata field of the [`StructArray`] - pub fn metadata_field(&self) -> &ArrayRef { - // spec says fields order is not guaranteed, so we search by name - &self.metadata_ref - } - - /// Return a reference to the value field of the `StructArray` - pub fn value_field(&self) -> &ArrayRef { - // spec says fields order is not guaranteed, so we search by name - &self.value_ref - } - /// Create a new VariantArray with a field removed from all variants pub fn with_field_removed(&self, field_name: &str) -> Result { let mut builder = crate::variant_array_builder::VariantArrayBuilder::new(self.len()); @@ -598,20 +466,8 @@ mod test { assert!(paths2.contains(&"city".to_string())); } - #[test] - fn test_get_path() { - let array = create_test_variant_array(); - - // Test field access - let name_path = crate::field_operations::VariantPath::field("name"); - let alice_name = array.get_path(0, &name_path).unwrap(); - assert_eq!(alice_name.as_string(), Some("Alice")); - - // Test non-existent field - let nonexistent_path = crate::field_operations::VariantPath::field("nonexistent"); - let result = array.get_path(0, &nonexistent_path); - assert!(result.is_none()); - } + // Note: test_get_path was removed as it tested the duplicate VariantPath implementation + // Use the official parquet_variant::VariantPath with variant_get functionality instead #[test] fn test_with_field_removed() { diff --git a/parquet-variant-compute/src/variant_get.rs b/parquet-variant-compute/src/variant_get.rs index b3a3d9e41f13..eee2cb5f19b1 100644 --- a/parquet-variant-compute/src/variant_get.rs +++ b/parquet-variant-compute/src/variant_get.rs @@ -90,14 +90,13 @@ mod test { use std::sync::Arc; use arrow::array::{Array, ArrayRef, StringArray}; - use parquet_variant::VariantPath; use crate::batch_json_string_to_variant; use crate::VariantArray; use super::{variant_get, GetOptions}; - fn single_variant_get_test(input_json: &str, path: VariantPath, expected_json: &str) { + fn single_variant_get_test(input_json: &str, path: parquet_variant::VariantPath, expected_json: &str) { // Create input array from JSON string let input_array_ref: ArrayRef = Arc::new(StringArray::from(vec![Some(input_json)])); let input_variant_array_ref: ArrayRef = @@ -132,21 +131,21 @@ mod test { fn get_primitive_variant_field() { single_variant_get_test( r#"{"some_field": 1234}"#, - VariantPath::from("some_field"), + parquet_variant::VariantPath::from("some_field"), "1234", ); } #[test] fn get_primitive_variant_list_index() { - single_variant_get_test("[1234, 5678]", VariantPath::from(0), "1234"); + single_variant_get_test("[1234, 5678]", parquet_variant::VariantPath::from(0), "1234"); } #[test] fn get_primitive_variant_inside_object_of_object() { single_variant_get_test( r#"{"top_level_field": {"inner_field": 1234}}"#, - VariantPath::from("top_level_field").join("inner_field"), + parquet_variant::VariantPath::from("top_level_field").join("inner_field"), "1234", ); } @@ -155,7 +154,7 @@ mod test { fn get_primitive_variant_inside_list_of_object() { single_variant_get_test( r#"[{"some_field": 1234}]"#, - VariantPath::from(0).join("some_field"), + parquet_variant::VariantPath::from(0).join("some_field"), "1234", ); } @@ -164,7 +163,7 @@ mod test { fn get_primitive_variant_inside_object_of_list() { single_variant_get_test( r#"{"some_field": [1234]}"#, - VariantPath::from("some_field").join(0), + parquet_variant::VariantPath::from("some_field").join(0), "1234", ); } @@ -173,7 +172,7 @@ mod test { fn get_complex_variant() { single_variant_get_test( r#"{"top_level_field": {"inner_field": 1234}}"#, - VariantPath::from("top_level_field"), + parquet_variant::VariantPath::from("top_level_field"), r#"{"inner_field": 1234}"#, ); } From eb8bb692ed8356a6d0d848bda11d0a8e5a8e1951 Mon Sep 17 00:00:00 2001 From: carpecodeum Date: Sun, 20 Jul 2025 18:43:13 -0400 Subject: [PATCH 09/18] [FIX] improve the tests --- .../src/field_operations.rs | 14 +++++----- parquet-variant-compute/src/variant_array.rs | 26 +++++++++---------- .../src/variant_array_builder.rs | 7 ++--- 3 files changed, 24 insertions(+), 23 deletions(-) diff --git a/parquet-variant-compute/src/field_operations.rs b/parquet-variant-compute/src/field_operations.rs index 1b82d7c01b80..a0b6d6194269 100644 --- a/parquet-variant-compute/src/field_operations.rs +++ b/parquet-variant-compute/src/field_operations.rs @@ -449,13 +449,13 @@ mod tests { fn create_test_object() -> (Vec, Vec) { let mut builder = VariantBuilder::new(); - { - let mut obj = builder.new_object(); - obj.insert("name", "Alice"); - obj.insert("age", 30i32); - obj.insert("city", "NYC"); - obj.finish().unwrap(); - } + builder + .new_object() + .with_field("name", "Alice") + .with_field("age", 30i32) + .with_field("city", "NYC") + .finish() + .unwrap(); builder.finish() } diff --git a/parquet-variant-compute/src/variant_array.rs b/parquet-variant-compute/src/variant_array.rs index bc3e70f557df..cec193c624ed 100644 --- a/parquet-variant-compute/src/variant_array.rs +++ b/parquet-variant-compute/src/variant_array.rs @@ -394,24 +394,24 @@ mod test { // Create variant 1: {"name": "Alice", "age": 30} let mut builder1 = VariantBuilder::new(); - { - let mut obj = builder1.new_object(); - obj.insert("name", "Alice"); - obj.insert("age", 30i32); - obj.finish().unwrap(); - } + builder1 + .new_object() + .with_field("name", "Alice") + .with_field("age", 30i32) + .finish() + .unwrap(); let (metadata1, value1) = builder1.finish(); builder.append_variant_buffers(&metadata1, &value1); // Create variant 2: {"name": "Bob", "age": 25, "city": "NYC"} let mut builder2 = VariantBuilder::new(); - { - let mut obj = builder2.new_object(); - obj.insert("name", "Bob"); - obj.insert("age", 25i32); - obj.insert("city", "NYC"); - obj.finish().unwrap(); - } + builder2 + .new_object() + .with_field("name", "Bob") + .with_field("age", 25i32) + .with_field("city", "NYC") + .finish() + .unwrap(); let (metadata2, value2) = builder2.finish(); builder.append_variant_buffers(&metadata2, &value2); diff --git a/parquet-variant-compute/src/variant_array_builder.rs b/parquet-variant-compute/src/variant_array_builder.rs index 18823ac71cd7..129fab583416 100644 --- a/parquet-variant-compute/src/variant_array_builder.rs +++ b/parquet-variant-compute/src/variant_array_builder.rs @@ -48,9 +48,10 @@ use std::sync::Arc; /// // append a pre-constructed metadata and value buffers /// let (metadata, value) = { /// let mut vb = VariantBuilder::new(); -/// let mut obj = vb.new_object(); -/// obj.insert("foo", "bar"); -/// obj.finish().unwrap(); +/// vb.new_object() +/// .with_field("foo", "bar") +/// .finish() +/// .unwrap(); /// vb.finish() /// }; /// builder.append_variant_buffers(&metadata, &value); From 397c7177fe82ab298daf99f41ec1bc89790ddc1d Mon Sep 17 00:00:00 2001 From: carpecodeum Date: Sun, 20 Jul 2025 23:40:55 -0400 Subject: [PATCH 10/18] [FIX] refactor code for modularity --- parquet-variant-compute/src/variant_array.rs | 38 +++++++---------- parquet-variant-compute/src/variant_parser.rs | 42 ++++++++++--------- 2 files changed, 38 insertions(+), 42 deletions(-) diff --git a/parquet-variant-compute/src/variant_array.rs b/parquet-variant-compute/src/variant_array.rs index cec193c624ed..c05deb282d16 100644 --- a/parquet-variant-compute/src/variant_array.rs +++ b/parquet-variant-compute/src/variant_array.rs @@ -169,7 +169,7 @@ impl VariantArray { } /// Get the metadata bytes for a specific index - pub fn metadata(&self, index: usize) -> &[u8] { + pub fn metadata_bytes(&self, index: usize) -> &[u8] { self.metadata_field().as_binary_view().value(index).as_ref() } @@ -210,19 +210,15 @@ impl VariantArray { if self.is_null(i) { builder.append_null(); } else { - match FieldOperations::remove_field_bytes( - self.metadata(i), + let new_value = FieldOperations::remove_field_bytes( + self.metadata_bytes(i), self.value_bytes(i), field_name, - )? { - Some(new_value) => { - builder.append_variant_buffers(self.metadata(i), &new_value); - } - None => { - // Field didn't exist, use original value - builder.append_variant_buffers(self.metadata(i), self.value_bytes(i)); - } - } + )?; + + // Use original value if the field didn't exist + let new_value = new_value.as_deref().unwrap_or_else(|| self.value_bytes(i)); + builder.append_variant_buffers(self.metadata_bytes(i), new_value); } } @@ -237,19 +233,15 @@ impl VariantArray { if self.is_null(i) { builder.append_null(); } else { - match FieldOperations::remove_fields_bytes( - self.metadata(i), + let new_value = FieldOperations::remove_fields_bytes( + self.metadata_bytes(i), self.value_bytes(i), field_names, - )? { - Some(new_value) => { - builder.append_variant_buffers(self.metadata(i), &new_value); - } - None => { - // No fields existed, use original value - builder.append_variant_buffers(self.metadata(i), self.value_bytes(i)); - } - } + )?; + + // Use original value if no fields existed + let new_value = new_value.as_deref().unwrap_or_else(|| self.value_bytes(i)); + builder.append_variant_buffers(self.metadata_bytes(i), new_value); } } diff --git a/parquet-variant-compute/src/variant_parser.rs b/parquet-variant-compute/src/variant_parser.rs index 0289445bed01..f657edbaaac7 100644 --- a/parquet-variant-compute/src/variant_parser.rs +++ b/parquet-variant-compute/src/variant_parser.rs @@ -28,15 +28,6 @@ pub enum VariantBasicType { Array = 3, } -/// Variant type enumeration covering all possible types -#[derive(Debug, Clone, PartialEq)] -pub enum VariantType { - Primitive(PrimitiveType), - ShortString(ShortStringHeader), - Object(ObjectHeader), - Array(ArrayHeader), -} - /// Primitive type variants #[derive(Debug, Clone, PartialEq)] pub enum PrimitiveType { @@ -59,6 +50,15 @@ pub enum PrimitiveType { String, } +/// Variant type enumeration covering all possible types +#[derive(Debug, Clone, PartialEq)] +pub enum VariantType { + Primitive(PrimitiveType), + ShortString(ShortStringHeader), + Object(ObjectHeader), + Array(ArrayHeader), +} + /// Short string header structure #[derive(Debug, Clone, PartialEq)] pub struct ShortStringHeader { @@ -150,6 +150,19 @@ impl VariantParser { } } + /// Get the basic type from header byte + pub fn get_basic_type(header_byte: u8) -> VariantBasicType { + match header_byte & 0x03 { + 0 => VariantBasicType::Primitive, + 1 => VariantBasicType::ShortString, + 2 => VariantBasicType::Object, + 3 => VariantBasicType::Array, + _ => panic!("Invalid basic type: {}", header_byte & 0x03), + } + } + + + /// Parse short string header pub fn parse_short_string_header(header_byte: u8) -> Result { let length = (header_byte >> 2) as usize; @@ -259,16 +272,7 @@ impl VariantParser { } } - /// Get the basic type from header byte - pub fn get_basic_type(header_byte: u8) -> VariantBasicType { - match header_byte & 0x03 { - 0 => VariantBasicType::Primitive, - 1 => VariantBasicType::ShortString, - 2 => VariantBasicType::Object, - 3 => VariantBasicType::Array, - _ => panic!("Invalid basic type: {}", header_byte & 0x03), - } - } + /// Check if value bytes represent a primitive pub fn is_primitive(value_bytes: &[u8]) -> bool { From dda30ea6217fdb382398161ff470ebcacab77474 Mon Sep 17 00:00:00 2001 From: carpecodeum Date: Sun, 20 Jul 2025 23:49:58 -0400 Subject: [PATCH 11/18] [FIX] fix issues with the spec --- parquet-variant-compute/src/variant_parser.rs | 132 ++++++++++-------- 1 file changed, 70 insertions(+), 62 deletions(-) diff --git a/parquet-variant-compute/src/variant_parser.rs b/parquet-variant-compute/src/variant_parser.rs index f657edbaaac7..5cf70b70f7f9 100644 --- a/parquet-variant-compute/src/variant_parser.rs +++ b/parquet-variant-compute/src/variant_parser.rs @@ -167,9 +167,10 @@ impl VariantParser { pub fn parse_short_string_header(header_byte: u8) -> Result { let length = (header_byte >> 2) as usize; - if length > 13 { + // Short strings can be up to 64 bytes (6-bit value: 0-63) + if length > 63 { return Err(ArrowError::InvalidArgumentError(format!( - "Short string length {} exceeds maximum of 13", + "Short string length {} exceeds maximum of 63", length ))); } @@ -307,22 +308,23 @@ impl VariantParser { } /// Get the data length for a primitive type - pub fn get_primitive_data_length(primitive_type: &PrimitiveType) -> usize { + /// Returns Some(len) for fixed-length types, None for variable-length types + pub fn get_primitive_data_length(primitive_type: &PrimitiveType) -> Option { match primitive_type { - PrimitiveType::Null | PrimitiveType::True | PrimitiveType::False => 0, - PrimitiveType::Int8 => 1, - PrimitiveType::Int16 => 2, + PrimitiveType::Null | PrimitiveType::True | PrimitiveType::False => Some(0), + PrimitiveType::Int8 => Some(1), + PrimitiveType::Int16 => Some(2), PrimitiveType::Int32 | PrimitiveType::Float | PrimitiveType::Decimal4 - | PrimitiveType::Date => 4, + | PrimitiveType::Date => Some(4), PrimitiveType::Int64 | PrimitiveType::Double | PrimitiveType::Decimal8 | PrimitiveType::TimestampNtz - | PrimitiveType::TimestampLtz => 8, - PrimitiveType::Decimal16 => 16, - PrimitiveType::Binary | PrimitiveType::String => 0, // Variable length, need to read from data + | PrimitiveType::TimestampLtz => Some(8), + PrimitiveType::Decimal16 => Some(16), + PrimitiveType::Binary | PrimitiveType::String => None, // Variable length, need to read from data } } @@ -357,43 +359,41 @@ impl VariantParser { let primitive_type = Self::parse_primitive_header(value_bytes[0])?; let data_length = Self::get_primitive_data_length(&primitive_type); - if data_length == 0 { - // Handle variable length types and null/boolean - match primitive_type { - PrimitiveType::Null | PrimitiveType::True | PrimitiveType::False => Ok(&[]), - PrimitiveType::Binary | PrimitiveType::String => { - // These require reading length from the data - if value_bytes.len() < 5 { - return Err(ArrowError::InvalidArgumentError( - "Not enough bytes for variable length primitive".to_string(), - )); - } - let length = u32::from_le_bytes([ - value_bytes[1], - value_bytes[2], - value_bytes[3], - value_bytes[4], - ]) as usize; - if value_bytes.len() < 5 + length { - return Err(ArrowError::InvalidArgumentError( - "Variable length primitive data exceeds available bytes".to_string(), - )); - } - Ok(&value_bytes[5..5 + length]) + match data_length { + Some(0) => { + // Fixed-length 0-byte types (null/true/false) + Ok(&[]) + } + Some(len) => { + // Fixed-length types with len bytes + if value_bytes.len() < 1 + len { + return Err(ArrowError::InvalidArgumentError(format!( + "Fixed length primitive data length {} exceeds available bytes", + len + ))); } - _ => Err(ArrowError::InvalidArgumentError(format!( - "Unhandled primitive type: {:?}", - primitive_type - ))), + Ok(&value_bytes[1..1 + len]) } - } else { - if value_bytes.len() < 1 + data_length { - return Err(ArrowError::InvalidArgumentError(format!( - "Primitive data length {} exceeds available bytes", - data_length - ))); + None => { + // Variable-length types (binary/string) - read length from data + if value_bytes.len() < 5 { + return Err(ArrowError::InvalidArgumentError( + "Not enough bytes for variable length primitive".to_string(), + )); + } + let length = u32::from_le_bytes([ + value_bytes[1], + value_bytes[2], + value_bytes[3], + value_bytes[4], + ]) as usize; + if value_bytes.len() < 5 + length { + return Err(ArrowError::InvalidArgumentError( + "Variable length primitive data exceeds available bytes".to_string(), + )); + } + Ok(&value_bytes[5..5 + length]) } - Ok(&value_bytes[1..1 + data_length]) } } @@ -500,14 +500,17 @@ mod tests { ShortStringHeader { length: 5 } ); - // Test 13-length short string (maximum) + // Test 63-length short string (maximum for 6-bit value) assert_eq!( - VariantParser::parse_short_string_header(0b00110101).unwrap(), - ShortStringHeader { length: 13 } + VariantParser::parse_short_string_header(0b11111101).unwrap(), + ShortStringHeader { length: 63 } ); - // Test invalid length > 13 - assert!(VariantParser::parse_short_string_header(0b00111001).is_err()); + // Test that all values 0-63 are valid + for length in 0..=63 { + let header_byte = (length << 2) | 1; // short string type + assert!(VariantParser::parse_short_string_header(header_byte as u8).is_ok()); + } } #[test] @@ -564,50 +567,55 @@ mod tests { #[test] fn test_get_primitive_data_length() { + // Test fixed-length 0-byte types assert_eq!( VariantParser::get_primitive_data_length(&PrimitiveType::Null), - 0 + Some(0) ); assert_eq!( VariantParser::get_primitive_data_length(&PrimitiveType::True), - 0 + Some(0) ); assert_eq!( VariantParser::get_primitive_data_length(&PrimitiveType::False), - 0 + Some(0) ); + + // Test fixed-length types with specific byte counts assert_eq!( VariantParser::get_primitive_data_length(&PrimitiveType::Int8), - 1 + Some(1) ); assert_eq!( VariantParser::get_primitive_data_length(&PrimitiveType::Int16), - 2 + Some(2) ); assert_eq!( VariantParser::get_primitive_data_length(&PrimitiveType::Int32), - 4 + Some(4) ); assert_eq!( VariantParser::get_primitive_data_length(&PrimitiveType::Int64), - 8 + Some(8) ); assert_eq!( VariantParser::get_primitive_data_length(&PrimitiveType::Double), - 8 + Some(8) ); assert_eq!( VariantParser::get_primitive_data_length(&PrimitiveType::Decimal16), - 16 + Some(16) ); + + // Test variable-length types (should return None) assert_eq!( VariantParser::get_primitive_data_length(&PrimitiveType::Binary), - 0 - ); // Variable length + None + ); assert_eq!( VariantParser::get_primitive_data_length(&PrimitiveType::String), - 0 - ); // Variable length + None + ); } #[test] From 32c55eac1f60964cbaa2ec1def4cbaa8aa5f5740 Mon Sep 17 00:00:00 2001 From: carpecodeum Date: Mon, 21 Jul 2025 09:21:28 -0400 Subject: [PATCH 12/18] remove redundancy with field_operations.rs and variant_parser.rs --- .../src/field_operations.rs | 480 +----------------- parquet-variant-compute/src/lib.rs | 2 +- parquet-variant-compute/src/variant_array.rs | 56 +- parquet-variant-compute/src/variant_parser.rs | 299 +---------- 4 files changed, 72 insertions(+), 765 deletions(-) diff --git a/parquet-variant-compute/src/field_operations.rs b/parquet-variant-compute/src/field_operations.rs index a0b6d6194269..44cd14be0398 100644 --- a/parquet-variant-compute/src/field_operations.rs +++ b/parquet-variant-compute/src/field_operations.rs @@ -16,477 +16,33 @@ // under the License. //! Field extraction and removal operations for variant objects +//! +//! NOTE: Most functionality in this module has been superseded by the high-level +//! Variant API (variant.as_object(), variant.get_object_field(), etc.). +//! For new code, prefer using the high-level API over these low-level operations. -use crate::variant_parser::{ObjectHeader, ObjectOffsets, VariantParser}; -use arrow::error::ArrowError; -use parquet_variant::{VariantMetadata, VariantPath, VariantPathElement}; -use std::collections::HashSet; +// This module is mostly empty now - the manual field operations have been +// replaced by high-level Variant API usage. See variant_array.rs for examples +// of how field removal is now implemented using VariantBuilder. /// Field operations for variant objects pub struct FieldOperations; +// Note: This struct is kept for backwards compatibility but most methods +// have been removed in favor of high-level Variant API usage. impl FieldOperations { - /// Extract field bytes from a single variant object - pub fn extract_field_bytes( - metadata_bytes: &[u8], - value_bytes: &[u8], - field_name: &str, - ) -> Result>, ArrowError> { - if !VariantParser::is_object(value_bytes) { - return Ok(None); - } - - let header_byte = value_bytes[0]; - let header = VariantParser::parse_object_header(header_byte)?; - let num_elements = VariantParser::unpack_int(&value_bytes[1..], header.num_elements_size)?; - let offsets = VariantParser::calculate_object_offsets(&header, num_elements); - - // Find field ID for the target field name - let target_field_id = Self::find_field_id(metadata_bytes, field_name)?; - let target_field_id = match target_field_id { - Some(id) => id, - None => return Ok(None), // Field not found - }; - - // Search for the field in the object - for i in 0..num_elements { - let field_id_offset = offsets.field_ids_start + (i * header.field_id_size); - let field_id = - VariantParser::unpack_int(&value_bytes[field_id_offset..], header.field_id_size)?; - - if field_id == target_field_id { - return Self::extract_field_value_at_index( - value_bytes, - &header, - &offsets, - i, - num_elements, - ); - } - } - - Ok(None) - } - - /// Remove field from a single variant object - pub fn remove_field_bytes( - metadata_bytes: &[u8], - value_bytes: &[u8], - field_name: &str, - ) -> Result>, ArrowError> { - Self::remove_fields_bytes(metadata_bytes, value_bytes, &[field_name]) - } - - /// Remove multiple fields from a single variant object - pub fn remove_fields_bytes( - metadata_bytes: &[u8], - value_bytes: &[u8], - field_names: &[&str], - ) -> Result>, ArrowError> { - if !VariantParser::is_object(value_bytes) { - return Ok(Some(value_bytes.to_vec())); - } - - let header_byte = value_bytes[0]; - let header = VariantParser::parse_object_header(header_byte)?; - let num_elements = VariantParser::unpack_int(&value_bytes[1..], header.num_elements_size)?; - let offsets = VariantParser::calculate_object_offsets(&header, num_elements); - - // Find field IDs for target field names - let target_field_ids = Self::find_field_ids(metadata_bytes, field_names)?; - - if target_field_ids.is_empty() { - return Ok(Some(value_bytes.to_vec())); // No fields to remove - } - - // Collect fields to keep - let fields_to_keep = Self::collect_fields_to_keep( - value_bytes, - &header, - &offsets, - num_elements, - &target_field_ids, - )?; - - if fields_to_keep.len() == num_elements { - return Ok(Some(value_bytes.to_vec())); // No fields were removed - } - - // Sort fields by name for proper variant object ordering - let sorted_fields = Self::sort_fields_by_name(metadata_bytes, fields_to_keep)?; - - // Reconstruct object with remaining fields - Self::reconstruct_object(sorted_fields) - } - - /// Find field ID for a given field name - fn find_field_id(metadata_bytes: &[u8], field_name: &str) -> Result, ArrowError> { - let metadata = VariantMetadata::try_new(metadata_bytes)?; - - for dict_idx in 0..metadata.len() { - if let Ok(name) = metadata.get(dict_idx) { - if name == field_name { - return Ok(Some(dict_idx)); - } - } - } - - Ok(None) - } - - /// Find field IDs for multiple field names - fn find_field_ids( - metadata_bytes: &[u8], - field_names: &[&str], - ) -> Result, ArrowError> { - let metadata = VariantMetadata::try_new(metadata_bytes)?; - let mut target_field_ids = HashSet::new(); - - for field_name in field_names { - for dict_idx in 0..metadata.len() { - if let Ok(name) = metadata.get(dict_idx) { - if name == *field_name { - target_field_ids.insert(dict_idx); - break; - } - } - } - } - - Ok(target_field_ids) - } - - /// Extract field value at a specific index - fn extract_field_value_at_index( - value_bytes: &[u8], - header: &ObjectHeader, - offsets: &ObjectOffsets, - field_index: usize, - num_elements: usize, - ) -> Result>, ArrowError> { - // Get all field offsets - let mut field_offsets = Vec::new(); - for i in 0..=num_elements { - let offset_idx = offsets.field_offsets_start + (i * header.field_offset_size); - let offset_val = - VariantParser::unpack_int(&value_bytes[offset_idx..], header.field_offset_size)?; - field_offsets.push(offset_val); - } - - let field_start = field_offsets[field_index]; - - // To find the end offset, we need to find the next field in byte order - // Since fields are stored in alphabetical order, we can't just use field_index + 1 - // We need to find the smallest offset that's greater than field_start - let mut field_end = field_offsets[num_elements]; // Default to final offset - - for i in 0..num_elements { - if i != field_index { - let other_offset = field_offsets[i]; - if other_offset > field_start && other_offset < field_end { - field_end = other_offset; - } - } - } - - let field_start_absolute = offsets.values_start + field_start; - let field_end_absolute = offsets.values_start + field_end; - - if field_start_absolute <= field_end_absolute && field_end_absolute <= value_bytes.len() { - let field_value_bytes = &value_bytes[field_start_absolute..field_end_absolute]; - Ok(Some(field_value_bytes.to_vec())) - } else { - Ok(None) - } - } - - /// Collect fields to keep (those not being removed) - fn collect_fields_to_keep( - value_bytes: &[u8], - header: &ObjectHeader, - offsets: &ObjectOffsets, - num_elements: usize, - target_field_ids: &HashSet, - ) -> Result)>, ArrowError> { - let mut fields_to_keep = Vec::new(); - - for i in 0..num_elements { - let field_id_offset = offsets.field_ids_start + (i * header.field_id_size); - let field_id = - VariantParser::unpack_int(&value_bytes[field_id_offset..], header.field_id_size)?; - - if !target_field_ids.contains(&field_id) { - if let Some(field_value) = Self::extract_field_value_at_index( - value_bytes, - header, - offsets, - i, - num_elements, - )? { - fields_to_keep.push((field_id, field_value)); - } - } - } - - Ok(fields_to_keep) - } - - /// Sort fields by their names (variant objects must be sorted alphabetically) - fn sort_fields_by_name( - metadata_bytes: &[u8], - mut fields: Vec<(usize, Vec)>, - ) -> Result)>, ArrowError> { - let metadata = VariantMetadata::try_new(metadata_bytes)?; - - fields.sort_by(|a, b| { - let name_a = metadata.get(a.0).unwrap_or(""); - let name_b = metadata.get(b.0).unwrap_or(""); - name_a.cmp(name_b) - }); - - Ok(fields) - } - - /// Reconstruct variant object from sorted fields - fn reconstruct_object(fields: Vec<(usize, Vec)>) -> Result>, ArrowError> { - let new_num_elements = fields.len(); - let new_is_large = new_num_elements > 255; - - // Calculate sizes for new object - let max_field_id = fields.iter().map(|(id, _)| *id).max().unwrap_or(0); - let new_field_id_size = VariantParser::calculate_int_size(max_field_id); - - let total_values_size: usize = fields.iter().map(|(_, value)| value.len()).sum(); - let new_field_offset_size = VariantParser::calculate_int_size(total_values_size); - - // Build new object - let mut new_value_bytes = Vec::new(); - - // Write header - let new_header = VariantParser::build_object_header( - new_is_large, - new_field_id_size, - new_field_offset_size, - ); - new_value_bytes.push(new_header); - - // Write num_elements - if new_is_large { - new_value_bytes.extend_from_slice(&(new_num_elements as u32).to_le_bytes()); - } else { - new_value_bytes.push(new_num_elements as u8); - } - - // Write field IDs - for (field_id, _) in &fields { - VariantParser::write_int_bytes(&mut new_value_bytes, *field_id, new_field_id_size); - } - - // Write field offsets - let mut current_offset = 0; - for (_, field_value) in &fields { - VariantParser::write_int_bytes( - &mut new_value_bytes, - current_offset, - new_field_offset_size, - ); - current_offset += field_value.len(); - } - // Write final offset - VariantParser::write_int_bytes(&mut new_value_bytes, current_offset, new_field_offset_size); - - // Write field values - for (_, field_value) in &fields { - new_value_bytes.extend_from_slice(field_value); - } - - Ok(Some(new_value_bytes)) - } - - /// Get the bytes at a specific path through the variant data - pub fn get_path_bytes( - metadata_bytes: &[u8], - value_bytes: &[u8], - path: &VariantPath, - ) -> Result>, ArrowError> { - let mut current_value = value_bytes.to_vec(); - - for element in path.iter() { - match element { - VariantPathElement::Field { name } => { - if let Some(field_bytes) = - Self::get_field_bytes(metadata_bytes, ¤t_value, name)? - { - current_value = field_bytes; - } else { - return Ok(None); - } - } - VariantPathElement::Index { index } => { - if let Some(element_bytes) = - Self::get_array_element_bytes(metadata_bytes, ¤t_value, *index)? - { - current_value = element_bytes; - } else { - return Ok(None); - } - } - } - } - - Ok(Some(current_value)) - } - - /// Get the value at a specific path and return its type and data - pub fn get_path_with_type( - metadata_bytes: &[u8], - value_bytes: &[u8], - path: &VariantPath, - ) -> Result)>, ArrowError> { - if let Some(value_bytes) = Self::get_path_bytes(metadata_bytes, value_bytes, path)? { - if !value_bytes.is_empty() { - let variant_type = VariantParser::parse_variant_header(value_bytes[0])?; - return Ok(Some((variant_type, value_bytes))); - } - } - Ok(None) - } - - /// Get field bytes from an object at the byte level - fn get_field_bytes( - metadata_bytes: &[u8], - value_bytes: &[u8], - field_name: &str, - ) -> Result>, ArrowError> { - // Use the general dispatch parser to ensure we're dealing with an object - if !value_bytes.is_empty() { - match VariantParser::parse_variant_header(value_bytes[0])? { - crate::variant_parser::VariantType::Object(_) => { - Self::extract_field_bytes(metadata_bytes, value_bytes, field_name) - } - _ => Ok(None), // Not an object, can't extract fields - } - } else { - Ok(None) - } - } - - /// Get array element bytes at the byte level - fn get_array_element_bytes( - _metadata_bytes: &[u8], - value_bytes: &[u8], - index: usize, - ) -> Result>, ArrowError> { - // Use the general dispatch parser to ensure we're dealing with an array - if value_bytes.is_empty() { - return Ok(None); - } - - match VariantParser::parse_variant_header(value_bytes[0])? { - crate::variant_parser::VariantType::Array(array_header) => { - let num_elements = - VariantParser::unpack_int(&value_bytes[1..], array_header.num_elements_size)?; - - // Check bounds - if index >= num_elements { - return Ok(None); - } - - // Calculate array offsets - let offsets = VariantParser::calculate_array_offsets(&array_header, num_elements); - - // Get element offset - let element_offset_start = - offsets.element_offsets_start + index * array_header.element_offset_size; - let element_offset_end = element_offset_start + array_header.element_offset_size; - - if element_offset_end > value_bytes.len() { - return Err(ArrowError::InvalidArgumentError( - "Element offset exceeds value buffer".to_string(), - )); - } - - let element_offset = VariantParser::unpack_int( - &value_bytes[element_offset_start..element_offset_end], - array_header.element_offset_size, - )?; - - // Get next element offset (or end of data) - let next_offset = if index + 1 < num_elements { - let next_element_offset_start = offsets.element_offsets_start - + (index + 1) * array_header.element_offset_size; - let next_element_offset_end = - next_element_offset_start + array_header.element_offset_size; - VariantParser::unpack_int( - &value_bytes[next_element_offset_start..next_element_offset_end], - array_header.element_offset_size, - )? - } else { - value_bytes.len() - }; - - // Extract element bytes - let element_start = offsets.elements_start + element_offset; - let element_end = offsets.elements_start + next_offset; - - if element_end > value_bytes.len() { - return Err(ArrowError::InvalidArgumentError( - "Element data exceeds value buffer".to_string(), - )); - } - - Ok(Some(value_bytes[element_start..element_end].to_vec())) - } - _ => Ok(None), // Not an array, can't extract elements - } - } + // All manual field manipulation methods have been removed. + // Use the high-level Variant API instead: + // - variant.get_object_field(name) instead of extract_field_bytes() + // - variant.get_list_element(index) instead of array element extraction + // - variant.get_path(&path) instead of get_path_bytes() + // - VariantBuilder::new_object() instead of manual object reconstruction } #[cfg(test)] mod tests { - use super::*; - use parquet_variant::VariantBuilder; - - fn create_test_object() -> (Vec, Vec) { - let mut builder = VariantBuilder::new(); - builder - .new_object() - .with_field("name", "Alice") - .with_field("age", 30i32) - .with_field("city", "NYC") - .finish() - .unwrap(); - builder.finish() - } - - #[test] - fn test_extract_field_bytes() { - let (metadata, value) = create_test_object(); - - let name_bytes = FieldOperations::extract_field_bytes(&metadata, &value, "name").unwrap(); - assert!(name_bytes.is_some()); - - let nonexistent_bytes = - FieldOperations::extract_field_bytes(&metadata, &value, "nonexistent").unwrap(); - assert!(nonexistent_bytes.is_none()); - } - - #[test] - fn test_remove_field_bytes() { - let (metadata, value) = create_test_object(); - - let result = FieldOperations::remove_field_bytes(&metadata, &value, "city").unwrap(); - assert!(result.is_some()); + // Tests have been removed since the functions they tested are no longer needed. + // Field operations are now tested as part of variant_array.rs tests. +} - // Verify the field was removed by checking we can't extract it - let new_value = result.unwrap(); - let city_bytes = - FieldOperations::extract_field_bytes(&metadata, &new_value, "city").unwrap(); - assert!(city_bytes.is_none()); - // Verify other fields are still there - let name_bytes = - FieldOperations::extract_field_bytes(&metadata, &new_value, "name").unwrap(); - assert!(name_bytes.is_some()); - } -} diff --git a/parquet-variant-compute/src/lib.rs b/parquet-variant-compute/src/lib.rs index e3d20ec50f12..903e0c801d06 100644 --- a/parquet-variant-compute/src/lib.rs +++ b/parquet-variant-compute/src/lib.rs @@ -31,5 +31,5 @@ pub use variant_array::VariantArray; pub use variant_array_builder::VariantArrayBuilder; pub use variant_get::{variant_get, GetOptions}; pub use variant_parser::{ - ArrayHeader, ObjectHeader, PrimitiveType, ShortStringHeader, VariantBasicType, VariantType, + PrimitiveType, ShortStringHeader, VariantBasicType, VariantType, }; diff --git a/parquet-variant-compute/src/variant_array.rs b/parquet-variant-compute/src/variant_array.rs index c05deb282d16..6dcbff1d24a1 100644 --- a/parquet-variant-compute/src/variant_array.rs +++ b/parquet-variant-compute/src/variant_array.rs @@ -17,7 +17,7 @@ //! [`VariantArray`] implementation -use crate::field_operations::FieldOperations; + use arrow::array::{Array, ArrayData, ArrayRef, AsArray, StructArray}; use arrow::buffer::NullBuffer; use arrow_schema::{ArrowError, DataType}; @@ -204,44 +204,42 @@ impl VariantArray { /// Create a new VariantArray with a field removed from all variants pub fn with_field_removed(&self, field_name: &str) -> Result { - let mut builder = crate::variant_array_builder::VariantArrayBuilder::new(self.len()); - - for i in 0..self.len() { - if self.is_null(i) { - builder.append_null(); - } else { - let new_value = FieldOperations::remove_field_bytes( - self.metadata_bytes(i), - self.value_bytes(i), - field_name, - )?; - - // Use original value if the field didn't exist - let new_value = new_value.as_deref().unwrap_or_else(|| self.value_bytes(i)); - builder.append_variant_buffers(self.metadata_bytes(i), new_value); - } - } - - Ok(builder.build()) + self.with_fields_removed(&[field_name]) } /// Create a new VariantArray with multiple fields removed from all variants pub fn with_fields_removed(&self, field_names: &[&str]) -> Result { + use parquet_variant::VariantBuilder; + use std::collections::HashSet; + + let fields_to_remove: HashSet<&str> = field_names.iter().copied().collect(); let mut builder = crate::variant_array_builder::VariantArrayBuilder::new(self.len()); for i in 0..self.len() { if self.is_null(i) { builder.append_null(); } else { - let new_value = FieldOperations::remove_fields_bytes( - self.metadata_bytes(i), - self.value_bytes(i), - field_names, - )?; - - // Use original value if no fields existed - let new_value = new_value.as_deref().unwrap_or_else(|| self.value_bytes(i)); - builder.append_variant_buffers(self.metadata_bytes(i), new_value); + let variant = self.value(i); + + // If it's an object, create a new object without the specified fields + if let Some(obj) = variant.as_object() { + let mut variant_builder = VariantBuilder::new(); + let mut object_builder = variant_builder.new_object(); + + // Add all fields except the ones to remove + for (field_name, field_value) in obj.iter() { + if !fields_to_remove.contains(field_name) { + object_builder.insert(field_name, field_value); + } + } + + object_builder.finish().unwrap(); + let (metadata, value) = variant_builder.finish(); + builder.append_variant_buffers(&metadata, &value); + } else { + // Not an object, append as-is + builder.append_variant(variant); + } } } diff --git a/parquet-variant-compute/src/variant_parser.rs b/parquet-variant-compute/src/variant_parser.rs index 5cf70b70f7f9..779d50ba2a4b 100644 --- a/parquet-variant-compute/src/variant_parser.rs +++ b/parquet-variant-compute/src/variant_parser.rs @@ -55,8 +55,6 @@ pub enum PrimitiveType { pub enum VariantType { Primitive(PrimitiveType), ShortString(ShortStringHeader), - Object(ObjectHeader), - Array(ArrayHeader), } /// Short string header structure @@ -65,61 +63,13 @@ pub struct ShortStringHeader { pub length: usize, } -/// Object header structure for variant objects -#[derive(Debug, Clone, PartialEq)] -pub struct ObjectHeader { - pub num_elements_size: usize, - pub field_id_size: usize, - pub field_offset_size: usize, - pub is_large: bool, -} - -/// Array header structure for variant objects -#[derive(Debug, Clone, PartialEq)] -pub struct ArrayHeader { - pub num_elements_size: usize, - pub element_offset_size: usize, - pub is_large: bool, -} - -/// Object byte offsets structure -#[derive(Debug, Clone)] -pub struct ObjectOffsets { - pub field_ids_start: usize, - pub field_offsets_start: usize, - pub values_start: usize, -} -/// Array byte offsets structure -#[derive(Debug, Clone)] -pub struct ArrayOffsets { - pub element_offsets_start: usize, - pub elements_start: usize, -} /// Low-level parser for variant binary format pub struct VariantParser; impl VariantParser { - /// General dispatch function to parse any variant header - pub fn parse_variant_header(header_byte: u8) -> Result { - let basic_type = Self::get_basic_type(header_byte); - - match basic_type { - VariantBasicType::Primitive => Ok(VariantType::Primitive( - Self::parse_primitive_header(header_byte)?, - )), - VariantBasicType::ShortString => Ok(VariantType::ShortString( - Self::parse_short_string_header(header_byte)?, - )), - VariantBasicType::Object => { - Ok(VariantType::Object(Self::parse_object_header(header_byte)?)) - } - VariantBasicType::Array => { - Ok(VariantType::Array(Self::parse_array_header(header_byte)?)) - } - } - } + /// Parse primitive type header pub fn parse_primitive_header(header_byte: u8) -> Result { @@ -178,40 +128,7 @@ impl VariantParser { Ok(ShortStringHeader { length }) } - /// Parse object header from header byte - pub fn parse_object_header(header_byte: u8) -> Result { - let value_header = header_byte >> 2; - let field_offset_size_minus_one = value_header & 0x03; - let field_id_size_minus_one = (value_header >> 2) & 0x03; - let is_large = (value_header & 0x10) != 0; - - let num_elements_size = if is_large { 4 } else { 1 }; - let field_id_size = (field_id_size_minus_one + 1) as usize; - let field_offset_size = (field_offset_size_minus_one + 1) as usize; - - Ok(ObjectHeader { - num_elements_size, - field_id_size, - field_offset_size, - is_large, - }) - } - - /// Parse array header from header byte - pub fn parse_array_header(header_byte: u8) -> Result { - let value_header = header_byte >> 2; - let element_offset_size_minus_one = value_header & 0x03; - let is_large = (value_header & 0x10) != 0; - let num_elements_size = if is_large { 4 } else { 1 }; - let element_offset_size = (element_offset_size_minus_one + 1) as usize; - - Ok(ArrayHeader { - num_elements_size, - element_offset_size, - is_large, - }) - } /// Unpack integer from bytes pub fn unpack_int(bytes: &[u8], size: usize) -> Result { @@ -273,40 +190,6 @@ impl VariantParser { } } - - - /// Check if value bytes represent a primitive - pub fn is_primitive(value_bytes: &[u8]) -> bool { - if value_bytes.is_empty() { - return false; - } - Self::get_basic_type(value_bytes[0]) == VariantBasicType::Primitive - } - - /// Check if value bytes represent a short string - pub fn is_short_string(value_bytes: &[u8]) -> bool { - if value_bytes.is_empty() { - return false; - } - Self::get_basic_type(value_bytes[0]) == VariantBasicType::ShortString - } - - /// Check if value bytes represent an object - pub fn is_object(value_bytes: &[u8]) -> bool { - if value_bytes.is_empty() { - return false; - } - Self::get_basic_type(value_bytes[0]) == VariantBasicType::Object - } - - /// Check if value bytes represent an array - pub fn is_array(value_bytes: &[u8]) -> bool { - if value_bytes.is_empty() { - return false; - } - Self::get_basic_type(value_bytes[0]) == VariantBasicType::Array - } - /// Get the data length for a primitive type /// Returns Some(len) for fixed-length types, None for variable-length types pub fn get_primitive_data_length(primitive_type: &PrimitiveType) -> Option { @@ -328,98 +211,44 @@ impl VariantParser { } } - /// Extract short string data from value bytes - pub fn extract_short_string_data(value_bytes: &[u8]) -> Result<&[u8], ArrowError> { - if value_bytes.is_empty() { - return Err(ArrowError::InvalidArgumentError( - "Empty value bytes".to_string(), - )); - } - let header = Self::parse_short_string_header(value_bytes[0])?; - if value_bytes.len() < 1 + header.length { - return Err(ArrowError::InvalidArgumentError(format!( - "Short string data length {} exceeds available bytes", - header.length - ))); + // Legacy type checking functions - kept for backwards compatibility but consider using Variant pattern matching instead + + /// Check if value bytes represent a primitive + /// NOTE: Consider using `matches!(variant, Variant::Int32(_) | Variant::String(_) | ...)` instead + pub fn is_primitive(value_bytes: &[u8]) -> bool { + if value_bytes.is_empty() { + return false; } - - Ok(&value_bytes[1..1 + header.length]) + Self::get_basic_type(value_bytes[0]) == VariantBasicType::Primitive } - /// Extract primitive data from value bytes - pub fn extract_primitive_data(value_bytes: &[u8]) -> Result<&[u8], ArrowError> { + /// Check if value bytes represent a short string + /// NOTE: Consider using `matches!(variant, Variant::ShortString(_))` instead + pub fn is_short_string(value_bytes: &[u8]) -> bool { if value_bytes.is_empty() { - return Err(ArrowError::InvalidArgumentError( - "Empty value bytes".to_string(), - )); - } - - let primitive_type = Self::parse_primitive_header(value_bytes[0])?; - let data_length = Self::get_primitive_data_length(&primitive_type); - - match data_length { - Some(0) => { - // Fixed-length 0-byte types (null/true/false) - Ok(&[]) - } - Some(len) => { - // Fixed-length types with len bytes - if value_bytes.len() < 1 + len { - return Err(ArrowError::InvalidArgumentError(format!( - "Fixed length primitive data length {} exceeds available bytes", - len - ))); - } - Ok(&value_bytes[1..1 + len]) - } - None => { - // Variable-length types (binary/string) - read length from data - if value_bytes.len() < 5 { - return Err(ArrowError::InvalidArgumentError( - "Not enough bytes for variable length primitive".to_string(), - )); - } - let length = u32::from_le_bytes([ - value_bytes[1], - value_bytes[2], - value_bytes[3], - value_bytes[4], - ]) as usize; - if value_bytes.len() < 5 + length { - return Err(ArrowError::InvalidArgumentError( - "Variable length primitive data exceeds available bytes".to_string(), - )); - } - Ok(&value_bytes[5..5 + length]) - } + return false; } + Self::get_basic_type(value_bytes[0]) == VariantBasicType::ShortString } - /// Calculate byte offsets for array elements - pub fn calculate_array_offsets(header: &ArrayHeader, num_elements: usize) -> ArrayOffsets { - let element_offsets_start = 1 + header.num_elements_size; - let elements_start = - element_offsets_start + ((num_elements + 1) * header.element_offset_size); - - ArrayOffsets { - element_offsets_start, - elements_start, + /// Check if value bytes represent an object + /// NOTE: Consider using `variant.as_object().is_some()` instead + pub fn is_object(value_bytes: &[u8]) -> bool { + if value_bytes.is_empty() { + return false; } + Self::get_basic_type(value_bytes[0]) == VariantBasicType::Object } - /// Calculate byte offsets for object fields - pub fn calculate_object_offsets(header: &ObjectHeader, num_elements: usize) -> ObjectOffsets { - let field_ids_start = 1 + header.num_elements_size; - let field_offsets_start = field_ids_start + (num_elements * header.field_id_size); - let values_start = field_offsets_start + ((num_elements + 1) * header.field_offset_size); - - ObjectOffsets { - field_ids_start, - field_offsets_start, - values_start, + /// Check if value bytes represent an array + /// NOTE: Consider using `variant.as_list().is_some()` instead + pub fn is_array(value_bytes: &[u8]) -> bool { + if value_bytes.is_empty() { + return false; } + Self::get_basic_type(value_bytes[0]) == VariantBasicType::Array } } @@ -513,36 +342,7 @@ mod tests { } } - #[test] - fn test_parse_variant_header_dispatch() { - // Test primitive dispatch - let primitive_header = 0b00000100; // True primitive - match VariantParser::parse_variant_header(primitive_header).unwrap() { - VariantType::Primitive(PrimitiveType::True) => {} - _ => panic!("Expected primitive True"), - } - - // Test short string dispatch - let short_string_header = 0b00010101; // 5-length short string - match VariantParser::parse_variant_header(short_string_header).unwrap() { - VariantType::ShortString(ShortStringHeader { length: 5 }) => {} - _ => panic!("Expected short string with length 5"), - } - // Test object dispatch - let object_header = 0b00000010; // Basic object - match VariantParser::parse_variant_header(object_header).unwrap() { - VariantType::Object(_) => {} - _ => panic!("Expected object"), - } - - // Test array dispatch - let array_header = 0b00000011; // Basic array - match VariantParser::parse_variant_header(array_header).unwrap() { - VariantType::Array(_) => {} - _ => panic!("Expected array"), - } - } #[test] fn test_basic_type_checks() { @@ -618,52 +418,5 @@ mod tests { ); } - #[test] - fn test_extract_short_string_data() { - // Test 0-length short string - let data = &[0b00000001]; // 0-length short string header - assert_eq!( - VariantParser::extract_short_string_data(data).unwrap(), - &[] as &[u8] - ); - // Test 5-length short string - let data = &[0b00010101, b'H', b'e', b'l', b'l', b'o']; // 5-length short string + "Hello" - assert_eq!( - VariantParser::extract_short_string_data(data).unwrap(), - b"Hello" - ); - - // Test insufficient data - let data = &[0b00010101, b'H', b'i']; // Claims 5 bytes but only has 2 - assert!(VariantParser::extract_short_string_data(data).is_err()); - } - - #[test] - fn test_extract_primitive_data() { - // Test null (no data) - let data = &[0b00000000]; // Null header - assert_eq!( - VariantParser::extract_primitive_data(data).unwrap(), - &[] as &[u8] - ); - - // Test true (no data) - let data = &[0b00000100]; // True header - assert_eq!( - VariantParser::extract_primitive_data(data).unwrap(), - &[] as &[u8] - ); - - // Test int32 (4 bytes) - let data = &[0b00010100, 0x2A, 0x00, 0x00, 0x00]; // Int32 header + 42 in little endian - assert_eq!( - VariantParser::extract_primitive_data(data).unwrap(), - &[0x2A, 0x00, 0x00, 0x00] - ); - - // Test insufficient data for int32 - let data = &[0b00010100, 0x2A, 0x00]; // Int32 header but only 2 bytes - assert!(VariantParser::extract_primitive_data(data).is_err()); - } } From 3b3c1910f591ae544557c25e357d102283991644 Mon Sep 17 00:00:00 2001 From: carpecodeum Date: Mon, 21 Jul 2025 09:27:58 -0400 Subject: [PATCH 13/18] [REMOVE] revert field_operations.rs --- .../benches/variant_kernels.rs | 3 +- .../src/field_operations.rs | 48 ------------------- parquet-variant-compute/src/lib.rs | 1 - parquet-variant-compute/src/variant_parser.rs | 9 +--- parquet-variant/Cargo.toml | 1 + 5 files changed, 5 insertions(+), 57 deletions(-) delete mode 100644 parquet-variant-compute/src/field_operations.rs diff --git a/parquet-variant-compute/benches/variant_kernels.rs b/parquet-variant-compute/benches/variant_kernels.rs index d4007076bbae..8fd6af333fed 100644 --- a/parquet-variant-compute/benches/variant_kernels.rs +++ b/parquet-variant-compute/benches/variant_kernels.rs @@ -19,7 +19,8 @@ use arrow::array::{Array, ArrayRef, StringArray}; use arrow::util::test_util::seedable_rng; use criterion::{criterion_group, criterion_main, Criterion}; use parquet_variant::{Variant, VariantBuilder}; -use parquet_variant_compute::{batch_json_string_to_variant, variant_get, GetOptions, VariantArray, VariantArrayBuilder}; +use parquet_variant_compute::variant_get::{variant_get, GetOptions}; +use parquet_variant_compute::{batch_json_string_to_variant, VariantArray, VariantArrayBuilder}; use rand::distr::Alphanumeric; use rand::rngs::StdRng; use rand::Rng; diff --git a/parquet-variant-compute/src/field_operations.rs b/parquet-variant-compute/src/field_operations.rs deleted file mode 100644 index 44cd14be0398..000000000000 --- a/parquet-variant-compute/src/field_operations.rs +++ /dev/null @@ -1,48 +0,0 @@ -// Licensed to the Apache Software Foundation (ASF) under one -// or more contributor license agreements. See the NOTICE file -// distributed with this work for additional information -// regarding copyright ownership. The ASF licenses this file -// to you under the Apache License, Version 2.0 (the -// "License"); you may not use this file except in compliance -// with the License. You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, -// software distributed under the License is distributed on an -// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -// KIND, either express or implied. See the License for the -// specific language governing permissions and limitations -// under the License. - -//! Field extraction and removal operations for variant objects -//! -//! NOTE: Most functionality in this module has been superseded by the high-level -//! Variant API (variant.as_object(), variant.get_object_field(), etc.). -//! For new code, prefer using the high-level API over these low-level operations. - -// This module is mostly empty now - the manual field operations have been -// replaced by high-level Variant API usage. See variant_array.rs for examples -// of how field removal is now implemented using VariantBuilder. - -/// Field operations for variant objects -pub struct FieldOperations; - -// Note: This struct is kept for backwards compatibility but most methods -// have been removed in favor of high-level Variant API usage. -impl FieldOperations { - // All manual field manipulation methods have been removed. - // Use the high-level Variant API instead: - // - variant.get_object_field(name) instead of extract_field_bytes() - // - variant.get_list_element(index) instead of array element extraction - // - variant.get_path(&path) instead of get_path_bytes() - // - VariantBuilder::new_object() instead of manual object reconstruction -} - -#[cfg(test)] -mod tests { - // Tests have been removed since the functions they tested are no longer needed. - // Field operations are now tested as part of variant_array.rs tests. -} - - diff --git a/parquet-variant-compute/src/lib.rs b/parquet-variant-compute/src/lib.rs index 903e0c801d06..683e95f2f572 100644 --- a/parquet-variant-compute/src/lib.rs +++ b/parquet-variant-compute/src/lib.rs @@ -17,7 +17,6 @@ //! Parquet variant compute functions -pub mod field_operations; pub mod from_json; pub mod to_json; pub mod variant_array; diff --git a/parquet-variant-compute/src/variant_parser.rs b/parquet-variant-compute/src/variant_parser.rs index 779d50ba2a4b..2332326bf618 100644 --- a/parquet-variant-compute/src/variant_parser.rs +++ b/parquet-variant-compute/src/variant_parser.rs @@ -118,12 +118,7 @@ impl VariantParser { let length = (header_byte >> 2) as usize; // Short strings can be up to 64 bytes (6-bit value: 0-63) - if length > 63 { - return Err(ArrowError::InvalidArgumentError(format!( - "Short string length {} exceeds maximum of 63", - length - ))); - } + // Note: Since header_byte is u8, header_byte >> 2 can never exceed 63, so no bounds check needed Ok(ShortStringHeader { length }) } @@ -335,7 +330,7 @@ mod tests { ShortStringHeader { length: 63 } ); - // Test that all values 0-63 are valid + // Test that all values 0-63 are valid (these are all possible values since u8 >> 2 max is 63) for length in 0..=63 { let header_byte = (length << 2) | 1; // short string type assert!(VariantParser::parse_short_string_header(header_byte as u8).is_ok()); diff --git a/parquet-variant/Cargo.toml b/parquet-variant/Cargo.toml index 51fa4cc23311..f3b07f441e48 100644 --- a/parquet-variant/Cargo.toml +++ b/parquet-variant/Cargo.toml @@ -30,6 +30,7 @@ readme = "README.md" edition = { workspace = true } rust-version = { workspace = true } + [dependencies] arrow-schema = { workspace = true } chrono = { workspace = true } From 01f0be7b5cceb6becf5bace5977b0d16061e0952 Mon Sep 17 00:00:00 2001 From: carpecodeum Date: Mon, 21 Jul 2025 09:30:15 -0400 Subject: [PATCH 14/18] [REMOVE] remove extra lines in cargo.toml --- parquet-variant-compute/Cargo.toml | 1 + parquet-variant/Cargo.toml | 1 - 2 files changed, 1 insertion(+), 1 deletion(-) diff --git a/parquet-variant-compute/Cargo.toml b/parquet-variant-compute/Cargo.toml index 68b9823c8dc8..cc13810a2971 100644 --- a/parquet-variant-compute/Cargo.toml +++ b/parquet-variant-compute/Cargo.toml @@ -29,6 +29,7 @@ keywords = ["arrow", "parquet", "variant"] edition = { workspace = true } rust-version = { workspace = true } + [dependencies] arrow = { workspace = true } arrow-schema = { workspace = true } diff --git a/parquet-variant/Cargo.toml b/parquet-variant/Cargo.toml index f3b07f441e48..51fa4cc23311 100644 --- a/parquet-variant/Cargo.toml +++ b/parquet-variant/Cargo.toml @@ -30,7 +30,6 @@ readme = "README.md" edition = { workspace = true } rust-version = { workspace = true } - [dependencies] arrow-schema = { workspace = true } chrono = { workspace = true } From eb238341afc4beeb9d9a336a9c5bc75cd9a48785 Mon Sep 17 00:00:00 2001 From: carpecodeum Date: Mon, 21 Jul 2025 09:44:56 -0400 Subject: [PATCH 15/18] [REMOVE] remove variant_parser.rs file as decoder.rs already has major functionalities --- parquet-variant-compute/src/lib.rs | 4 - parquet-variant-compute/src/variant_parser.rs | 417 ------------------ 2 files changed, 421 deletions(-) delete mode 100644 parquet-variant-compute/src/variant_parser.rs diff --git a/parquet-variant-compute/src/lib.rs b/parquet-variant-compute/src/lib.rs index 683e95f2f572..a2027e2e4f57 100644 --- a/parquet-variant-compute/src/lib.rs +++ b/parquet-variant-compute/src/lib.rs @@ -22,13 +22,9 @@ pub mod to_json; pub mod variant_array; pub mod variant_array_builder; pub mod variant_get; -pub mod variant_parser; pub use from_json::batch_json_string_to_variant; pub use to_json::batch_variant_to_json_string; pub use variant_array::VariantArray; pub use variant_array_builder::VariantArrayBuilder; pub use variant_get::{variant_get, GetOptions}; -pub use variant_parser::{ - PrimitiveType, ShortStringHeader, VariantBasicType, VariantType, -}; diff --git a/parquet-variant-compute/src/variant_parser.rs b/parquet-variant-compute/src/variant_parser.rs deleted file mode 100644 index 2332326bf618..000000000000 --- a/parquet-variant-compute/src/variant_parser.rs +++ /dev/null @@ -1,417 +0,0 @@ -// Licensed to the Apache Software Foundation (ASF) under one -// or more contributor license agreements. See the NOTICE file -// distributed with this work for additional information -// regarding copyright ownership. The ASF licenses this file -// to you under the Apache License, Version 2.0 (the -// "License"); you may not use this file except in compliance -// with the License. You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, -// software distributed under the License is distributed on an -// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -// KIND, either express or implied. See the License for the -// specific language governing permissions and limitations -// under the License. - -//! Low-level binary format parsing for variant objects - -use arrow::error::ArrowError; - -/// Basic variant type enumeration for the first 2 bits of header -#[derive(Debug, Clone, PartialEq)] -pub enum VariantBasicType { - Primitive = 0, - ShortString = 1, - Object = 2, - Array = 3, -} - -/// Primitive type variants -#[derive(Debug, Clone, PartialEq)] -pub enum PrimitiveType { - Null, - True, - False, - Int8, - Int16, - Int32, - Int64, - Double, - Decimal4, - Decimal8, - Decimal16, - Date, - TimestampNtz, - TimestampLtz, - Float, - Binary, - String, -} - -/// Variant type enumeration covering all possible types -#[derive(Debug, Clone, PartialEq)] -pub enum VariantType { - Primitive(PrimitiveType), - ShortString(ShortStringHeader), -} - -/// Short string header structure -#[derive(Debug, Clone, PartialEq)] -pub struct ShortStringHeader { - pub length: usize, -} - - - -/// Low-level parser for variant binary format -pub struct VariantParser; - -impl VariantParser { - - - /// Parse primitive type header - pub fn parse_primitive_header(header_byte: u8) -> Result { - let primitive_type = header_byte >> 2; - - match primitive_type { - 0 => Ok(PrimitiveType::Null), - 1 => Ok(PrimitiveType::True), - 2 => Ok(PrimitiveType::False), - 3 => Ok(PrimitiveType::Int8), - 4 => Ok(PrimitiveType::Int16), - 5 => Ok(PrimitiveType::Int32), - 6 => Ok(PrimitiveType::Int64), - 7 => Ok(PrimitiveType::Double), - 8 => Ok(PrimitiveType::Decimal4), - 9 => Ok(PrimitiveType::Decimal8), - 10 => Ok(PrimitiveType::Decimal16), - 11 => Ok(PrimitiveType::Date), - 12 => Ok(PrimitiveType::TimestampNtz), - 13 => Ok(PrimitiveType::TimestampLtz), - 14 => Ok(PrimitiveType::Float), - 15 => Ok(PrimitiveType::Binary), - 16 => Ok(PrimitiveType::String), - _ => Err(ArrowError::InvalidArgumentError(format!( - "Invalid primitive type: {}", - primitive_type - ))), - } - } - - /// Get the basic type from header byte - pub fn get_basic_type(header_byte: u8) -> VariantBasicType { - match header_byte & 0x03 { - 0 => VariantBasicType::Primitive, - 1 => VariantBasicType::ShortString, - 2 => VariantBasicType::Object, - 3 => VariantBasicType::Array, - _ => panic!("Invalid basic type: {}", header_byte & 0x03), - } - } - - - - /// Parse short string header - pub fn parse_short_string_header(header_byte: u8) -> Result { - let length = (header_byte >> 2) as usize; - - // Short strings can be up to 64 bytes (6-bit value: 0-63) - // Note: Since header_byte is u8, header_byte >> 2 can never exceed 63, so no bounds check needed - - Ok(ShortStringHeader { length }) - } - - - - /// Unpack integer from bytes - pub fn unpack_int(bytes: &[u8], size: usize) -> Result { - if bytes.len() < size { - return Err(ArrowError::InvalidArgumentError( - "Not enough bytes to unpack integer".to_string(), - )); - } - - match size { - 1 => Ok(bytes[0] as usize), - 2 => Ok(u16::from_le_bytes([bytes[0], bytes[1]]) as usize), - 3 => Ok(u32::from_le_bytes([bytes[0], bytes[1], bytes[2], 0]) as usize), - 4 => Ok(u32::from_le_bytes([bytes[0], bytes[1], bytes[2], bytes[3]]) as usize), - _ => Err(ArrowError::InvalidArgumentError(format!( - "Invalid integer size: {}", - size - ))), - } - } - - /// Calculate the size needed to store an integer - pub fn calculate_int_size(value: usize) -> usize { - if value <= u8::MAX as usize { - 1 - } else if value <= u16::MAX as usize { - 2 - } else if value <= 0xFFFFFF { - 3 - } else { - 4 - } - } - - /// Build object header byte - pub fn build_object_header( - is_large: bool, - field_id_size: usize, - field_offset_size: usize, - ) -> u8 { - let large_bit = if is_large { 1 } else { 0 }; - (large_bit << 6) - | (((field_id_size - 1) as u8) << 4) - | (((field_offset_size - 1) as u8) << 2) - | 2 - } - - /// Write integer bytes to buffer - pub fn write_int_bytes(buffer: &mut Vec, value: usize, size: usize) { - match size { - 1 => buffer.push(value as u8), - 2 => buffer.extend_from_slice(&(value as u16).to_le_bytes()), - 3 => { - let bytes = (value as u32).to_le_bytes(); - buffer.extend_from_slice(&bytes[..3]); - } - 4 => buffer.extend_from_slice(&(value as u32).to_le_bytes()), - _ => panic!("Invalid size: {}", size), - } - } - - /// Get the data length for a primitive type - /// Returns Some(len) for fixed-length types, None for variable-length types - pub fn get_primitive_data_length(primitive_type: &PrimitiveType) -> Option { - match primitive_type { - PrimitiveType::Null | PrimitiveType::True | PrimitiveType::False => Some(0), - PrimitiveType::Int8 => Some(1), - PrimitiveType::Int16 => Some(2), - PrimitiveType::Int32 - | PrimitiveType::Float - | PrimitiveType::Decimal4 - | PrimitiveType::Date => Some(4), - PrimitiveType::Int64 - | PrimitiveType::Double - | PrimitiveType::Decimal8 - | PrimitiveType::TimestampNtz - | PrimitiveType::TimestampLtz => Some(8), - PrimitiveType::Decimal16 => Some(16), - PrimitiveType::Binary | PrimitiveType::String => None, // Variable length, need to read from data - } - } - - - - // Legacy type checking functions - kept for backwards compatibility but consider using Variant pattern matching instead - - /// Check if value bytes represent a primitive - /// NOTE: Consider using `matches!(variant, Variant::Int32(_) | Variant::String(_) | ...)` instead - pub fn is_primitive(value_bytes: &[u8]) -> bool { - if value_bytes.is_empty() { - return false; - } - Self::get_basic_type(value_bytes[0]) == VariantBasicType::Primitive - } - - /// Check if value bytes represent a short string - /// NOTE: Consider using `matches!(variant, Variant::ShortString(_))` instead - pub fn is_short_string(value_bytes: &[u8]) -> bool { - if value_bytes.is_empty() { - return false; - } - Self::get_basic_type(value_bytes[0]) == VariantBasicType::ShortString - } - - /// Check if value bytes represent an object - /// NOTE: Consider using `variant.as_object().is_some()` instead - pub fn is_object(value_bytes: &[u8]) -> bool { - if value_bytes.is_empty() { - return false; - } - Self::get_basic_type(value_bytes[0]) == VariantBasicType::Object - } - - /// Check if value bytes represent an array - /// NOTE: Consider using `variant.as_list().is_some()` instead - pub fn is_array(value_bytes: &[u8]) -> bool { - if value_bytes.is_empty() { - return false; - } - Self::get_basic_type(value_bytes[0]) == VariantBasicType::Array - } -} - -#[cfg(test)] -mod tests { - use super::*; - - #[test] - fn test_unpack_int() { - assert_eq!(VariantParser::unpack_int(&[42], 1).unwrap(), 42); - assert_eq!(VariantParser::unpack_int(&[0, 1], 2).unwrap(), 256); - assert_eq!(VariantParser::unpack_int(&[0, 0, 1, 0], 4).unwrap(), 65536); - } - - #[test] - fn test_calculate_int_size() { - assert_eq!(VariantParser::calculate_int_size(255), 1); - assert_eq!(VariantParser::calculate_int_size(256), 2); - assert_eq!(VariantParser::calculate_int_size(65536), 3); - assert_eq!(VariantParser::calculate_int_size(16777216), 4); - } - - #[test] - fn test_write_int_bytes() { - let mut buffer = Vec::new(); - VariantParser::write_int_bytes(&mut buffer, 42, 1); - assert_eq!(buffer, vec![42]); - - let mut buffer = Vec::new(); - VariantParser::write_int_bytes(&mut buffer, 256, 2); - assert_eq!(buffer, vec![0, 1]); - } - - #[test] - fn test_parse_primitive_header() { - // Test null (primitive type 0) - assert_eq!( - VariantParser::parse_primitive_header(0b00000000).unwrap(), - PrimitiveType::Null - ); - - // Test true (primitive type 1) - assert_eq!( - VariantParser::parse_primitive_header(0b00000100).unwrap(), - PrimitiveType::True - ); - - // Test false (primitive type 2) - assert_eq!( - VariantParser::parse_primitive_header(0b00001000).unwrap(), - PrimitiveType::False - ); - - // Test int32 (primitive type 5) - assert_eq!( - VariantParser::parse_primitive_header(0b00010100).unwrap(), - PrimitiveType::Int32 - ); - - // Test double (primitive type 7) - assert_eq!( - VariantParser::parse_primitive_header(0b00011100).unwrap(), - PrimitiveType::Double - ); - } - - #[test] - fn test_parse_short_string_header() { - // Test 0-length short string - assert_eq!( - VariantParser::parse_short_string_header(0b00000001).unwrap(), - ShortStringHeader { length: 0 } - ); - - // Test 5-length short string - assert_eq!( - VariantParser::parse_short_string_header(0b00010101).unwrap(), - ShortStringHeader { length: 5 } - ); - - // Test 63-length short string (maximum for 6-bit value) - assert_eq!( - VariantParser::parse_short_string_header(0b11111101).unwrap(), - ShortStringHeader { length: 63 } - ); - - // Test that all values 0-63 are valid (these are all possible values since u8 >> 2 max is 63) - for length in 0..=63 { - let header_byte = (length << 2) | 1; // short string type - assert!(VariantParser::parse_short_string_header(header_byte as u8).is_ok()); - } - } - - - - #[test] - fn test_basic_type_checks() { - // Test primitive type check - assert!(VariantParser::is_primitive(&[0b00000000])); // Null - assert!(VariantParser::is_primitive(&[0b00000100])); // True - assert!(!VariantParser::is_primitive(&[0b00000001])); // Not primitive - - // Test short string type check - assert!(VariantParser::is_short_string(&[0b00000001])); // 0-length short string - assert!(VariantParser::is_short_string(&[0b00010101])); // 5-length short string - assert!(!VariantParser::is_short_string(&[0b00000000])); // Not short string - - // Test object type check - assert!(VariantParser::is_object(&[0b00000010])); // Basic object - assert!(!VariantParser::is_object(&[0b00000001])); // Not object - - // Test array type check - assert!(VariantParser::is_array(&[0b00000011])); // Basic array - assert!(!VariantParser::is_array(&[0b00000010])); // Not array - } - - #[test] - fn test_get_primitive_data_length() { - // Test fixed-length 0-byte types - assert_eq!( - VariantParser::get_primitive_data_length(&PrimitiveType::Null), - Some(0) - ); - assert_eq!( - VariantParser::get_primitive_data_length(&PrimitiveType::True), - Some(0) - ); - assert_eq!( - VariantParser::get_primitive_data_length(&PrimitiveType::False), - Some(0) - ); - - // Test fixed-length types with specific byte counts - assert_eq!( - VariantParser::get_primitive_data_length(&PrimitiveType::Int8), - Some(1) - ); - assert_eq!( - VariantParser::get_primitive_data_length(&PrimitiveType::Int16), - Some(2) - ); - assert_eq!( - VariantParser::get_primitive_data_length(&PrimitiveType::Int32), - Some(4) - ); - assert_eq!( - VariantParser::get_primitive_data_length(&PrimitiveType::Int64), - Some(8) - ); - assert_eq!( - VariantParser::get_primitive_data_length(&PrimitiveType::Double), - Some(8) - ); - assert_eq!( - VariantParser::get_primitive_data_length(&PrimitiveType::Decimal16), - Some(16) - ); - - // Test variable-length types (should return None) - assert_eq!( - VariantParser::get_primitive_data_length(&PrimitiveType::Binary), - None - ); - assert_eq!( - VariantParser::get_primitive_data_length(&PrimitiveType::String), - None - ); - } - - -} From cc5e1499028c760e55d374ea3364dc53a73b24ee Mon Sep 17 00:00:00 2001 From: carpecodeum Date: Mon, 21 Jul 2025 10:10:40 -0400 Subject: [PATCH 16/18] [FIX] make code modular --- parquet-variant-compute/src/variant_array.rs | 18 ++++-------------- 1 file changed, 4 insertions(+), 14 deletions(-) diff --git a/parquet-variant-compute/src/variant_array.rs b/parquet-variant-compute/src/variant_array.rs index 6dcbff1d24a1..d2167f05b261 100644 --- a/parquet-variant-compute/src/variant_array.rs +++ b/parquet-variant-compute/src/variant_array.rs @@ -159,13 +159,13 @@ impl VariantArray { /// Return a reference to the metadata field of the [`StructArray`] pub fn metadata_field(&self) -> &ArrayRef { // spec says fields order is not guaranteed, so we search by name - self.inner.column_by_name("metadata").unwrap() + &self.metadata_ref } /// Return a reference to the value field of the `StructArray` pub fn value_field(&self) -> &ArrayRef { // spec says fields order is not guaranteed, so we search by name - self.inner.column_by_name("value").unwrap() + &self.value_ref } /// Get the metadata bytes for a specific index @@ -180,23 +180,13 @@ impl VariantArray { /// Get the field names for an object at the given index pub fn get_field_names(&self, index: usize) -> Vec { - if index >= self.len() { - return vec![]; - } - - if self.is_null(index) { + if index >= self.len() || self.is_null(index) { return vec![]; } let variant = self.value(index); if let Some(obj) = variant.as_object() { - let mut field_names = Vec::new(); - for i in 0..obj.len() { - if let Some(field_name) = obj.field_name(i) { - field_names.push(field_name.to_string()); - } - } - field_names + Vec::from_iter((0..obj.len()).map(|i| obj.field_name(i).unwrap().to_string())) } else { vec![] } From 30e9cd277b69611433cd21a7b87fc08b0e0487c7 Mon Sep 17 00:00:00 2001 From: carpecodeum Date: Tue, 22 Jul 2025 19:36:55 -0400 Subject: [PATCH 17/18] [FIX] clippy and lint issues --- parquet-variant-compute/src/variant_array.rs | 4 ++-- parquet-variant/src/builder.rs | 8 ++++---- parquet-variant/src/path.rs | 6 +++--- parquet-variant/src/variant/object.rs | 2 +- 4 files changed, 10 insertions(+), 10 deletions(-) diff --git a/parquet-variant-compute/src/variant_array.rs b/parquet-variant-compute/src/variant_array.rs index d2167f05b261..47669509f9cc 100644 --- a/parquet-variant-compute/src/variant_array.rs +++ b/parquet-variant-compute/src/variant_array.rs @@ -170,12 +170,12 @@ impl VariantArray { /// Get the metadata bytes for a specific index pub fn metadata_bytes(&self, index: usize) -> &[u8] { - self.metadata_field().as_binary_view().value(index).as_ref() + self.metadata_field().as_binary_view().value(index) } /// Get the value bytes for a specific index pub fn value_bytes(&self, index: usize) -> &[u8] { - self.value_field().as_binary_view().value(index).as_ref() + self.value_field().as_binary_view().value(index) } /// Get the field names for an object at the given index diff --git a/parquet-variant/src/builder.rs b/parquet-variant/src/builder.rs index dc66865e68ac..932b68f1363f 100644 --- a/parquet-variant/src/builder.rs +++ b/parquet-variant/src/builder.rs @@ -305,9 +305,9 @@ impl ValueBuffer { /// /// This method will panic if the variant contains duplicate field names in objects /// when validation is enabled. For a fallible version, use [`ValueBuffer::try_append_variant`] - fn append_variant<'m, 'd>( + fn append_variant( &mut self, - variant: Variant<'m, 'd>, + variant: Variant<'_, '_>, metadata_builder: &mut MetadataBuilder, ) { match variant { @@ -335,9 +335,9 @@ impl ValueBuffer { } /// Appends a variant to the buffer - fn try_append_variant<'m, 'd>( + fn try_append_variant( &mut self, - variant: Variant<'m, 'd>, + variant: Variant<'_, '_>, metadata_builder: &mut MetadataBuilder, ) -> Result<(), ArrowError> { match variant { diff --git a/parquet-variant/src/path.rs b/parquet-variant/src/path.rs index 42dbdb3abc2d..7a94d6f0a859 100644 --- a/parquet-variant/src/path.rs +++ b/parquet-variant/src/path.rs @@ -103,7 +103,7 @@ impl<'a> From<&'a str> for VariantPath<'a> { } /// Create from usize -impl<'a> From for VariantPath<'a> { +impl From for VariantPath<'_> { fn from(index: usize) -> Self { VariantPath::new(vec![VariantPathElement::index(index)]) } @@ -152,7 +152,7 @@ impl<'a> From<&'a str> for VariantPathElement<'a> { } } -impl<'a> From for VariantPathElement<'a> { +impl From for VariantPathElement<'_> { fn from(name: String) -> Self { VariantPathElement::field(Cow::Owned(name)) } @@ -164,7 +164,7 @@ impl<'a> From<&'a String> for VariantPathElement<'a> { } } -impl<'a> From for VariantPathElement<'a> { +impl From for VariantPathElement<'_> { fn from(index: usize) -> Self { VariantPathElement::index(index) } diff --git a/parquet-variant/src/variant/object.rs b/parquet-variant/src/variant/object.rs index b809fe278cb4..6a006089a6c6 100644 --- a/parquet-variant/src/variant/object.rs +++ b/parquet-variant/src/variant/object.rs @@ -410,7 +410,7 @@ impl<'m, 'v> VariantObject<'m, 'v> { // // Instead of comparing the raw bytes of 2 variant objects, this implementation recursively // checks whether the field values are equal -- regardless of their order -impl<'m, 'v> PartialEq for VariantObject<'m, 'v> { +impl PartialEq for VariantObject<'_, '_> { fn eq(&self, other: &Self) -> bool { if self.num_elements != other.num_elements { return false; From 7dd6c23f32152be3d949b57c41ce664421aa05ae Mon Sep 17 00:00:00 2001 From: carpecodeum Date: Tue, 22 Jul 2025 20:48:47 -0400 Subject: [PATCH 18/18] [FIX] remove unsafe functions doing byte operations --- parquet-variant-compute/src/variant_array.rs | 19 ++++--------------- parquet-variant-compute/src/variant_get.rs | 12 ++++++++++-- parquet-variant/src/builder.rs | 6 +----- 3 files changed, 15 insertions(+), 22 deletions(-) diff --git a/parquet-variant-compute/src/variant_array.rs b/parquet-variant-compute/src/variant_array.rs index 47669509f9cc..7913fd894ea2 100644 --- a/parquet-variant-compute/src/variant_array.rs +++ b/parquet-variant-compute/src/variant_array.rs @@ -17,7 +17,6 @@ //! [`VariantArray`] implementation - use arrow::array::{Array, ArrayData, ArrayRef, AsArray, StructArray}; use arrow::buffer::NullBuffer; use arrow_schema::{ArrowError, DataType}; @@ -168,16 +167,6 @@ impl VariantArray { &self.value_ref } - /// Get the metadata bytes for a specific index - pub fn metadata_bytes(&self, index: usize) -> &[u8] { - self.metadata_field().as_binary_view().value(index) - } - - /// Get the value bytes for a specific index - pub fn value_bytes(&self, index: usize) -> &[u8] { - self.value_field().as_binary_view().value(index) - } - /// Get the field names for an object at the given index pub fn get_field_names(&self, index: usize) -> Vec { if index >= self.len() || self.is_null(index) { @@ -201,7 +190,7 @@ impl VariantArray { pub fn with_fields_removed(&self, field_names: &[&str]) -> Result { use parquet_variant::VariantBuilder; use std::collections::HashSet; - + let fields_to_remove: HashSet<&str> = field_names.iter().copied().collect(); let mut builder = crate::variant_array_builder::VariantArrayBuilder::new(self.len()); @@ -210,19 +199,19 @@ impl VariantArray { builder.append_null(); } else { let variant = self.value(i); - + // If it's an object, create a new object without the specified fields if let Some(obj) = variant.as_object() { let mut variant_builder = VariantBuilder::new(); let mut object_builder = variant_builder.new_object(); - + // Add all fields except the ones to remove for (field_name, field_value) in obj.iter() { if !fields_to_remove.contains(field_name) { object_builder.insert(field_name, field_value); } } - + object_builder.finish().unwrap(); let (metadata, value) = variant_builder.finish(); builder.append_variant_buffers(&metadata, &value); diff --git a/parquet-variant-compute/src/variant_get.rs b/parquet-variant-compute/src/variant_get.rs index eee2cb5f19b1..e3a612288302 100644 --- a/parquet-variant-compute/src/variant_get.rs +++ b/parquet-variant-compute/src/variant_get.rs @@ -96,7 +96,11 @@ mod test { use super::{variant_get, GetOptions}; - fn single_variant_get_test(input_json: &str, path: parquet_variant::VariantPath, expected_json: &str) { + fn single_variant_get_test( + input_json: &str, + path: parquet_variant::VariantPath, + expected_json: &str, + ) { // Create input array from JSON string let input_array_ref: ArrayRef = Arc::new(StringArray::from(vec![Some(input_json)])); let input_variant_array_ref: ArrayRef = @@ -138,7 +142,11 @@ mod test { #[test] fn get_primitive_variant_list_index() { - single_variant_get_test("[1234, 5678]", parquet_variant::VariantPath::from(0), "1234"); + single_variant_get_test( + "[1234, 5678]", + parquet_variant::VariantPath::from(0), + "1234", + ); } #[test] diff --git a/parquet-variant/src/builder.rs b/parquet-variant/src/builder.rs index 932b68f1363f..5d3d1505ee90 100644 --- a/parquet-variant/src/builder.rs +++ b/parquet-variant/src/builder.rs @@ -305,11 +305,7 @@ impl ValueBuffer { /// /// This method will panic if the variant contains duplicate field names in objects /// when validation is enabled. For a fallible version, use [`ValueBuffer::try_append_variant`] - fn append_variant( - &mut self, - variant: Variant<'_, '_>, - metadata_builder: &mut MetadataBuilder, - ) { + fn append_variant(&mut self, variant: Variant<'_, '_>, metadata_builder: &mut MetadataBuilder) { match variant { Variant::Null => self.append_null(), Variant::BooleanTrue => self.append_bool(true),