From 6ae7894bf5e0e361aacc0948b867002b0c63df18 Mon Sep 17 00:00:00 2001 From: Andrew Lamb Date: Fri, 11 Jul 2025 09:20:02 -0400 Subject: [PATCH 1/5] Add `VariantArray` and `VariantArrayBuilder` for constructing Arrow arrays of Variants --- parquet-variant-compute/src/from_json.rs | 126 +++------- parquet-variant-compute/src/lib.rs | 5 + parquet-variant-compute/src/variant_array.rs | 225 ++++++++++++++++++ .../src/variant_array_builder.rs | 223 +++++++++++++++++ 4 files changed, 480 insertions(+), 99 deletions(-) create mode 100644 parquet-variant-compute/src/variant_array.rs create mode 100644 parquet-variant-compute/src/variant_array_builder.rs diff --git a/parquet-variant-compute/src/from_json.rs b/parquet-variant-compute/src/from_json.rs index 85777c6af25f..df4d7c2753ef 100644 --- a/parquet-variant-compute/src/from_json.rs +++ b/parquet-variant-compute/src/from_json.rs @@ -18,27 +18,16 @@ //! Module for transforming a batch of JSON strings into a batch of Variants represented as //! STRUCT -use std::sync::Arc; - -use arrow::array::{Array, ArrayRef, BinaryArray, BooleanBufferBuilder, StringArray, StructArray}; -use arrow::buffer::{Buffer, NullBuffer, OffsetBuffer, ScalarBuffer}; -use arrow::datatypes::{DataType, Field}; +use crate::{VariantArray, VariantArrayBuilder}; +use arrow::array::{Array, ArrayRef, StringArray}; use arrow_schema::ArrowError; use parquet_variant::VariantBuilder; use parquet_variant_json::json_to_variant; -fn variant_arrow_repr() -> DataType { - // The subfields are expected to be non-nullable according to the parquet variant spec. - let metadata_field = Field::new("metadata", DataType::Binary, false); - let value_field = Field::new("value", DataType::Binary, false); - let fields = vec![metadata_field, value_field]; - DataType::Struct(fields.into()) -} - /// Parse a batch of JSON strings into a batch of Variants represented as /// STRUCT where nulls are preserved. The JSON strings in the input /// must be valid. -pub fn batch_json_string_to_variant(input: &ArrayRef) -> Result { +pub fn batch_json_string_to_variant(input: &ArrayRef) -> Result { let input_string_array = match input.as_any().downcast_ref::() { Some(string_array) => Ok(string_array), None => Err(ArrowError::CastError( @@ -46,81 +35,25 @@ pub fn batch_json_string_to_variant(input: &ArrayRef) -> Result = Vec::with_capacity(input.len() * 128); - let mut metadata_offsets: Vec = Vec::with_capacity(input.len() + 1); - let mut metadata_validity = BooleanBufferBuilder::new(input.len()); - let mut metadata_current_offset: i32 = 0; - metadata_offsets.push(metadata_current_offset); - - let mut value_buffer: Vec = Vec::with_capacity(input.len() * 128); - let mut value_offsets: Vec = Vec::with_capacity(input.len() + 1); - let mut value_validity = BooleanBufferBuilder::new(input.len()); - let mut value_current_offset: i32 = 0; - value_offsets.push(value_current_offset); - - let mut validity = BooleanBufferBuilder::new(input.len()); + let mut variant_array_builder = VariantArrayBuilder::new(input_string_array.len()); for i in 0..input.len() { if input.is_null(i) { // The subfields are expected to be non-nullable according to the parquet variant spec. - metadata_validity.append(true); - value_validity.append(true); - metadata_offsets.push(metadata_current_offset); - value_offsets.push(value_current_offset); - validity.append(false); + variant_array_builder.append_null(); } else { let mut vb = VariantBuilder::new(); json_to_variant(input_string_array.value(i), &mut vb)?; let (metadata, value) = vb.finish(); - validity.append(true); - - metadata_current_offset += metadata.len() as i32; - metadata_buffer.extend(metadata); - metadata_offsets.push(metadata_current_offset); - metadata_validity.append(true); - - value_current_offset += value.len() as i32; - value_buffer.extend(value); - value_offsets.push(value_current_offset); - value_validity.append(true); + variant_array_builder.append_variant_buffers(&metadata, &value); } } - let metadata_offsets_buffer = OffsetBuffer::new(ScalarBuffer::from(metadata_offsets)); - let metadata_data_buffer = Buffer::from_vec(metadata_buffer); - let metadata_null_buffer = NullBuffer::new(metadata_validity.finish()); - - let value_offsets_buffer = OffsetBuffer::new(ScalarBuffer::from(value_offsets)); - let value_data_buffer = Buffer::from_vec(value_buffer); - let value_null_buffer = NullBuffer::new(value_validity.finish()); - - let metadata_array = BinaryArray::new( - metadata_offsets_buffer, - metadata_data_buffer, - Some(metadata_null_buffer), - ); - let value_array = BinaryArray::new( - value_offsets_buffer, - value_data_buffer, - Some(value_null_buffer), - ); - - let struct_fields: Vec = vec![Arc::new(metadata_array), Arc::new(value_array)]; - let variant_fields = match variant_arrow_repr() { - DataType::Struct(fields) => fields, - _ => unreachable!("variant_arrow_repr is hard-coded and must match the expected schema"), - }; - let null_buffer = NullBuffer::new(validity.finish()); - Ok(StructArray::new( - variant_fields, - struct_fields, - Some(null_buffer), - )) + Ok(variant_array_builder.build()) } #[cfg(test)] mod test { use crate::batch_json_string_to_variant; - use arrow::array::{Array, ArrayRef, BinaryArray, StringArray}; + use arrow::array::{Array, ArrayRef, AsArray, StringArray}; use arrow_schema::ArrowError; use parquet_variant::{Variant, VariantBuilder}; use std::sync::Arc; @@ -135,43 +68,38 @@ mod test { None, ]); let array_ref: ArrayRef = Arc::new(input); - let output = batch_json_string_to_variant(&array_ref).unwrap(); + let variant_array = batch_json_string_to_variant(&array_ref).unwrap(); - let struct_array = &output; - let metadata_array = struct_array - .column(0) - .as_any() - .downcast_ref::() - .unwrap(); - let value_array = struct_array - .column(1) - .as_any() - .downcast_ref::() - .unwrap(); + let metadata_array = variant_array.metadata_field().as_binary_view(); + let value_array = variant_array.value_field().as_binary_view(); - assert!(!struct_array.is_null(0)); - assert!(struct_array.is_null(1)); - assert!(!struct_array.is_null(2)); - assert!(!struct_array.is_null(3)); - assert!(struct_array.is_null(4)); + // Compare row 0 + assert!(!variant_array.is_null(0)); + assert_eq!(variant_array.value(0), Variant::Int8(1)); - assert_eq!(metadata_array.value(0), &[1, 0, 0]); - assert_eq!(value_array.value(0), &[12, 1]); + // Compare row 1 + assert!(variant_array.is_null(1)); + // Compare row 2 + assert!(!variant_array.is_null(2)); { let mut vb = VariantBuilder::new(); let mut ob = vb.new_object(); ob.insert("a", Variant::Int8(32)); ob.finish()?; let (object_metadata, object_value) = vb.finish(); - assert_eq!(metadata_array.value(2), &object_metadata); - assert_eq!(value_array.value(2), &object_value); + let expected = Variant::new(&object_metadata, &object_value); + assert_eq!(variant_array.value(2), expected); } - assert_eq!(metadata_array.value(3), &[1, 0, 0]); - assert_eq!(value_array.value(3), &[0]); + // Compare row 3 (Note this is a variant NULL, not a null row) + assert!(!variant_array.is_null(3)); + assert_eq!(variant_array.value(3), Variant::Null); + + // Compare row 4 + assert!(variant_array.is_null(4)); - // Ensure that the subfields are not actually nullable + // Ensure that the subfields are not nullable assert!(!metadata_array.is_null(1)); assert!(!value_array.is_null(1)); assert!(!metadata_array.is_null(4)); diff --git a/parquet-variant-compute/src/lib.rs b/parquet-variant-compute/src/lib.rs index 599ba328146e..c593cf405171 100644 --- a/parquet-variant-compute/src/lib.rs +++ b/parquet-variant-compute/src/lib.rs @@ -17,6 +17,11 @@ mod from_json; mod to_json; +mod variant_array; +mod variant_array_builder; + +pub use variant_array::VariantArray; +pub use variant_array_builder::VariantArrayBuilder; pub use from_json::batch_json_string_to_variant; pub use to_json::batch_variant_to_json_string; diff --git a/parquet-variant-compute/src/variant_array.rs b/parquet-variant-compute/src/variant_array.rs new file mode 100644 index 000000000000..f125ae4b0a3b --- /dev/null +++ b/parquet-variant-compute/src/variant_array.rs @@ -0,0 +1,225 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +//! [`VariantArray`] implementation + +use arrow::array::{Array, ArrayData, ArrayRef, AsArray, StructArray}; +use arrow::buffer::NullBuffer; +use arrow_schema::{ArrowError, DataType}; +use parquet_variant::Variant; +use std::any::Any; +use std::sync::Arc; + +/// An array of Parquet [`Variant`] values +/// +/// A [`VariantArray`] wraps an Arrow [`StructArray`] that stores the underlying +/// `metadata` and `value` fields, and adds convenience methods to access +/// the `Variant`s +/// +/// See [`VariantArrayBuilder`] for constructing a `VariantArray`. +/// +/// # Specification +/// 1. This code follows the conventions for storing variants in Arrow Struct Array +/// defined by [Extension Type for Parquet Variant arrow] and this [document]. +/// At the time of this writing, this is not yet a standardized Arrow extension type. +/// +/// [Extension Type for Parquet Variant arrow]: https://github.com/apache/arrow/issues/46908 +/// [document]: https://docs.google.com/document/d/1pw0AWoMQY3SjD7R4LgbPvMjG_xSCtXp3rZHkVp9jpZ4/edit?usp=sharing +#[derive(Debug)] +pub struct VariantArray { + /// StructArray or up to three fields: + /// 1. A required field named metadata which is binary, large_binary, or binary_view + /// 2. An optional field named value that is binary, large_binary, or binary_view + /// 3. An optional field named typed_value which can be any primitive type or be a list, large_list, list_view or struct + /// + /// If typed_value is a nested type, its elements must be required and must + /// be a struct containing only one of the following: + /// + /// 1. A single required field, of type binary, large_binary, or binary_view named value + /// + /// 2. An optional field named value of type binary, large_binary, or + /// binary_view AND an optional field named typed_value which follows these + /// same rules + /// + /// NOTE: It is also permissible for the metadata field to be + /// Dictionary-Encoded, preferably (but not required) with an index type of + /// int8. + inner: StructArray, +} + +impl VariantArray { + /// Creates a new `VariantArray` from a [`StructArray`]. + /// + /// # Arguments + /// - `inner` - The underlying [`StructArray`] that contains the variant data. + /// + /// # Returns + /// - A new instance of `VariantArray`. + /// + /// # Errors: + /// If the `StructArray` does not contain the required fields + pub fn try_new(inner: ArrayRef) -> Result { + let Some(inner) = inner.as_struct_opt() else { + return Err(ArrowError::InvalidArgumentError( + "Invalid VariantArray: requires StructArray as input".to_string(), + )); + }; + // Ensure the StructArray has the expected fields + if !inner.fields().iter().any(|f| f.name() == "metadata") { + return Err(ArrowError::InvalidArgumentError( + "Invalid VariantArray: StructArray must contain a 'metadata' field".to_string(), + )); + } + if !inner.fields().iter().any(|f| f.name() == "value") { + return Err(ArrowError::InvalidArgumentError( + "Invalid VariantArray: StructArray must contain a 'value' field".to_string(), + )); + } + + Ok(Self { + inner: inner.clone(), + }) + } + + /// Returns a reference to the underlying [`StructArray`]. + pub fn inner(&self) -> &StructArray { + &self.inner + } + + /// Returns the inner [`StructArray`], consuming self + pub fn into_inner(self) -> StructArray { + self.inner + } + + /// Return the [`Variant`] instance stored at the given row + /// + /// Panics if the index is out of bounds. + /// + /// Note: Does not do deep validation of the [`Variant`], so it is up to the + /// caller to ensure that the metadata and value were constructed correctly. + pub fn value(&self, index: usize) -> Variant { + let metadata = self.metadata_field().as_binary_view().value(index); + let value = self.value_field().as_binary_view().value(index); + Variant::new(metadata, value) + } + + /// Return a reference to the metadata field of the [`StructArray`] + pub fn metadata_field(&self) -> &ArrayRef { + // spec says fields order is not guaranteed, so we search by name + self.inner.column_by_name("metadata").unwrap() + } + + /// Return a reference to the value field of the `StructArray` + pub fn value_field(&self) -> &ArrayRef { + // spec says fields order is not guaranteed, so we search by name + self.inner.column_by_name("value").unwrap() + } +} + +impl Array for VariantArray { + fn as_any(&self) -> &dyn Any { + self + } + + fn to_data(&self) -> ArrayData { + self.inner.to_data() + } + + fn into_data(self) -> ArrayData { + self.inner.into_data() + } + + fn data_type(&self) -> &DataType { + self.inner.data_type() + } + + fn slice(&self, offset: usize, length: usize) -> ArrayRef { + Arc::new(Self { + inner: self.inner.slice(offset, length), + }) + } + + fn len(&self) -> usize { + self.inner.len() + } + + fn is_empty(&self) -> bool { + self.inner.is_empty() + } + + fn offset(&self) -> usize { + self.inner.offset() + } + + fn nulls(&self) -> Option<&NullBuffer> { + self.inner.nulls() + } + + fn get_buffer_memory_size(&self) -> usize { + self.inner.get_buffer_memory_size() + } + + fn get_array_memory_size(&self) -> usize { + self.inner.get_array_memory_size() + } +} + +#[cfg(test)] +mod test { + use super::*; + use arrow::array::BinaryViewArray; + use arrow_schema::{Field, Fields}; + + #[test] + fn invalid_not_a_struct_array() { + let array = test_array(); + // Should fail because the input is not a StructArray + let err = VariantArray::try_new(array); + assert_eq!( + err.unwrap_err().to_string(), + "Invalid argument error: Invalid VariantArray: requires StructArray as input" + ); + } + + #[test] + fn invalid_missing_metadata() { + let fields = Fields::from(vec![Field::new("value", DataType::BinaryView, true)]); + let array = StructArray::new(fields, vec![test_array()], None); + // Should fail because the StructArray does not contain a 'metadata' field + let err = VariantArray::try_new(Arc::new(array)); + assert_eq!( + err.unwrap_err().to_string(), + "Invalid argument error: Invalid VariantArray: StructArray must contain a 'metadata' field" + ); + } + + #[test] + fn invalid_missing_value() { + let fields = Fields::from(vec![Field::new("metadata", DataType::BinaryView, false)]); + let array = StructArray::new(fields, vec![test_array()], None); + // Should fail because the StructArray does not contain a 'value' field + let err = VariantArray::try_new(Arc::new(array)); + assert_eq!( + err.unwrap_err().to_string(), + "Invalid argument error: Invalid VariantArray: StructArray must contain a 'value' field" + ); + } + + fn test_array() -> ArrayRef { + Arc::new(BinaryViewArray::from(vec![b"test" as &[u8]])) + } +} diff --git a/parquet-variant-compute/src/variant_array_builder.rs b/parquet-variant-compute/src/variant_array_builder.rs new file mode 100644 index 000000000000..6bc405c27b06 --- /dev/null +++ b/parquet-variant-compute/src/variant_array_builder.rs @@ -0,0 +1,223 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +//! [`VariantArrayBuilder`] implementation + +use crate::VariantArray; +use arrow::array::{ArrayRef, BinaryViewArray, BinaryViewBuilder, NullBufferBuilder, StructArray}; +use arrow_schema::{DataType, Field, Fields}; +use parquet_variant::{Variant, VariantBuilder}; +use std::sync::Arc; + +/// A builder for [`VariantArray`] +/// +/// This builder is used to construct a `VariantArray` and allows APIs for +/// adding metadata +/// +/// This builder always creates a `VariantArray` using [`BinaryViewArray`] for both +/// the metadata and value fields. +/// +/// # TODO +/// 1. Support shredding: +/// +/// ## Example: +/// ``` +/// # use arrow::array::Array; +/// # use parquet_variant::{Variant, VariantBuilder}; +/// # use parquet_variant_compute::VariantArrayBuilder; +/// // Create a new VariantArrayBuilder with a capacity of 100 rows +/// let mut builder = VariantArrayBuilder::new(100); +/// // append variant values +/// builder.append_variant(Variant::from(42)); +/// // append a null row +/// builder.append_null(); +/// // append a pre-constructed metadata and value buffers +/// let (metadata, value) = { +/// let mut vb = VariantBuilder::new(); +/// let mut obj = vb.new_object(); +/// obj.insert("foo", "bar"); +/// obj.finish().unwrap(); +/// vb.finish() +/// }; +/// builder.append_variant_buffers(&metadata, &value); +/// +/// // create the final VariantArray +/// let variant_array = builder.build(); +/// assert_eq!(variant_array.len(), 3); +/// // // Access the values +/// // row 1 is not null and is an integer +/// assert!(!variant_array.is_null(0)); +/// assert_eq!(variant_array.value(0), Variant::from(42i32)); +/// // row 1 is null +/// assert!(variant_array.is_null(1)); +/// // row 2 is not null and is an object +/// assert!(!variant_array.is_null(2)); +/// assert!(variant_array.value(2).as_object().is_some()); +/// ``` +#[derive(Debug)] +pub struct VariantArrayBuilder { + /// Nulls + nulls: NullBufferBuilder, + /// buffer for all the metadata + metadata_buffer: Vec, + /// (offset, len) pairs for locations of metadata in the buffer + metadata_locations: Vec<(usize, usize)>, + /// buffer for values + value_buffer: Vec, + /// (offset, len) pairs for locations of values in the buffer + value_locations: Vec<(usize, usize)>, + /// The fields of the final `StructArray` + /// + /// TODO: 1) Add extension type metadata + /// TODO: 2) Add support for shredding + fields: Fields, +} + +impl VariantArrayBuilder { + pub fn new(row_capacity: usize) -> Self { + // The subfields are expected to be non-nullable according to the parquet variant spec. + let metadata_field = Field::new("metadata", DataType::BinaryView, false); + let value_field = Field::new("value", DataType::BinaryView, false); + + Self { + nulls: NullBufferBuilder::new(row_capacity), + metadata_buffer: Vec::new(), // todo allocation capacity + metadata_locations: Vec::with_capacity(row_capacity), + value_buffer: Vec::new(), + value_locations: Vec::with_capacity(row_capacity), + fields: Fields::from(vec![metadata_field, value_field]), + } + } + + /// Build the final builder + pub fn build(self) -> VariantArray { + let Self { + mut nulls, + metadata_buffer, + metadata_locations, + value_buffer, + value_locations, + fields, + } = self; + + let metadata_array = binary_view_array_from_buffers(metadata_buffer, metadata_locations); + + let value_array = binary_view_array_from_buffers(value_buffer, value_locations); + + // The build the final struct array + let inner = StructArray::new( + fields, + vec![ + Arc::new(metadata_array) as ArrayRef, + Arc::new(value_array) as ArrayRef, + ], + nulls.finish(), + ); + // TODO add arrow extension type metadata + + VariantArray::try_new(Arc::new(inner)).expect("valid VariantArray by construction") + } + + /// Appends a null row to the builder. + pub fn append_null(&mut self) { + self.nulls.append_null(); + // The subfields are expected to be non-nullable according to the parquet variant spec. + let metadata_offset = self.metadata_buffer.len(); + let metadata_length = 0; + self.metadata_locations + .push((metadata_offset, metadata_length)); + let value_offset = self.value_buffer.len(); + let value_length = 0; + self.value_locations.push((value_offset, value_length)); + } + + /// Append the [`Variant`] to the builder as the next row + pub fn append_variant(&mut self, variant: Variant) { + // TODO make this more efficient by avoiding the intermediate buffers + let mut variant_builder = VariantBuilder::new(); + variant_builder.append_value(variant); + let (metadata, value) = variant_builder.finish(); + self.append_variant_buffers(&metadata, &value); + } + + /// Append a metadata and values buffer to the builder + pub fn append_variant_buffers(&mut self, metadata: &[u8], value: &[u8]) { + self.nulls.append_non_null(); + let metadata_length = metadata.len(); + let metadata_offset = self.metadata_buffer.len(); + self.metadata_locations + .push((metadata_offset, metadata_length)); + self.metadata_buffer.extend_from_slice(metadata); + let value_length = value.len(); + let value_offset = self.value_buffer.len(); + self.value_locations.push((value_offset, value_length)); + self.value_buffer.extend_from_slice(value); + } + + // TODO: Return a Variant builder that will write to the underlying buffers (TODO) +} + +fn binary_view_array_from_buffers( + buffer: Vec, + locations: Vec<(usize, usize)>, +) -> BinaryViewArray { + let mut builder = BinaryViewBuilder::with_capacity(locations.len()); + let block = builder.append_block(buffer.into()); + // TODO this can be much faster if it creates the views directly during append + for (offset, length) in locations { + let offset = offset.try_into().expect("offset should fit in u32"); + let length = length.try_into().expect("length should fit in u32"); + builder + .try_append_view(block, offset, length) + .expect("Failed to append view"); + } + builder.finish() +} + +#[cfg(test)] +mod test { + use super::*; + use arrow::array::Array; + + /// Test that both the metadata and value buffers are non nullable + #[test] + fn test_variant_array_builder_non_nullable() { + let mut builder = VariantArrayBuilder::new(10); + builder.append_null(); // should not panic + builder.append_variant(Variant::from(42i32)); + let variant_array = builder.build(); + + assert_eq!(variant_array.len(), 2); + assert!(variant_array.is_null(0)); + assert!(!variant_array.is_null(1)); + assert_eq!(variant_array.value(1), Variant::from(42i32)); + + // the metadata and value fields of non shredded variants should not be null + assert!(variant_array.metadata_field().nulls().is_none()); + assert!(variant_array.value_field().nulls().is_none()); + let DataType::Struct(fields) = variant_array.data_type() else { + panic!("Expected VariantArray to have Struct data type"); + }; + for field in fields { + assert!( + !field.is_nullable(), + "Field {} should be non-nullable", + field.name() + ); + } + } +} From e741c165b3b794005b3b3681c83f10c602144b3c Mon Sep 17 00:00:00 2001 From: Andrew Lamb Date: Fri, 11 Jul 2025 11:07:44 -0400 Subject: [PATCH 2/5] fix docs --- parquet-variant-compute/src/variant_array.rs | 2 ++ 1 file changed, 2 insertions(+) diff --git a/parquet-variant-compute/src/variant_array.rs b/parquet-variant-compute/src/variant_array.rs index f125ae4b0a3b..46605b82269d 100644 --- a/parquet-variant-compute/src/variant_array.rs +++ b/parquet-variant-compute/src/variant_array.rs @@ -32,6 +32,8 @@ use std::sync::Arc; /// /// See [`VariantArrayBuilder`] for constructing a `VariantArray`. /// +/// [`VariantArrayBuilder`]: crate::VariantArrayBuilder +/// /// # Specification /// 1. This code follows the conventions for storing variants in Arrow Struct Array /// defined by [Extension Type for Parquet Variant arrow] and this [document]. From 6fad15f208c872a29e9bdfc843d0416845bb11cb Mon Sep 17 00:00:00 2001 From: Andrew Lamb Date: Sat, 12 Jul 2025 13:45:21 -0400 Subject: [PATCH 3/5] Remove details of shredded column details --- parquet-variant-compute/src/variant_array.rs | 20 +++++++++----------- 1 file changed, 9 insertions(+), 11 deletions(-) diff --git a/parquet-variant-compute/src/variant_array.rs b/parquet-variant-compute/src/variant_array.rs index 46605b82269d..adb39c8cbe3b 100644 --- a/parquet-variant-compute/src/variant_array.rs +++ b/parquet-variant-compute/src/variant_array.rs @@ -35,7 +35,8 @@ use std::sync::Arc; /// [`VariantArrayBuilder`]: crate::VariantArrayBuilder /// /// # Specification -/// 1. This code follows the conventions for storing variants in Arrow Struct Array +/// +/// 1. This code follows the conventions for storing variants in Arrow `StructArray` /// defined by [Extension Type for Parquet Variant arrow] and this [document]. /// At the time of this writing, this is not yet a standardized Arrow extension type. /// @@ -43,19 +44,16 @@ use std::sync::Arc; /// [document]: https://docs.google.com/document/d/1pw0AWoMQY3SjD7R4LgbPvMjG_xSCtXp3rZHkVp9jpZ4/edit?usp=sharing #[derive(Debug)] pub struct VariantArray { - /// StructArray or up to three fields: - /// 1. A required field named metadata which is binary, large_binary, or binary_view - /// 2. An optional field named value that is binary, large_binary, or binary_view - /// 3. An optional field named typed_value which can be any primitive type or be a list, large_list, list_view or struct + /// StructArray of up to three fields: /// - /// If typed_value is a nested type, its elements must be required and must - /// be a struct containing only one of the following: + /// 1. A required field named `metadata` which is binary, large_binary, or + /// binary_view /// - /// 1. A single required field, of type binary, large_binary, or binary_view named value + /// 2. An optional field named `value` that is binary, large_binary, or + /// binary_view /// - /// 2. An optional field named value of type binary, large_binary, or - /// binary_view AND an optional field named typed_value which follows these - /// same rules + /// 3. An optional field named `typed_value` which can be any primitive type + /// or be a list, large_list, list_view or struct /// /// NOTE: It is also permissible for the metadata field to be /// Dictionary-Encoded, preferably (but not required) with an index type of From 595ded1b619fc71b9904bbfd8ad333737625b19f Mon Sep 17 00:00:00 2001 From: Andrew Lamb Date: Sat, 12 Jul 2025 14:50:31 -0400 Subject: [PATCH 4/5] Check for BinaryView --- parquet-variant-compute/src/variant_array.rs | 78 +++++++++++++++++--- 1 file changed, 69 insertions(+), 9 deletions(-) diff --git a/parquet-variant-compute/src/variant_array.rs b/parquet-variant-compute/src/variant_array.rs index adb39c8cbe3b..e0a7a7b07ca3 100644 --- a/parquet-variant-compute/src/variant_array.rs +++ b/parquet-variant-compute/src/variant_array.rs @@ -71,23 +71,43 @@ impl VariantArray { /// - A new instance of `VariantArray`. /// /// # Errors: - /// If the `StructArray` does not contain the required fields + /// - If the `StructArray` does not contain the required fields + /// + /// # Current support + /// This structure does not (yet) support the full Arrow Variant Array specification. + /// + /// Only `StructArrays` with `metadata` and `value` fields that are + /// [`BinaryViewArray`] are supported. Shredded values are not currently supported + /// nor are using types other than `BinaryViewArray` + /// pub fn try_new(inner: ArrayRef) -> Result { let Some(inner) = inner.as_struct_opt() else { return Err(ArrowError::InvalidArgumentError( "Invalid VariantArray: requires StructArray as input".to_string(), )); }; - // Ensure the StructArray has the expected fields - if !inner.fields().iter().any(|f| f.name() == "metadata") { + // Ensure the StructArray has a metadata field of BinaryView + let Some(metadata_field) = inner.fields().iter().find(|f| f.name() == "metadata") else { return Err(ArrowError::InvalidArgumentError( "Invalid VariantArray: StructArray must contain a 'metadata' field".to_string(), )); + }; + if metadata_field.data_type() != &DataType::BinaryView { + return Err(ArrowError::NotYetImplemented(format!( + "VariantArray 'metadata' field must be BinaryView, got {}", + metadata_field.data_type() + ))); } - if !inner.fields().iter().any(|f| f.name() == "value") { + let Some(value_field) = inner.fields().iter().find(|f| f.name() == "value") else { return Err(ArrowError::InvalidArgumentError( "Invalid VariantArray: StructArray must contain a 'value' field".to_string(), )); + }; + if value_field.data_type() != &DataType::BinaryView { + return Err(ArrowError::NotYetImplemented(format!( + "VariantArray 'value' field must be BinaryView, got {}", + value_field.data_type() + ))); } Ok(Self { @@ -181,12 +201,12 @@ impl Array for VariantArray { #[cfg(test)] mod test { use super::*; - use arrow::array::BinaryViewArray; + use arrow::array::{BinaryArray, BinaryViewArray}; use arrow_schema::{Field, Fields}; #[test] fn invalid_not_a_struct_array() { - let array = test_array(); + let array = make_binary_view_array(); // Should fail because the input is not a StructArray let err = VariantArray::try_new(array); assert_eq!( @@ -198,7 +218,7 @@ mod test { #[test] fn invalid_missing_metadata() { let fields = Fields::from(vec![Field::new("value", DataType::BinaryView, true)]); - let array = StructArray::new(fields, vec![test_array()], None); + let array = StructArray::new(fields, vec![make_binary_view_array()], None); // Should fail because the StructArray does not contain a 'metadata' field let err = VariantArray::try_new(Arc::new(array)); assert_eq!( @@ -210,7 +230,7 @@ mod test { #[test] fn invalid_missing_value() { let fields = Fields::from(vec![Field::new("metadata", DataType::BinaryView, false)]); - let array = StructArray::new(fields, vec![test_array()], None); + let array = StructArray::new(fields, vec![make_binary_view_array()], None); // Should fail because the StructArray does not contain a 'value' field let err = VariantArray::try_new(Arc::new(array)); assert_eq!( @@ -219,7 +239,47 @@ mod test { ); } - fn test_array() -> ArrayRef { + #[test] + fn invalid_metadata_field_type() { + let fields = Fields::from(vec![ + Field::new("metadata", DataType::Binary, true), // Not yet supported + Field::new("value", DataType::BinaryView, true), + ]); + let array = StructArray::new( + fields, + vec![make_binary_array(), make_binary_view_array()], + None, + ); + let err = VariantArray::try_new(Arc::new(array)); + assert_eq!( + err.unwrap_err().to_string(), + "Not yet implemented: VariantArray 'metadata' field must be BinaryView, got Binary" + ); + } + + #[test] + fn invalid_value_field_type() { + let fields = Fields::from(vec![ + Field::new("metadata", DataType::BinaryView, true), + Field::new("value", DataType::Binary, true), // Not yet supported + ]); + let array = StructArray::new( + fields, + vec![make_binary_view_array(), make_binary_array()], + None, + ); + let err = VariantArray::try_new(Arc::new(array)); + assert_eq!( + err.unwrap_err().to_string(), + "Not yet implemented: VariantArray 'value' field must be BinaryView, got Binary" + ); + } + + fn make_binary_view_array() -> ArrayRef { Arc::new(BinaryViewArray::from(vec![b"test" as &[u8]])) } + + fn make_binary_array() -> ArrayRef { + Arc::new(BinaryArray::from(vec![b"test" as &[u8]])) + } } From 15b02bdd76702d84e64fec52dcb6b425e66877a9 Mon Sep 17 00:00:00 2001 From: Andrew Lamb Date: Sat, 12 Jul 2025 15:38:15 -0400 Subject: [PATCH 5/5] fix doc --- parquet-variant-compute/src/variant_array.rs | 1 + 1 file changed, 1 insertion(+) diff --git a/parquet-variant-compute/src/variant_array.rs b/parquet-variant-compute/src/variant_array.rs index e0a7a7b07ca3..e18d9d3b21b3 100644 --- a/parquet-variant-compute/src/variant_array.rs +++ b/parquet-variant-compute/src/variant_array.rs @@ -80,6 +80,7 @@ impl VariantArray { /// [`BinaryViewArray`] are supported. Shredded values are not currently supported /// nor are using types other than `BinaryViewArray` /// + /// [`BinaryViewArray`]: arrow::array::BinaryViewArray pub fn try_new(inner: ArrayRef) -> Result { let Some(inner) = inner.as_struct_opt() else { return Err(ArrowError::InvalidArgumentError(