Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
88 changes: 80 additions & 8 deletions parquet-variant-compute/src/shred_variant.rs
Original file line number Diff line number Diff line change
Expand Up @@ -68,6 +68,14 @@ use std::sync::Arc;
/// See [`ShreddedSchemaBuilder`] for a convenient way to build the `as_type`
/// value passed to this function.
pub fn shred_variant(array: &VariantArray, as_type: &DataType) -> Result<VariantArray> {
shred_variant_with_options(array, as_type, &CastOptions::default())
}

pub(crate) fn shred_variant_with_options(
array: &VariantArray,
as_type: &DataType,
cast_options: &CastOptions,
) -> Result<VariantArray> {
if array.typed_value_field().is_some() {
return Err(ArrowError::InvalidArgumentError(
"Input is already shredded".to_string(),
Expand All @@ -79,10 +87,9 @@ pub fn shred_variant(array: &VariantArray, as_type: &DataType) -> Result<Variant
return Ok(array.clone());
};

let cast_options = CastOptions::default();
let mut builder = make_variant_to_shredded_variant_arrow_row_builder(
as_type,
&cast_options,
cast_options,
array.len(),
NullValue::TopLevelVariant,
)?;
Expand Down Expand Up @@ -324,6 +331,8 @@ impl<'a> VariantToShreddedArrayVariantRowBuilder<'a> {
Variant::List(list) => {
self.nulls.append_non_null();
self.value_builder.append_null();

// NOTE: A `FixedSizeList` with incorrect size will hard fail during shredding.
self.typed_value_builder
.append_value(&Variant::List(list))?;
Comment on lines +336 to +337

Copy link
Copy Markdown
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Aside: a bit surprised fmt liked this line break?

Copy link
Copy Markdown
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

sometimes fmt seems to give up for some reason 🤷

Ok(true)
Expand Down Expand Up @@ -694,9 +703,9 @@ mod tests {
use crate::VariantArrayBuilder;
use crate::variant_array::{binary_array_value, variant_from_arrays_at};
use arrow::array::{
Array, BinaryViewArray, FixedSizeBinaryArray, Float64Array, GenericListArray,
GenericListViewArray, Int64Array, LargeBinaryArray, LargeStringArray, ListArray,
ListLikeArray, OffsetSizeTrait, PrimitiveArray, StringArray,
Array, BinaryViewArray, FixedSizeBinaryArray, FixedSizeListArray, Float64Array,
GenericListArray, GenericListViewArray, Int64Array, LargeBinaryArray, LargeStringArray,
ListArray, ListLikeArray, OffsetSizeTrait, PrimitiveArray, StringArray, StructArray,
};
use arrow::datatypes::{
ArrowPrimitiveType, DataType, Field, Fields, Int64Type, TimeUnit, UnionFields, UnionMode,
Expand Down Expand Up @@ -1621,17 +1630,80 @@ mod tests {

#[test]
fn test_array_shredding_as_fixed_size_list() {
let input = build_variant_array(vec![
VariantRow::List(vec![VariantValue::from(1i64), VariantValue::from(2i64)]),
VariantRow::Value(VariantValue::from("This should not be shredded")),
VariantRow::List(vec![VariantValue::from(3i64), VariantValue::from(4i64)]),
]);

let list_schema =
DataType::FixedSizeList(Arc::new(Field::new("item", DataType::Int64, true)), 2);
let result = shred_variant(&input, &list_schema).unwrap();
assert_eq!(result.len(), 3);

// The first row should be shredded, so the `value` field should be null and the
// `typed_value` field should contain the list
assert!(result.is_valid(0));
assert!(result.value_field().unwrap().is_null(0));
assert!(result.typed_value_field().unwrap().is_valid(0));

// The second row should not be shredded because the provided schema for shredding did not
// match. Hence, the `value` field should contain the raw value and the `typed_value` field
// should be null.
assert!(result.is_valid(1));
assert!(result.value_field().unwrap().is_valid(1));
assert!(result.typed_value_field().unwrap().is_null(1));

// The third row should be shredded, so the `value` field should be null and the
// `typed_value` field should contain the list
assert!(result.is_valid(2));
assert!(result.value_field().unwrap().is_null(2));
assert!(result.typed_value_field().unwrap().is_valid(2));

let typed_value = result.typed_value_field().unwrap();
let fixed_size_list = typed_value
.as_any()
.downcast_ref::<FixedSizeListArray>()
.expect("Expected FixedSizeListArray");

// Verify that typed value is `FixedSizeList`.
assert_eq!(fixed_size_list.len(), 3);
assert_eq!(fixed_size_list.value_length(), 2);

// Verify that the first entry in the `FixedSizeList` contains the expected value.
let val0 = fixed_size_list.value(0);
let val0_struct = val0.as_any().downcast_ref::<StructArray>().unwrap();
let val0_typed = val0_struct.column_by_name("typed_value").unwrap();
let val0_ints = val0_typed.as_any().downcast_ref::<Int64Array>().unwrap();
assert_eq!(val0_ints.values(), &[1i64, 2i64]);

// Verify that second entry in the `FixedSizeList` cannot be shredded hence the value is
// invalid.
assert!(fixed_size_list.is_null(1));

// Verify that the third entry in the `FixedSizeList` contains the expected value.
let val2 = fixed_size_list.value(2);
let val2_struct = val2.as_any().downcast_ref::<StructArray>().unwrap();
let val2_typed = val2_struct.column_by_name("typed_value").unwrap();
let val2_ints = val2_typed.as_any().downcast_ref::<Int64Array>().unwrap();
assert_eq!(val2_ints.values(), &[3i64, 4i64]);
}

#[test]
fn test_array_shredding_as_fixed_size_list_wrong_size() {
Comment thread
rishvin marked this conversation as resolved.
let input = build_variant_array(vec![VariantRow::List(vec![
VariantValue::from(1i64),
VariantValue::from(2i64),
VariantValue::from(3i64),
])]);
let list_schema =
DataType::FixedSizeList(Arc::new(Field::new("item", DataType::Int64, true)), 2);

let err = shred_variant(&input, &list_schema).unwrap_err();
assert_eq!(
err.to_string(),
"Not yet implemented: Converting unshredded variant arrays to arrow fixed-size lists"
assert!(
err.to_string()
.contains("Expected fixed size list of size 2, got size 3"),
"got: {err}",
);
}

Expand Down
87 changes: 62 additions & 25 deletions parquet-variant-compute/src/variant_get.rs
Original file line number Diff line number Diff line change
Expand Up @@ -361,10 +361,11 @@ mod test {
use arrow::array::{
Array, ArrayRef, AsArray, BinaryArray, BinaryViewArray, BooleanArray, Date32Array,
Date64Array, Decimal32Array, Decimal64Array, Decimal128Array, Decimal256Array,
Float32Array, Float64Array, Int8Array, Int16Array, Int32Array, Int64Array,
LargeBinaryArray, LargeListArray, LargeListViewArray, LargeStringArray, ListArray,
ListViewArray, NullArray, NullBuilder, StringArray, StringViewArray, StructArray,
Time32MillisecondArray, Time32SecondArray, Time64MicrosecondArray, Time64NanosecondArray,
FixedSizeListArray, Float32Array, Float64Array, Int8Array, Int16Array, Int32Array,
Int64Array, LargeBinaryArray, LargeListArray, LargeListViewArray, LargeStringArray,
ListArray, ListViewArray, NullArray, NullBuilder, StringArray, StringViewArray,
StructArray, Time32MillisecondArray, Time32SecondArray, Time64MicrosecondArray,
Time64NanosecondArray,
};
use arrow::buffer::{NullBuffer, OffsetBuffer, ScalarBuffer};
use arrow::compute::CastOptions;
Expand Down Expand Up @@ -4138,13 +4139,29 @@ mod test {
(
DataType::LargeListView(field.clone()),
Arc::new(LargeListViewArray::new(
field,
field.clone(),
ScalarBuffer::from(vec![0, 3]),
ScalarBuffer::from(vec![3, 0]),
element_array,
Some(NullBuffer::from(vec![true, false])),
)) as ArrayRef,
),
(
DataType::FixedSizeList(field.clone(), 3),
Arc::new(FixedSizeListArray::new(
field,
3,
Arc::new(Int64Array::from(vec![
Some(1),
None,
Some(3),
None,
None,
None,
])),
Some(NullBuffer::from(vec![true, false])),
)) as ArrayRef,
),
];

for (request_type, expected) in expectations {
Expand Down Expand Up @@ -4307,7 +4324,8 @@ mod test {
DataType::List(item_field.clone()),
DataType::LargeList(item_field.clone()),
DataType::ListView(item_field.clone()),
DataType::LargeListView(item_field),
DataType::LargeListView(item_field.clone()),
DataType::FixedSizeList(item_field, 2),
];

for data_type in data_types {
Expand All @@ -4324,28 +4342,47 @@ mod test {
}

#[test]
fn test_variant_get_fixed_size_list_not_implemented() {
let string_array: ArrayRef = Arc::new(StringArray::from(vec!["[1, 2]", "\"not a list\""]));
fn test_variant_get_fixed_size_list_wrong_size() {
let string_array: ArrayRef = Arc::new(StringArray::from(vec!["[1, 2, 3]"]));
let variant_array = ArrayRef::from(json_to_variant(&string_array).unwrap());
let item_field = Arc::new(Field::new("item", Int64, true));
for safe in [true, false] {
let options = GetOptions::new()
.with_as_type(Some(FieldRef::from(Field::new(
"result",
DataType::FixedSizeList(item_field.clone(), 2),
true,
))))
.with_cast_options(CastOptions {
safe,
..Default::default()
});

let err = variant_get(&variant_array, options).unwrap_err();
assert!(
err.to_string()
.contains("Converting unshredded variant arrays to arrow fixed-size lists")
);
}
// With `safe` set to true, size mismatch should return Null.
let options = GetOptions::new()
.with_as_type(Some(FieldRef::from(Field::new(
"result",
DataType::FixedSizeList(item_field.clone(), 2),
true,
))))
.with_cast_options(CastOptions {
safe: true,
..Default::default()
});
let result = variant_get(&variant_array, options).unwrap();
let fixed_size_list = result
.as_any()
.downcast_ref::<FixedSizeListArray>()
.expect("Expected FixedSizeListArray");
assert_eq!(fixed_size_list.len(), 1);
assert!(fixed_size_list.is_null(0));

// With `safe` set to false, error should be raised on wrong sized fixed list.
let options = GetOptions::new()
.with_as_type(Some(FieldRef::from(Field::new(
"result",
DataType::FixedSizeList(item_field.clone(), 2),
true,
))))
.with_cast_options(CastOptions {
safe: false,
..Default::default()
});
let err = variant_get(&variant_array, options).unwrap_err();
assert!(
err.to_string()
.contains("Expected fixed size list of size 2, got size 3"),
"got: {err}",
);
}

macro_rules! perfectly_shredded_preserves_top_level_nulls_test {
Expand Down
Loading
Loading