diff --git a/docs/source/user-guide/latest/compatibility.md b/docs/source/user-guide/latest/compatibility.md index 64bd9d2bcd..1bf38b1cea 100644 --- a/docs/source/user-guide/latest/compatibility.md +++ b/docs/source/user-guide/latest/compatibility.md @@ -85,16 +85,100 @@ Cast operations in Comet fall into three levels of support: ### Legacy Mode + +| | binary | boolean | byte | date | decimal | double | float | integer | long | short | string | timestamp | +|---|---|---|---|---|---|---|---|---|---|---|---|---| +| binary | - | N/A | N/A | N/A | N/A | N/A | N/A | N/A | N/A | N/A | C | N/A | +| boolean | N/A | - | C | N/A | U | C | C | C | C | C | C | U | +| byte | U | C | - | N/A | C | C | C | C | C | C | C | U | +| date | N/A | U | U | - | U | U | U | U | U | U | C | U | +| decimal | N/A | C | C | N/A | - | C | C | C | C | C | C | U | +| double | N/A | C | C | N/A | I | - | C | C | C | C | C | U | +| float | N/A | C | C | N/A | I | C | - | C | C | C | C | U | +| integer | U | C | C | N/A | C | C | C | - | C | C | C | U | +| long | U | C | C | N/A | C | C | C | C | - | C | C | U | +| short | U | C | C | N/A | C | C | C | C | C | - | C | U | +| string | C | C | C | C | I | C | C | C | C | C | - | I | +| timestamp | N/A | U | U | C | U | U | U | U | C | U | C | - | + + +**Notes:** + +- **decimal -> string**: There can be formatting differences in some case due to Spark using scientific notation where Comet does not +- **double -> decimal**: There can be rounding differences +- **double -> string**: There can be differences in precision. For example, the input "1.4E-45" will produce 1.0E-45 instead of 1.4E-45 +- **float -> decimal**: There can be rounding differences +- **float -> string**: There can be differences in precision. 
For example, the input "1.4E-45" will produce 1.0E-45 instead of 1.4E-45 +- **string -> date**: Only supports years between 262143 BC and 262142 AD +- **string -> decimal**: Does not support fullwidth unicode digits (e.g. \\uFF10) + or strings containing null bytes (e.g. \\u0000) +- **string -> timestamp**: Not all valid formats are supported ### Try Mode + +| | binary | boolean | byte | date | decimal | double | float | integer | long | short | string | timestamp | +|---|---|---|---|---|---|---|---|---|---|---|---| +| binary | - | N/A | N/A | N/A | N/A | N/A | N/A | N/A | N/A | N/A | C | N/A | +| boolean | N/A | - | C | N/A | U | C | C | C | C | C | C | U | +| byte | U | C | - | N/A | C | C | C | C | C | C | C | U | +| date | N/A | U | U | - | U | U | U | U | U | U | C | U | +| decimal | N/A | C | C | N/A | - | C | C | C | C | C | C | U | +| double | N/A | C | C | N/A | I | - | C | C | C | C | C | U | +| float | N/A | C | C | N/A | I | C | - | C | C | C | C | U | +| integer | U | C | C | N/A | C | C | C | - | C | C | C | U | +| long | U | C | C | N/A | C | C | C | C | - | C | C | U | +| short | U | C | C | N/A | C | C | C | C | C | - | C | U | +| string | C | C | C | C | I | C | C | C | C | C | - | I | +| timestamp | N/A | U | U | C | U | U | U | U | C | U | C | - | + + +**Notes:** + +- **decimal -> string**: There can be formatting differences in some cases due to Spark using scientific notation where Comet does not +- **double -> decimal**: There can be rounding differences +- **double -> string**: There can be differences in precision. For example, the input "1.4E-45" will produce 1.0E-45 instead of 1.4E-45 +- **float -> decimal**: There can be rounding differences +- **float -> string**: There can be differences in precision. 
For example, the input "1.4E-45" will produce 1.0E-45 instead of 1.4E-45 +- **string -> date**: Only supports years between 262143 BC and 262142 AD +- **string -> decimal**: Does not support fullwidth unicode digits (e.g. \\uFF10) + or strings containing null bytes (e.g. \\u0000) +- **string -> timestamp**: Not all valid formats are supported ### ANSI Mode + +| | binary | boolean | byte | date | decimal | double | float | integer | long | short | string | timestamp | +|---|---|---|---|---|---|---|---|---|---|---|---| +| binary | - | N/A | N/A | N/A | N/A | N/A | N/A | N/A | N/A | N/A | C | N/A | +| boolean | N/A | - | C | N/A | U | C | C | C | C | C | C | U | +| byte | U | C | - | N/A | C | C | C | C | C | C | C | U | +| date | N/A | U | U | - | U | U | U | U | U | U | C | U | +| decimal | N/A | C | C | N/A | - | C | C | C | C | C | C | U | +| double | N/A | C | C | N/A | I | - | C | C | C | C | C | U | +| float | N/A | C | C | N/A | I | C | - | C | C | C | C | U | +| integer | U | C | C | N/A | C | C | C | - | C | C | C | U | +| long | U | C | C | N/A | C | C | C | C | - | C | C | U | +| short | U | C | C | N/A | C | C | C | C | C | - | C | U | +| string | C | C | C | C | I | C | C | C | C | C | - | I | +| timestamp | N/A | U | U | C | U | U | U | U | C | U | C | - | + + +**Notes:** + +- **decimal -> string**: There can be formatting differences in some cases due to Spark using scientific notation where Comet does not +- **double -> decimal**: There can be rounding differences +- **double -> string**: There can be differences in precision. For example, the input "1.4E-45" will produce 1.0E-45 instead of 1.4E-45 +- **float -> decimal**: There can be rounding differences +- **float -> string**: There can be differences in precision. 
For example, the input "1.4E-45" will produce 1.0E-45 instead of 1.4E-45 +- **string -> date**: Only supports years between 262143 BC and 262142 AD +- **string -> decimal**: Does not support fullwidth unicode digits (e.g \\uFF10) + or strings containing null bytes (e.g \\u0000) +- **string -> timestamp**: ANSI mode not supported See the [tracking issue](https://github.com/apache/datafusion-comet/issues/286) for more details. diff --git a/native/core/src/execution/shuffle/row.rs b/native/core/src/execution/shuffle/row.rs index 821607ddb9..93eadd8d93 100644 --- a/native/core/src/execution/shuffle/row.rs +++ b/native/core/src/execution/shuffle/row.rs @@ -634,29 +634,42 @@ pub(crate) fn append_columns( let struct_builder = builder .as_any_mut() .downcast_mut::() - .expect("StructBuilder"); - let mut row = SparkUnsafeRow::new(schema); - - for i in row_start..row_end { - let row_addr = unsafe { *row_addresses_ptr.add(i) }; - let row_size = unsafe { *row_sizes_ptr.add(i) }; - row.point_to(row_addr, row_size); + .expect("Should be a StructBuilder"); - let is_null = row.is_null_at(column_idx); - - let nested_row = if is_null { - // The struct is null. - // Append a null value to the struct builder and field builders. - struct_builder.append_null(); - SparkUnsafeRow::default() - } else { - struct_builder.append(true); - row.get_struct(column_idx, fields.len()) - }; + let mut row = SparkUnsafeRow::new(schema); - for (idx, field) in fields.into_iter().enumerate() { - append_field(field.data_type(), struct_builder, &nested_row, idx)?; - } + // 1. 
Calculate validity and record it in the parent struct + // FIXED: Added underscore prefix to variable name to silence 'unused' error + let _nested_is_null: Vec = (row_start..row_end) + .map(|i| { + let row_addr = unsafe { *row_addresses_ptr.add(i) }; + let row_size = unsafe { *row_sizes_ptr.add(i) }; + row.point_to(row_addr, row_size); + + let is_null = row.is_null_at(column_idx); + + // Record the parent's null status + if is_null { + struct_builder.append_null(); + } else { + struct_builder.append(true); + } + is_null + }) + .collect(); + + // 2. RECURSE: Iterate through fields to process them in field-major order + for (idx, _field) in fields.into_iter().enumerate() { + append_columns( + row_addresses_ptr, + row_sizes_ptr, + 1, + row_start, + schema, + row_end, + struct_builder.field_builder(idx).unwrap(), + prefer_dictionary_ratio, + )?; } } _ => {