@@ -186,6 +186,7 @@
 import org.apache.iceberg.mr.hive.actions.HiveIcebergDeleteOrphanFiles;
 import org.apache.iceberg.mr.hive.plan.IcebergBucketFunction;
 import org.apache.iceberg.mr.hive.udf.GenericUDFIcebergZorder;
+import org.apache.iceberg.parquet.VariantUtil;
 import org.apache.iceberg.puffin.Blob;
 import org.apache.iceberg.puffin.BlobMetadata;
 import org.apache.iceberg.puffin.Puffin;
@@ -1739,7 +1740,8 @@ private void fallbackToNonVectorizedModeBasedOnProperties(Properties tableProps)
     if (FileFormat.AVRO.name().equalsIgnoreCase(tableProps.getProperty(TableProperties.DEFAULT_FILE_FORMAT)) ||
         isValidMetadataTable(tableProps.getProperty(IcebergAcidUtil.META_TABLE_PROPERTY)) ||
         hasOrcTimeInSchema(tableProps, tableSchema) ||
-        !hasParquetNestedTypeWithinListOrMap(tableProps, tableSchema)) {
+        !hasParquetNestedTypeWithinListOrMap(tableProps, tableSchema) ||
+        VariantUtil.shouldUseVariantShredding(tableProps::getProperty, tableSchema)) {
       // disable vectorization
       SessionStateUtil.getQueryState(conf).ifPresent(queryState ->
           queryState.getConf().setBoolVar(ConfVars.HIVE_VECTORIZATION_ENABLED, false));
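The new disjunct compiles because Properties.getProperty(String) already has the String-to-String shape that UnaryOperator<String> requires, so the handler can hand over tableProps::getProperty directly. A minimal standalone sketch of that adaptation (the class name is hypothetical and the hard-coded key mirrors the qtest's table property; only JDK types are assumed):

import java.util.Properties;
import java.util.function.UnaryOperator;

class PropertyLookupSketch {
  public static void main(String[] args) {
    Properties tableProps = new Properties();
    tableProps.setProperty("variant.shredding.enabled", "true");

    // Properties.getProperty(String) takes a String key and returns the
    // String value (or null), so the method reference satisfies
    // UnaryOperator<String>.apply.
    UnaryOperator<String> lookup = tableProps::getProperty;

    // A missing key yields null, and Boolean.parseBoolean(null) is false,
    // so an absent property safely reads as "shredding disabled".
    System.out.println(Boolean.parseBoolean(lookup.apply("variant.shredding.enabled"))); // true
    System.out.println(Boolean.parseBoolean(lookup.apply("missing.key")));               // false
  }
}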
@@ -86,7 +86,7 @@ protected void configurePositionDelete(Avro.DeleteWriteBuilder builder) {
   protected void configureDataWrite(Parquet.DataWriteBuilder builder) {
     builder.createWriterFunc(GenericParquetWriter::create);
     // Configure variant shredding if enabled and a sample record is available
-    if (VariantUtil.shouldUseVariantShredding(properties, dataSchema())) {
+    if (VariantUtil.shouldUseVariantShredding(properties::get, dataSchema())) {
       setVariantShreddingFunc(builder, VariantUtil.variantShreddingFunc(sampleRecord, dataSchema()));
     }
   }
@@ -21,6 +21,7 @@
 
 import java.util.List;
 import java.util.Map;
+import java.util.function.UnaryOperator;
 import org.apache.iceberg.Accessor;
 import org.apache.iceberg.Schema;
 import org.apache.iceberg.StructLike;
@@ -59,8 +60,8 @@ public record VariantField(int fieldId, Accessor<StructLike> accessor, String[]
   /**
    * Check if variant shredding is enabled via table properties.
    */
-  public static boolean isVariantShreddingEnabled(Map<String, String> properties) {
-    String shreddingEnabled = properties.get(InputFormatConfig.VARIANT_SHREDDING_ENABLED);
+  public static boolean isVariantShreddingEnabled(UnaryOperator<String> propertyLookup) {
+    String shreddingEnabled = propertyLookup.apply(InputFormatConfig.VARIANT_SHREDDING_ENABLED);
     return Boolean.parseBoolean(shreddingEnabled);
   }

@@ -73,7 +74,7 @@ public static boolean isShreddable(Object value) {
 
   public static List<VariantField> variantFieldsForShredding(
       Map<String, String> properties, Schema schema) {
-    if (!isVariantShreddingEnabled(properties)) {
+    if (!isVariantShreddingEnabled(properties::get)) {
       return List.of();
     }
     return variantFieldsForShredding(schema);
@@ -89,8 +90,8 @@ private static List<VariantField> variantFieldsForShredding(Schema schema) {
     return results;
   }
 
-  public static boolean shouldUseVariantShredding(Map<String, String> properties, Schema schema) {
-    return isVariantShreddingEnabled(properties) && hasVariantFields(schema);
+  public static boolean shouldUseVariantShredding(UnaryOperator<String> propertyLookup, Schema schema) {
+    return isVariantShreddingEnabled(propertyLookup) && hasVariantFields(schema);
   }
 
   private static boolean hasVariantFields(Schema schema) {
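Switching the parameter from Map<String, String> to UnaryOperator<String> is what lets both kinds of call sites above pass a method reference instead of copying entries between container types: Map::get and Properties::getProperty both fit the same functional shape. A self-contained sketch of the idea (the demo class is hypothetical, and the string literal stands in for InputFormatConfig.VARIANT_SHREDDING_ENABLED):

import java.util.Map;
import java.util.Properties;
import java.util.function.UnaryOperator;

class VariantShreddingFlagDemo {
  // Same shape as the refactored check: any String -> String lookup works.
  static boolean isVariantShreddingEnabled(UnaryOperator<String> propertyLookup) {
    return Boolean.parseBoolean(propertyLookup.apply("variant.shredding.enabled"));
  }

  public static void main(String[] args) {
    Map<String, String> tableProps = Map.of("variant.shredding.enabled", "true");
    Properties jobProps = new Properties();
    jobProps.setProperty("variant.shredding.enabled", "true");

    // Both callers reduce to a method reference over the same check:
    System.out.println(isVariantShreddingEnabled(tableProps::get));       // true
    System.out.println(isVariantShreddingEnabled(jobProps::getProperty)); // true
  }
}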
@@ -27,13 +27,30 @@ INSERT INTO tbl_shredded_variant VALUES
(2, parse_json('{"name": "Bill", "active": false}')),
(3, parse_json('{"name": "Henry", "age": 20}'));

-- Disable vectorized execution until Variant type is supported
set hive.vectorized.execution.enabled=false;

-- Retrieve and verify
SELECT id, try_variant_get(data, '$.name') FROM tbl_shredded_variant
WHERE variant_get(data, '$.age') > 25;

EXPLAIN
SELECT id, try_variant_get(data, '$.name') FROM tbl_shredded_variant
WHERE variant_get(data, '$.age') > 25;

CREATE TABLE t (
id INT,
v VARIANT
)
STORED BY ICEBERG
TBLPROPERTIES (
'format-version'='3',
'variant.shredding.enabled'='true'
);

INSERT INTO t VALUES
(1, parse_json('{"a": 1}')),
(2, parse_json('{"b": 2}'));

SELECT
try_variant_get(v, '$.a'),
try_variant_get(v, '$.b')
FROM t
ORDER BY id;

Review thread on this qtest addition:

@deniskuzZ (Member), Dec 19, 2025:
Does this cover some specific test case? With a qtest you can't test the physical layout anyway.

@ayushtkn (Member, Author), Dec 19, 2025:
Earlier it was giving wrong results:

NULL	2
NULL	NULL

Now it is correct; I added the scenario that I tried in #6152 (comment). I didn't put in all the cases to show the output is the same, just that it is correct: the previous original case was throwing an exception, and this one was giving a wrong result.

@deniskuzZ (Member), Dec 19, 2025:
Wonder if the same issue could be reproduced on the existing table tbl_shredded_variant by inserting (1, parse_json('{"name": "John", "age": 30, "active": true}')) with the complete schema last:

SELECT
  variant_get(data, '$.name') as name,
  variant_get(data, '$.age', 'int') as age
FROM tbl_shredded_variant;

Member:
FYI, added vectorization support in #6224.

@ayushtkn (Member, Author):
No Denys, it throws the same MALFORMED_VARIANT exception as now. I tried pulling in the full one below and added a new record, but any query gives the same exception as in the previous thread.

@deniskuzZ (Member), Dec 19, 2025:
IDK, works fine for me: 1062dc5#diff-779297369ca87b0f30855401004c6819e0b64fb82167861f08dcba08e2ae3859

Execution mode: vectorized
1	NULL
NULL	2
@@ -99,3 +99,57 @@ STAGE PLANS:
Processor Tree:
ListSink

PREHOOK: query: CREATE TABLE t (
id INT,
v VARIANT
)
STORED BY ICEBERG
TBLPROPERTIES (
'format-version'='3',
'variant.shredding.enabled'='true'
)
PREHOOK: type: CREATETABLE
PREHOOK: Output: database:default
PREHOOK: Output: default@t
POSTHOOK: query: CREATE TABLE t (
id INT,
v VARIANT
)
STORED BY ICEBERG
TBLPROPERTIES (
'format-version'='3',
'variant.shredding.enabled'='true'
)
POSTHOOK: type: CREATETABLE
POSTHOOK: Output: database:default
POSTHOOK: Output: default@t
PREHOOK: query: INSERT INTO t VALUES
(1, parse_json('{"a": 1}')),
(2, parse_json('{"b": 2}'))
PREHOOK: type: QUERY
PREHOOK: Input: _dummy_database@_dummy_table
PREHOOK: Output: default@t
POSTHOOK: query: INSERT INTO t VALUES
(1, parse_json('{"a": 1}')),
(2, parse_json('{"b": 2}'))
POSTHOOK: type: QUERY
POSTHOOK: Input: _dummy_database@_dummy_table
POSTHOOK: Output: default@t
PREHOOK: query: SELECT
try_variant_get(v, '$.a'),
try_variant_get(v, '$.b')
FROM t
ORDER BY id
PREHOOK: type: QUERY
PREHOOK: Input: default@t
PREHOOK: Output: hdfs://### HDFS PATH ###
POSTHOOK: query: SELECT
try_variant_get(v, '$.a'),
try_variant_get(v, '$.b')
FROM t
ORDER BY id
POSTHOOK: type: QUERY
POSTHOOK: Input: default@t
POSTHOOK: Output: hdfs://### HDFS PATH ###
1 NULL
NULL 2