Skip to content
Draft
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Original file line number Diff line number Diff line change
@@ -1,31 +1,45 @@
/**
* Copyright 2017-2022 LinkedIn Corporation. All rights reserved.
* Copyright 2017-2026 LinkedIn Corporation. All rights reserved.
* Licensed under the BSD-2 Clause license.
* See LICENSE in the project root for license information.
*/
package com.linkedin.coral.hive.hive2rel;

import java.util.ArrayList;
import java.util.List;

import javax.annotation.Nonnull;

import com.google.common.annotations.VisibleForTesting;
import com.google.common.base.Preconditions;

import org.apache.calcite.plan.RelOptTable;
import org.apache.calcite.rel.RelNode;
import org.apache.calcite.rel.RelRoot;
import org.apache.calcite.rel.logical.LogicalProject;
import org.apache.calcite.rel.type.RelDataType;
import org.apache.calcite.rel.type.RelDataTypeField;
import org.apache.calcite.rex.RexBuilder;
import org.apache.calcite.rex.RexNode;
import org.apache.calcite.sql.SqlNode;
import org.apache.calcite.util.Util;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

import com.linkedin.coral.common.FuzzyUnionSqlRewriter;


/**
* Class that implements {@link org.apache.calcite.plan.RelOptTable.ViewExpander}
* interface to support expansion of Hive Views to relational algebra.
* interface to support expansion of Hive Views to relational algebra. The
* returned {@link RelRoot} is realigned to the caller-provided
* {@link RelDataType} by name (case-insensitive) to preserve the
* {@link RelOptTable.ViewExpander} contract.
*/
public class HiveViewExpander implements RelOptTable.ViewExpander {

private static final Logger LOG = LoggerFactory.getLogger(HiveViewExpander.class);

private final HiveToRelConverter hiveToRelConverter;
/**
* Instantiates a new Hive view expander.
Expand All @@ -46,6 +60,58 @@ public RelRoot expandView(RelDataType rowType, String queryString, List<String>

SqlNode sqlNode = hiveToRelConverter.processView(dbName, tableName)
.accept(new FuzzyUnionSqlRewriter(tableName, hiveToRelConverter));
return hiveToRelConverter.getSqlToRelConverter().convertQuery(sqlNode, true, true);
RelRoot root = hiveToRelConverter.getSqlToRelConverter().convertQuery(sqlNode, true, true);
return alignToRowType(root, rowType);
}

/**
* Wrap {@code root} in a {@link LogicalProject} that re-orders its output
* fields to match {@code expected} by name (case-insensitive). No-op when the
* orderings already agree. If a name is missing from {@code root} or arities
* differ, returns {@code root} unchanged and logs a warning.
*/
Comment on lines +67 to +72
Copy link
Copy Markdown
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Is not the right behavior to throw an exception? There should not be a disagreement between the caller expectation and the view structure in the first place.

Copy link
Copy Markdown
Collaborator Author

@simbadzina simbadzina May 13, 2026

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Do you mean for when there is a mismatch like an missing field, or even when there is an ordering issue?

For the ordering, I think we can self-heal drift with the alignment code since there is a complete info.

I'm assuming a change like a new column could show up a mismatch, missing field.
Automatically taking the root would expose the column to consumers, whereas an error would break the view. I think for select * we'd want the new column to seamless appear.

Copy link
Copy Markdown
Collaborator Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

The source of the divergence I observed was in another system, outside of Coral. During view creation, the storage columns were being explicitly set in the storageDescriptor of the view in a different order than in the view text. This particular instance of divergence has been fixed in that system.

To guard against similar broken views Coral can throw an exception as you suggested, or at the least log a warning.

@VisibleForTesting
static RelRoot alignToRowType(RelRoot root, RelDataType expected) {
final RelNode rel = root.rel;
final RelDataType actual = rel.getRowType();
if (expected.getFieldCount() != actual.getFieldCount()) {
LOG.warn("Skipping row-type alignment: expected {} fields, expanded view produced {}. expected={}, actual={}",
expected.getFieldCount(), actual.getFieldCount(), expected, actual);
return root;
}
if (fieldNamesAlignedByOrder(expected, actual)) {
return root;
}
final List<RexNode> projects = new ArrayList<>(expected.getFieldCount());
final List<String> names = new ArrayList<>(expected.getFieldCount());
final RexBuilder rexBuilder = rel.getCluster().getRexBuilder();
for (RelDataTypeField expectedField : expected.getFieldList()) {
// case-insensitive name lookup, no struct-field traversal
final RelDataTypeField actualField = actual.getField(expectedField.getName(), false, false);
if (actualField == null) {
LOG.warn(
"Skipping row-type alignment: expected field '{}' is absent from expanded view. expected={}, actual={}",
expectedField.getName(), expected, actual);
return root;
}
projects.add(rexBuilder.makeInputRef(actualField.getType(), actualField.getIndex()));
names.add(expectedField.getName());
}
final RelNode aligned = LogicalProject.create(rel, projects, names);
return RelRoot.of(aligned, root.kind);
}

private static boolean fieldNamesAlignedByOrder(RelDataType a, RelDataType b) {
if (a.getFieldCount() != b.getFieldCount()) {
return false;
}
final List<RelDataTypeField> af = a.getFieldList();
final List<RelDataTypeField> bf = b.getFieldList();
for (int i = 0; i < af.size(); i++) {
if (!af.get(i).getName().equalsIgnoreCase(bf.get(i).getName())) {
return false;
}
}
return true;
}
}
Original file line number Diff line number Diff line change
Expand Up @@ -13,8 +13,13 @@
import com.google.common.collect.ImmutableList;
import com.google.common.collect.ImmutableMap;

import org.apache.calcite.jdbc.JavaTypeFactoryImpl;
import org.apache.calcite.plan.RelOptUtil;
import org.apache.calcite.rel.RelNode;
import org.apache.calcite.rel.RelRoot;
import org.apache.calcite.rel.type.RelDataType;
import org.apache.calcite.rel.type.RelDataTypeFactory;
import org.apache.calcite.rel.type.RelDataTypeField;
import org.apache.calcite.sql.SqlNode;
import org.apache.calcite.sql.type.ReturnTypes;
import org.apache.calcite.sql.type.SqlTypeFamily;
Expand Down Expand Up @@ -668,6 +673,40 @@ public void testIntCastToBigIntDuringComparison() {
assertEquals(relToString(sql), expected);
}

/**
* Regression test for the {@link org.apache.calcite.plan.RelOptTable.ViewExpander} contract:
* the {@link org.apache.calcite.rel.RelRoot} returned from {@link HiveViewExpander#expandView}
* must have a row type that matches the caller-provided {@code rowType}, even when the
* re-parsed view body would naturally produce its columns in a different order. Without the
* alignment, downstream consumers that rely on positional access (e.g. {@code SELECT *} over
* a view-on-view) would index into the wrong RelNode output column.
*/
@Test
public void testExpandViewAlignsToCallerRowType() {
// Build a caller-provided rowType whose order is the reverse of the view body's
// natural order. The view body produces [col_a INT, col_b STRING]; we ask for
// [col_b STRING, col_a INT].
RelDataTypeFactory typeFactory = new JavaTypeFactoryImpl();
RelDataType reversedRowType = typeFactory.builder().add("col_b", typeFactory.createSqlType(SqlTypeName.VARCHAR))
.add("col_a", typeFactory.createSqlType(SqlTypeName.INTEGER)).build();

HiveViewExpander expander = new HiveViewExpander(converter);
RelRoot result = expander.expandView(reversedRowType, "ignored", ImmutableList.of("hive", "test"),
ImmutableList.of("realign_view"));

List<String> actualNames = result.rel.getRowType().getFieldNames();
assertEquals(actualNames.size(), 2, "Expected 2 output columns");
assertEquals(actualNames.get(0).toLowerCase(), "col_b",
"Position 0 must be col_b (matches caller-provided rowType), got: " + actualNames);
assertEquals(actualNames.get(1).toLowerCase(), "col_a",
"Position 1 must be col_a (matches caller-provided rowType), got: " + actualNames);

// The types must also line up with the caller-provided rowType, not the body's natural order.
List<RelDataTypeField> fields = result.rel.getRowType().getFieldList();
assertEquals(fields.get(0).getType().getSqlTypeName(), SqlTypeName.VARCHAR);
assertEquals(fields.get(1).getType().getSqlTypeName(), SqlTypeName.INTEGER);
}

private String relToString(String sql) {
return RelOptUtil.toString(converter.convertSql(sql));
}
Expand Down
Original file line number Diff line number Diff line change
@@ -0,0 +1,215 @@
/**
* Copyright 2026 LinkedIn Corporation. All rights reserved.
* Licensed under the BSD-2 Clause license.
* See LICENSE in the project root for license information.
*/
package com.linkedin.coral.hive.hive2rel;

import java.util.List;

import org.apache.calcite.jdbc.JavaTypeFactoryImpl;
import org.apache.calcite.plan.RelOptCluster;
import org.apache.calcite.plan.volcano.VolcanoPlanner;
import org.apache.calcite.rel.RelNode;
import org.apache.calcite.rel.RelRoot;
import org.apache.calcite.rel.core.Values;
import org.apache.calcite.rel.logical.LogicalProject;
import org.apache.calcite.rel.logical.LogicalValues;
import org.apache.calcite.rel.type.RelDataType;
import org.apache.calcite.rel.type.RelDataTypeFactory;
import org.apache.calcite.rel.type.RelDataTypeField;
import org.apache.calcite.rex.RexBuilder;
import org.apache.calcite.rex.RexInputRef;
import org.apache.calcite.sql.SqlKind;
import org.apache.calcite.sql.type.SqlTypeName;
import org.testng.annotations.BeforeClass;
import org.testng.annotations.Test;

import static org.testng.Assert.assertEquals;
import static org.testng.Assert.assertFalse;
import static org.testng.Assert.assertSame;
import static org.testng.Assert.assertTrue;


/**
* Unit tests for {@link HiveViewExpander#alignToRowType}. These exercise the
* alignment logic in isolation -- no Hive metastore or end-to-end view expansion
* required.
*/
public class HiveViewExpanderTest {

private RelDataTypeFactory typeFactory;
private RelOptCluster cluster;

@BeforeClass
public void setUp() {
typeFactory = new JavaTypeFactoryImpl();
cluster = RelOptCluster.create(new VolcanoPlanner(), new RexBuilder(typeFactory));
}

private RelDataType rowType(Object... nameTypePairs) {
if (nameTypePairs.length % 2 != 0) {
throw new IllegalArgumentException("Expected (name, type) pairs");
}
RelDataTypeFactory.Builder builder = typeFactory.builder();
for (int i = 0; i < nameTypePairs.length; i += 2) {
builder.add((String) nameTypePairs[i], (SqlTypeName) nameTypePairs[i + 1]);
}
return builder.build();
}

private RelRoot rootWithRowType(RelDataType type) {
RelNode rel = LogicalValues.createEmpty(cluster, type);
return RelRoot.of(rel, SqlKind.SELECT);
}

@Test
public void testAlignedByOrderReturnsSameRoot() {
RelDataType type = rowType("a", SqlTypeName.INTEGER, "b", SqlTypeName.VARCHAR);
RelRoot root = rootWithRowType(type);

RelRoot result = HiveViewExpander.alignToRowType(root, type);

assertSame(result, root, "Already-aligned root should be returned unchanged");
}

@Test
public void testCaseDifferencesOnlyReturnsSameRoot() {
RelRoot root = rootWithRowType(rowType("colA", SqlTypeName.INTEGER, "colB", SqlTypeName.VARCHAR));
RelDataType expected = rowType("cola", SqlTypeName.INTEGER, "colb", SqlTypeName.VARCHAR);

RelRoot result = HiveViewExpander.alignToRowType(root, expected);

assertSame(result, root, "Names differing only in case should be treated as aligned (case-insensitive)");
}

@Test
public void testReorderedFieldsWrapInProject() {
RelRoot root = rootWithRowType(rowType("b", SqlTypeName.VARCHAR, "a", SqlTypeName.INTEGER));
RelDataType expected = rowType("a", SqlTypeName.INTEGER, "b", SqlTypeName.VARCHAR);

RelRoot result = HiveViewExpander.alignToRowType(root, expected);

assertTrue(result.rel instanceof LogicalProject,
"Reordered fields should result in a LogicalProject wrapper, got " + result.rel.getClass());
LogicalProject project = (LogicalProject) result.rel;
assertTrue(project.getInput() instanceof Values, "Wrapped Project should sit directly on the input");

List<RelDataTypeField> outFields = result.rel.getRowType().getFieldList();
assertEquals(outFields.size(), 2);
assertEquals(outFields.get(0).getName(), "a");
assertEquals(outFields.get(0).getType().getSqlTypeName(), SqlTypeName.INTEGER);
assertEquals(outFields.get(1).getName(), "b");
assertEquals(outFields.get(1).getType().getSqlTypeName(), SqlTypeName.VARCHAR);
}

/**
* Models the production failure mode this method exists to prevent: an
* upstream view body that lists {@code idHash} (the longer name) ahead of
* its sibling {@code id} (the shorter name, a prefix of the other)
* introduces column aliases whose ordering disagrees with the downstream
* consumer's expected ordering. Without realignment, the downstream's
* positional access lands {@code id} on the {@code idHash} position and
* vice versa, silently swapping every value read from those columns.
*/
@Test
public void testPrefixSiblingsReorderedCorrectly() {
RelRoot root = rootWithRowType(rowType("idHash", SqlTypeName.VARCHAR, "id", SqlTypeName.INTEGER));
RelDataType expected = rowType("id", SqlTypeName.INTEGER, "idHash", SqlTypeName.VARCHAR);

RelRoot result = HiveViewExpander.alignToRowType(root, expected);

assertTrue(result.rel instanceof LogicalProject);
LogicalProject project = (LogicalProject) result.rel;

// Output field order matches expected: id (INTEGER) first, idHash (VARCHAR) second.
List<RelDataTypeField> outFields = result.rel.getRowType().getFieldList();
assertEquals(outFields.get(0).getName(), "id");
assertEquals(outFields.get(0).getType().getSqlTypeName(), SqlTypeName.INTEGER);
assertEquals(outFields.get(1).getName(), "idHash");
assertEquals(outFields.get(1).getType().getSqlTypeName(), SqlTypeName.VARCHAR);

// And the underlying input refs cross-link to the correct source positions:
// expected[0]=id -> input position 1; expected[1]=idHash -> input position 0.
assertTrue(project.getProjects().get(0) instanceof RexInputRef);
assertTrue(project.getProjects().get(1) instanceof RexInputRef);
assertEquals(((RexInputRef) project.getProjects().get(0)).getIndex(), 1);
assertEquals(((RexInputRef) project.getProjects().get(1)).getIndex(), 0);
}

/**
* Realignment over multiple prefix-name pairs at once. Each pair in the input
* row type lists the longer name first ({@code idHash} before {@code id},
* {@code emailVerified} before {@code email}, {@code countryCode} before
* {@code country}); the caller's expected row type lists them the other way
* around. Mirrors the realistic failure shape where an upstream view body's
* case-sensitive alphabetical sort places longer prefix-extensions ahead of
* their shorter siblings, and a downstream consumer expects the
* lowercase-alphabetical (prefix-first) order. Each pair must be realigned
* independently and correctly.
*/
@Test
public void testMultiplePrefixSiblingsAllReorderedCorrectly() {
RelRoot root = rootWithRowType(
rowType("idHash", SqlTypeName.VARCHAR, "id", SqlTypeName.INTEGER, "emailVerified", SqlTypeName.BOOLEAN, "email",
SqlTypeName.VARCHAR, "countryCode", SqlTypeName.VARCHAR, "country", SqlTypeName.VARCHAR));
RelDataType expected =
rowType("id", SqlTypeName.INTEGER, "idHash", SqlTypeName.VARCHAR, "email", SqlTypeName.VARCHAR, "emailVerified",
SqlTypeName.BOOLEAN, "country", SqlTypeName.VARCHAR, "countryCode", SqlTypeName.VARCHAR);

RelRoot result = HiveViewExpander.alignToRowType(root, expected);

assertTrue(result.rel instanceof LogicalProject);
LogicalProject project = (LogicalProject) result.rel;

// Output names and types follow the expected (prefix-first) ordering.
List<RelDataTypeField> outFields = result.rel.getRowType().getFieldList();
assertEquals(outFields.size(), 6);
assertEquals(outFields.get(0).getName(), "id");
assertEquals(outFields.get(0).getType().getSqlTypeName(), SqlTypeName.INTEGER);
assertEquals(outFields.get(1).getName(), "idHash");
assertEquals(outFields.get(1).getType().getSqlTypeName(), SqlTypeName.VARCHAR);
assertEquals(outFields.get(2).getName(), "email");
assertEquals(outFields.get(2).getType().getSqlTypeName(), SqlTypeName.VARCHAR);
assertEquals(outFields.get(3).getName(), "emailVerified");
assertEquals(outFields.get(3).getType().getSqlTypeName(), SqlTypeName.BOOLEAN);
assertEquals(outFields.get(4).getName(), "country");
assertEquals(outFields.get(4).getType().getSqlTypeName(), SqlTypeName.VARCHAR);
assertEquals(outFields.get(5).getName(), "countryCode");
assertEquals(outFields.get(5).getType().getSqlTypeName(), SqlTypeName.VARCHAR);

// Each expected position points at the correct source position in the input row type.
// Input order was: [idHash=0, id=1, emailVerified=2, email=3, countryCode=4, country=5].
int[] expectedSourceIndices = { 1, 0, 3, 2, 5, 4 };
for (int i = 0; i < expectedSourceIndices.length; i++) {
assertTrue(project.getProjects().get(i) instanceof RexInputRef, "Project " + i + " should be a RexInputRef");
assertEquals(((RexInputRef) project.getProjects().get(i)).getIndex(), expectedSourceIndices[i],
"Expected position " + i + " (" + outFields.get(i).getName() + ") should source from input position "
+ expectedSourceIndices[i]);
}
}

@Test
public void testMissingFieldReturnsOriginalRoot() {
RelRoot root = rootWithRowType(rowType("a", SqlTypeName.INTEGER, "b", SqlTypeName.VARCHAR));
// `c` is not in the input -- alignment should bail rather than fabricate.
RelDataType expected = rowType("a", SqlTypeName.INTEGER, "c", SqlTypeName.VARCHAR);

RelRoot result = HiveViewExpander.alignToRowType(root, expected);

assertSame(result, root, "Missing field should trigger safe fallback to the original root");
}

@Test
public void testDifferentArityReturnsOriginalRoot() {
RelRoot root =
rootWithRowType(rowType("a", SqlTypeName.INTEGER, "b", SqlTypeName.VARCHAR, "c", SqlTypeName.DOUBLE));
RelDataType expected = rowType("a", SqlTypeName.INTEGER, "b", SqlTypeName.VARCHAR);

RelRoot result = HiveViewExpander.alignToRowType(root, expected);

assertSame(result, root, "Different arity should trigger safe fallback rather than silent column drop");
assertFalse(result.rel instanceof LogicalProject,
"Different-arity case should not wrap in a Project (would be lossy)");
}
}
Original file line number Diff line number Diff line change
Expand Up @@ -85,6 +85,10 @@ public static TestHive setupDefaultHive(HiveConf conf) throws IOException {
driver.run(
"CREATE TABLE IF NOT EXISTS test.tableInt(tinyint_col tinyint, smallint_col smallint, int_col int, bigint_col bigint)");

// Fixture for HiveViewExpander.expandView() row-type alignment regression test.
driver.run("CREATE TABLE IF NOT EXISTS test.realign_base(col_a int, col_b string)");
driver.run("CREATE VIEW IF NOT EXISTS test.realign_view AS SELECT * FROM test.realign_base");

driver.run("CREATE DATABASE IF NOT EXISTS fuzzy_union");

driver.run("CREATE TABLE IF NOT EXISTS fuzzy_union.tableA(a int, b struct<b1:string>)");
Expand Down
Loading