From 2a87feb688392496ec4c51a42b0870a67194a442 Mon Sep 17 00:00:00 2001
From: yangjie01
Date: Tue, 26 May 2026 11:46:46 +0800
Subject: [PATCH] fix: widen narrow numeric literals in zonemap pruner
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Lance JNI normalizes every integer width in ZoneStats.min/max to Long
and every float width to Double, while Spark V2 Literal.value() keeps
the Catalyst type (Integer for int32, Short for smallint, Byte for
tinyint, Float for float32, Integer epoch days for date). The boxes
disagree, and Integer.compareTo(Object) rejects the Long with a
ClassCastException — the existing conservative catch swallows it and
silently drops pruning instead of crashing the query.

Widen Byte/Short/Integer to Long and Float to Double inside
normalizeLiteral. Both conversions are lossless and order-preserving.
Also fixes the IN-list path, which routes each element through the same
normalizer.

Closes #557.
---
 .../spark/read/ZonemapFragmentPruner.java     | 15 +++-
 .../spark/read/ZonemapFragmentPrunerTest.java | 89 +++++++++++++++++++
 2 files changed, 101 insertions(+), 3 deletions(-)

diff --git a/lance-spark-base_2.12/src/main/java/org/lance/spark/read/ZonemapFragmentPruner.java b/lance-spark-base_2.12/src/main/java/org/lance/spark/read/ZonemapFragmentPruner.java
index 0b13a5a4..0152b1ef 100644
--- a/lance-spark-base_2.12/src/main/java/org/lance/spark/read/ZonemapFragmentPruner.java
+++ b/lance-spark-base_2.12/src/main/java/org/lance/spark/read/ZonemapFragmentPruner.java
@@ -335,14 +335,23 @@ private static String columnName(NamedReference ref) {
   }
 
   /**
-   * V2 {@link Literal} exposes values in Spark's internal representation ({@code UTF8String} for
-   * strings). Zone stats from lance-core store String values — normalize here so {@code compareTo}
-   * against min/max works.
+   * V2 {@link Literal} exposes values in Spark's internal representation, while Lance's JNI
+   * materializes {@code ZoneStats.min/max} with every integer width boxed as {@code Long} and every
+   * floating-point width as {@code Double}. Widen narrow Java boxed primitives (Byte / Short /
+   * Integer / Float) to match — otherwise an Integer literal against a Long zone bound would throw
+   * {@code ClassCastException} from {@code Comparable.compareTo}. Also normalizes {@code
+   * UTF8String} → {@code String} for the same reason.
    */
   private static Object normalizeLiteral(Object value) {
     if (value instanceof UTF8String) {
       return value.toString();
     }
+    if (value instanceof Byte || value instanceof Short || value instanceof Integer) {
+      return ((Number) value).longValue();
+    }
+    if (value instanceof Float) {
+      return ((Float) value).doubleValue();
+    }
     return value;
   }
 
diff --git a/lance-spark-base_2.12/src/test/java/org/lance/spark/read/ZonemapFragmentPrunerTest.java b/lance-spark-base_2.12/src/test/java/org/lance/spark/read/ZonemapFragmentPrunerTest.java
index b097e905..779e60d6 100644
--- a/lance-spark-base_2.12/src/test/java/org/lance/spark/read/ZonemapFragmentPrunerTest.java
+++ b/lance-spark-base_2.12/src/test/java/org/lance/spark/read/ZonemapFragmentPrunerTest.java
@@ -20,6 +20,7 @@
 import org.apache.spark.sql.connector.expressions.filter.Predicate;
 import org.junit.jupiter.api.Test;
 
+import java.sql.Date;
 import java.util.Arrays;
 import java.util.Collections;
 import java.util.HashMap;
@@ -455,4 +456,92 @@ public void testAllNullZoneSkippedForEqualTo() {
     assertTrue(result.isPresent());
     assertEquals(Set.of(0), result.get());
   }
+
+  @Test
+  public void testIntegerLiteralAgainstLongZoneStats() {
+    Map<String, List<ZoneStats>> stats = threeFragmentStats("seq");
+    Predicate[] filters = new Predicate[] {TestPredicates.eq("seq", 150)};
+
+    Optional<Set<Integer>> result = ZonemapFragmentPruner.pruneFragments(filters, stats);
+    assertTrue(result.isPresent());
+    assertEquals(Set.of(1), result.get());
+  }
+
+  @Test
+  public void testShortLiteralAgainstLongZoneStats() {
+    Map<String, List<ZoneStats>> stats = threeFragmentStats("x");
+    Predicate[] filters = new Predicate[] {TestPredicates.gt("x", (short) 150)};
+
+    Optional<Set<Integer>> result = ZonemapFragmentPruner.pruneFragments(filters, stats);
+    assertTrue(result.isPresent());
+    assertEquals(Set.of(1, 2), result.get());
+  }
+
+  @Test
+  public void testByteLiteralAgainstLongZoneStats() {
+    Map<String, List<ZoneStats>> stats = threeFragmentStats("x");
+    Predicate[] filters = new Predicate[] {TestPredicates.lte("x", (byte) 50)};
+
+    Optional<Set<Integer>> result = ZonemapFragmentPruner.pruneFragments(filters, stats);
+    assertTrue(result.isPresent());
+    assertEquals(Set.of(0), result.get());
+  }
+
+  @Test
+  public void testFloatLiteralAgainstDoubleZoneStats() {
+    Map<String, List<ZoneStats>> stats = new HashMap<>();
+    stats.put(
+        "f",
+        Arrays.asList(
+            new ZoneStats(0, 0, 100, 0.0d, 9.9d, 0),
+            new ZoneStats(1, 0, 100, 10.0d, 19.9d, 0),
+            new ZoneStats(2, 0, 100, 20.0d, 29.9d, 0)));
+
+    Predicate[] filters = new Predicate[] {TestPredicates.eq("f", 15.0f)};
+
+    Optional<Set<Integer>> result = ZonemapFragmentPruner.pruneFragments(filters, stats);
+    assertTrue(result.isPresent());
+    assertEquals(Set.of(1), result.get());
+  }
+
+  @Test
+  public void testDateLiteralAgainstLongZoneStats() {
+    // Spark DateType literal is Integer epoch days; Date32 zone bounds are Long
+    // epoch days — same Integer→Long widening as INT32.
+    Map<String, List<ZoneStats>> stats = new HashMap<>();
+    stats.put(
+        "d",
+        Arrays.asList(
+            new ZoneStats(0, 0, 100, 19000L, 19099L, 0),
+            new ZoneStats(1, 0, 100, 19100L, 19199L, 0),
+            new ZoneStats(2, 0, 100, 19200L, 19299L, 0)));
+
+    Predicate[] filters =
+        new Predicate[] {TestPredicates.eq("d", Date.valueOf("2022-04-27"))}; // day 19109
+
+    Optional<Set<Integer>> result = ZonemapFragmentPruner.pruneFragments(filters, stats);
+    assertTrue(result.isPresent());
+    assertEquals(Set.of(1), result.get());
+  }
+
+  @Test
+  public void testInListWithIntegerLiteralsAgainstLongZoneStats() {
+    Map<String, List<ZoneStats>> stats = threeFragmentStats("x");
+    Predicate[] filters = new Predicate[] {TestPredicates.in("x", 50, 250, 999)};
+
+    Optional<Set<Integer>> result = ZonemapFragmentPruner.pruneFragments(filters, stats);
+    assertTrue(result.isPresent());
+    assertEquals(Set.of(0, 2), result.get());
+  }
+
+  @Test
+  public void testInListWithMixedWidthLiteralsAgainstLongZoneStats() {
+    // Per-element widening inside analyzeIn's loop.
+    Map<String, List<ZoneStats>> stats = threeFragmentStats("x");
+    Predicate[] filters = new Predicate[] {TestPredicates.in("x", 50, 250L, (short) 70, (byte) 5)};
+
+    Optional<Set<Integer>> result = ZonemapFragmentPruner.pruneFragments(filters, stats);
+    assertTrue(result.isPresent());
+    assertEquals(Set.of(0, 2), result.get());
+  }
 }