diff --git a/lance-spark-base_2.12/src/main/java/org/lance/spark/read/ZonemapFragmentPruner.java b/lance-spark-base_2.12/src/main/java/org/lance/spark/read/ZonemapFragmentPruner.java
index 0b13a5a4..0152b1ef 100644
--- a/lance-spark-base_2.12/src/main/java/org/lance/spark/read/ZonemapFragmentPruner.java
+++ b/lance-spark-base_2.12/src/main/java/org/lance/spark/read/ZonemapFragmentPruner.java
@@ -335,14 +335,23 @@ private static String columnName(NamedReference ref) {
   }
 
   /**
-   * V2 {@link Literal} exposes values in Spark's internal representation ({@code UTF8String} for
-   * strings). Zone stats from lance-core store String values — normalize here so {@code compareTo}
-   * against min/max works.
+   * V2 {@link Literal} exposes values in Spark's internal representation, while Lance's JNI
+   * materializes {@code ZoneStats.min/max} with every integer width boxed as {@code Long} and every
+   * floating-point width as {@code Double}. Widen narrow Java boxed primitives (Byte / Short /
+   * Integer / Float) to match — otherwise an Integer literal against a Long zone bound would throw
+   * {@code ClassCastException} from {@code Comparable.compareTo}. Also normalizes {@code
+   * UTF8String} → {@code String} for the same reason.
    */
   private static Object normalizeLiteral(Object value) {
     if (value instanceof UTF8String) {
       return value.toString();
     }
+    if (value instanceof Byte || value instanceof Short || value instanceof Integer) {
+      return ((Number) value).longValue();
+    }
+    if (value instanceof Float) {
+      return ((Float) value).doubleValue();
+    }
     return value;
   }
 
diff --git a/lance-spark-base_2.12/src/test/java/org/lance/spark/read/ZonemapFragmentPrunerTest.java b/lance-spark-base_2.12/src/test/java/org/lance/spark/read/ZonemapFragmentPrunerTest.java
index b097e905..779e60d6 100644
--- a/lance-spark-base_2.12/src/test/java/org/lance/spark/read/ZonemapFragmentPrunerTest.java
+++ b/lance-spark-base_2.12/src/test/java/org/lance/spark/read/ZonemapFragmentPrunerTest.java
@@ -20,6 +20,7 @@
 import org.apache.spark.sql.connector.expressions.filter.Predicate;
 import org.junit.jupiter.api.Test;
 
+import java.sql.Date;
 import java.util.Arrays;
 import java.util.Collections;
 import java.util.HashMap;
@@ -455,4 +456,92 @@ public void testAllNullZoneSkippedForEqualTo() {
     assertTrue(result.isPresent());
     assertEquals(Set.of(0), result.get());
   }
+
+  @Test
+  public void testIntegerLiteralAgainstLongZoneStats() {
+    Map<String, List<ZoneStats>> stats = threeFragmentStats("seq");
+    Predicate[] filters = new Predicate[] {TestPredicates.eq("seq", 150)};
+
+    Optional<Set<Integer>> result = ZonemapFragmentPruner.pruneFragments(filters, stats);
+    assertTrue(result.isPresent());
+    assertEquals(Set.of(1), result.get());
+  }
+
+  @Test
+  public void testShortLiteralAgainstLongZoneStats() {
+    Map<String, List<ZoneStats>> stats = threeFragmentStats("x");
+    Predicate[] filters = new Predicate[] {TestPredicates.gt("x", (short) 150)};
+
+    Optional<Set<Integer>> result = ZonemapFragmentPruner.pruneFragments(filters, stats);
+    assertTrue(result.isPresent());
+    assertEquals(Set.of(1, 2), result.get());
+  }
+
+  @Test
+  public void testByteLiteralAgainstLongZoneStats() {
+    Map<String, List<ZoneStats>> stats = threeFragmentStats("x");
+    Predicate[] filters = new Predicate[] {TestPredicates.lte("x", (byte) 50)};
+
+    Optional<Set<Integer>> result = ZonemapFragmentPruner.pruneFragments(filters, stats);
+    assertTrue(result.isPresent());
+    assertEquals(Set.of(0), result.get());
+  }
+
+  @Test
+  public void testFloatLiteralAgainstDoubleZoneStats() {
+    Map<String, List<ZoneStats>> stats = new HashMap<>();
+    stats.put(
+        "f",
+        Arrays.asList(
+            new ZoneStats(0, 0, 100, 0.0d, 9.9d, 0),
+            new ZoneStats(1, 0, 100, 10.0d, 19.9d, 0),
+            new ZoneStats(2, 0, 100, 20.0d, 29.9d, 0)));
+
+    Predicate[] filters = new Predicate[] {TestPredicates.eq("f", 15.0f)};
+
+    Optional<Set<Integer>> result = ZonemapFragmentPruner.pruneFragments(filters, stats);
+    assertTrue(result.isPresent());
+    assertEquals(Set.of(1), result.get());
+  }
+
+  @Test
+  public void testDateLiteralAgainstLongZoneStats() {
+    // Spark DateType literal is Integer epoch days; Date32 zone bounds are Long
+    // epoch days — same Integer→Long widening as INT32.
+    Map<String, List<ZoneStats>> stats = new HashMap<>();
+    stats.put(
+        "d",
+        Arrays.asList(
+            new ZoneStats(0, 0, 100, 19000L, 19099L, 0),
+            new ZoneStats(1, 0, 100, 19100L, 19199L, 0),
+            new ZoneStats(2, 0, 100, 19200L, 19299L, 0)));
+
+    Predicate[] filters =
+        new Predicate[] {TestPredicates.eq("d", Date.valueOf("2022-04-27"))}; // day 19109
+
+    Optional<Set<Integer>> result = ZonemapFragmentPruner.pruneFragments(filters, stats);
+    assertTrue(result.isPresent());
+    assertEquals(Set.of(1), result.get());
+  }
+
+  @Test
+  public void testInListWithIntegerLiteralsAgainstLongZoneStats() {
+    Map<String, List<ZoneStats>> stats = threeFragmentStats("x");
+    Predicate[] filters = new Predicate[] {TestPredicates.in("x", 50, 250, 999)};
+
+    Optional<Set<Integer>> result = ZonemapFragmentPruner.pruneFragments(filters, stats);
+    assertTrue(result.isPresent());
+    assertEquals(Set.of(0, 2), result.get());
+  }
+
+  @Test
+  public void testInListWithMixedWidthLiteralsAgainstLongZoneStats() {
+    // Per-element widening inside analyzeIn's loop.
+    Map<String, List<ZoneStats>> stats = threeFragmentStats("x");
+    Predicate[] filters = new Predicate[] {TestPredicates.in("x", 50, 250L, (short) 70, (byte) 5)};
+
+    Optional<Set<Integer>> result = ZonemapFragmentPruner.pruneFragments(filters, stats);
+    assertTrue(result.isPresent());
+    assertEquals(Set.of(0, 2), result.get());
+  }
 }