From 543ecb6e6d93cfb02ece7994f8f2382463c43d9e Mon Sep 17 00:00:00 2001 From: Bharath Krishna Date: Fri, 3 Apr 2026 14:01:40 -0700 Subject: [PATCH 1/3] API: Implement notStartsWith bounds check in StrictMetricsEvaluator When column bounds are entirely outside the prefix range, all rows must satisfy notStartsWith. Previously this always returned ROWS_MIGHT_NOT_MATCH regardless of bounds, missing an optimization opportunity for file-level pruning. Now returns ROWS_MUST_MATCH when: - Lower bound truncated to prefix length > prefix (all values above) - Upper bound truncated to prefix length < prefix (all values below) - Column contains only null values (nulls satisfy NOT predicates) Follows the same truncation pattern used in InclusiveMetricsEvaluator.startsWith and the null-handling pattern from StrictMetricsEvaluator.notEq. --- .../expressions/StrictMetricsEvaluator.java | 36 +++++- .../TestStrictMetricsEvaluator.java | 116 ++++++++++++++++++ 2 files changed, 150 insertions(+), 2 deletions(-) diff --git a/api/src/main/java/org/apache/iceberg/expressions/StrictMetricsEvaluator.java b/api/src/main/java/org/apache/iceberg/expressions/StrictMetricsEvaluator.java index c225f21da8a8..e969428e5abd 100644 --- a/api/src/main/java/org/apache/iceberg/expressions/StrictMetricsEvaluator.java +++ b/api/src/main/java/org/apache/iceberg/expressions/StrictMetricsEvaluator.java @@ -22,6 +22,7 @@ import java.nio.ByteBuffer; import java.util.Collection; +import java.util.Comparator; import java.util.Map; import java.util.Set; import java.util.stream.Collectors; @@ -29,6 +30,7 @@ import org.apache.iceberg.DataFile; import org.apache.iceberg.Schema; import org.apache.iceberg.expressions.ExpressionVisitors.BoundExpressionVisitor; +import org.apache.iceberg.types.Comparators; import org.apache.iceberg.types.Conversions; import org.apache.iceberg.types.Types.StructType; import org.apache.iceberg.util.NaNUtil; @@ -467,8 +469,38 @@ public Boolean startsWith(BoundReference ref, Literal lit) { @Override public Boolean notStartsWith(BoundReference ref, Literal lit) { - // TODO: Handle cases that definitely cannot match, such as notStartsWith("x") when the bounds - // are ["a", "b"]. + int id = ref.fieldId(); + if (isNestedColumn(id)) { + return ROWS_MIGHT_NOT_MATCH; + } + + if (containsNullsOnly(id)) { + return ROWS_MUST_MATCH; + } + + String prefix = (String) lit.value(); + Comparator comparator = Comparators.charSequences(); + + if (lowerBounds != null && lowerBounds.containsKey(id)) { + CharSequence lower = Conversions.fromByteBuffer(ref.type(), lowerBounds.get(id)); + // if the lower bound, truncated to the prefix length, is strictly greater than the prefix, + // then all values are above the prefix range and none can start with it + int length = Math.min(prefix.length(), lower.length()); + if (comparator.compare(lower.subSequence(0, length), prefix) > 0) { + return ROWS_MUST_MATCH; + } + } + + if (upperBounds != null && upperBounds.containsKey(id)) { + CharSequence upper = Conversions.fromByteBuffer(ref.type(), upperBounds.get(id)); + // if the upper bound, truncated to the prefix length, is strictly less than the prefix, + // then all values are below the prefix range and none can start with it + int length = Math.min(prefix.length(), upper.length()); + if (comparator.compare(upper.subSequence(0, length), prefix) < 0) { + return ROWS_MUST_MATCH; + } + } + return ROWS_MIGHT_NOT_MATCH; } diff --git a/api/src/test/java/org/apache/iceberg/expressions/TestStrictMetricsEvaluator.java b/api/src/test/java/org/apache/iceberg/expressions/TestStrictMetricsEvaluator.java index f34cd730df77..fa3e44b8f095 100644 --- a/api/src/test/java/org/apache/iceberg/expressions/TestStrictMetricsEvaluator.java +++ b/api/src/test/java/org/apache/iceberg/expressions/TestStrictMetricsEvaluator.java @@ -32,6 +32,7 @@ import static org.apache.iceberg.expressions.Expressions.notIn; import static org.apache.iceberg.expressions.Expressions.notNaN; import static org.apache.iceberg.expressions.Expressions.notNull; +import static org.apache.iceberg.expressions.Expressions.notStartsWith; import static org.apache.iceberg.expressions.Expressions.or; import static org.apache.iceberg.types.Conversions.toByteBuffer; import static org.apache.iceberg.types.Types.NestedField.optional; @@ -172,6 +173,40 @@ public class TestStrictMetricsEvaluator { // upper bounds ImmutableMap.of(5, toByteBuffer(StringType.get(), "bbb"))); + // String-focused file: required column 3 has no nulls and string bounds ["abc", "abd"] + private static final DataFile STRING_FILE = + new TestDataFile( + "string_file.avro", + Row.of(), + 50, + // any value counts, including nulls + ImmutableMap.of(3, 50L), + // null value counts + ImmutableMap.of(), + // nan value counts + null, + // lower bounds + ImmutableMap.of(3, toByteBuffer(StringType.get(), "abc")), + // upper bounds + ImmutableMap.of(3, toByteBuffer(StringType.get(), "abd"))); + + // String file with wider range: required column 3 has no nulls and bounds ["aa", "dC"] + private static final DataFile STRING_FILE_2 = + new TestDataFile( + "string_file_2.avro", + Row.of(), + 50, + // any value counts, including nulls + ImmutableMap.of(3, 50L), + // null value counts + ImmutableMap.of(), + // nan value counts + null, + // lower bounds + ImmutableMap.of(3, toByteBuffer(StringType.get(), "aa")), + // upper bounds + ImmutableMap.of(3, toByteBuffer(StringType.get(), "dC"))); + @Test public void testAllNulls() { boolean shouldRead = new StrictMetricsEvaluator(SCHEMA, notNull("all_nulls")).eval(FILE); @@ -684,4 +719,85 @@ SCHEMA, lessThanOrEqual("struct.nested_col_with_stats", INT_MAX_VALUE)) new StrictMetricsEvaluator(SCHEMA, notNull("struct.nested_col_with_stats")).eval(FILE); assertThat(shouldRead).as("notNull nested column should not match").isFalse(); } + + @Test + public void testNotStartsWithAllNulls() { + // all_nulls column (col 4) has all null values; no value can start with any prefix + boolean shouldRead = + new StrictMetricsEvaluator(SCHEMA, notStartsWith("all_nulls", "a")).eval(FILE); + assertThat(shouldRead).as("Should match: all null values satisfy notStartsWith").isTrue(); + } + + @Test + public void testNotStartsWithBoundsAbovePrefix() { + // STRING_FILE: required column 3 has bounds ["abc", "abd"] + // prefix "aaa" is below the lower bound truncated to 3 chars ("abc" > "aaa") + boolean shouldRead = + new StrictMetricsEvaluator(SCHEMA, notStartsWith("required", "aaa")).eval(STRING_FILE); + assertThat(shouldRead).as("Should match: all values are above the prefix range").isTrue(); + } + + @Test + public void testNotStartsWithBoundsBelowPrefix() { + // STRING_FILE: required column 3 has bounds ["abc", "abd"] + // prefix "zzz" is above the upper bound truncated to 3 chars ("abd" < "zzz") + boolean shouldRead = + new StrictMetricsEvaluator(SCHEMA, notStartsWith("required", "zzz")).eval(STRING_FILE); + assertThat(shouldRead).as("Should match: all values are below the prefix range").isTrue(); + } + + @Test + public void testNotStartsWithBoundsOverlapPrefix() { + // STRING_FILE: required column 3 has bounds ["abc", "abd"] + // prefix "ab" overlaps the bounds — some values could start with "ab" + boolean shouldRead = + new StrictMetricsEvaluator(SCHEMA, notStartsWith("required", "ab")).eval(STRING_FILE); + assertThat(shouldRead).as("Should not match: bounds overlap the prefix range").isFalse(); + + // prefix "abc" overlaps the lower bound of ["abc", "abd"] + shouldRead = + new StrictMetricsEvaluator(SCHEMA, notStartsWith("required", "abc")).eval(STRING_FILE); + assertThat(shouldRead).as("Should not match: lower bound starts with the prefix").isFalse(); + } + + @Test + public void testNotStartsWithWiderRange() { + // STRING_FILE_2: required column 3 has bounds ["aa", "dC"] + // prefix "e" is above the upper bound truncated to 1 char ("d" < "e") + boolean shouldRead = + new StrictMetricsEvaluator(SCHEMA, notStartsWith("required", "e")).eval(STRING_FILE_2); + assertThat(shouldRead).as("Should match: all values are below the prefix").isTrue(); + + // prefix "a" overlaps the bounds — some values start with "a" + shouldRead = + new StrictMetricsEvaluator(SCHEMA, notStartsWith("required", "a")).eval(STRING_FILE_2); + assertThat(shouldRead).as("Should not match: lower bound starts with the prefix").isFalse(); + + // prefix "c" is within the range ["aa", "dC"] + shouldRead = + new StrictMetricsEvaluator(SCHEMA, notStartsWith("required", "c")).eval(STRING_FILE_2); + assertThat(shouldRead).as("Should not match: prefix is within the bounds range").isFalse(); + } + + @Test + public void testNotStartsWithNoStats() { + // FILE has no string bounds for column 3 ("required") + boolean shouldRead = + new StrictMetricsEvaluator(SCHEMA, notStartsWith("required", "a")).eval(FILE); + assertThat(shouldRead).as("Should not match: no bounds available for column").isFalse(); + } + + @Test + public void testNotStartsWithSomeNullsBoundsOutsidePrefix() { + // FILE_2: column 5 (some_nulls) has 10 nulls, bounds ["bbb", "eee"] + // prefix "zzz" is above the upper bound + boolean shouldRead = + new StrictMetricsEvaluator(SCHEMA, notStartsWith("some_nulls", "zzz")).eval(FILE_2); + assertThat(shouldRead).as("Should match: all values are below the prefix").isTrue(); + + // prefix "aaa" is below the lower bound + shouldRead = + new StrictMetricsEvaluator(SCHEMA, notStartsWith("some_nulls", "aaa")).eval(FILE_2); + assertThat(shouldRead).as("Should match: all values are above the prefix").isTrue(); + } } From bbb3deb6fd7b38a8bbd4eea6ad06ba8272bcf81d Mon Sep 17 00:00:00 2001 From: Bharath Krishna Date: Fri, 3 Apr 2026 15:31:57 -0700 Subject: [PATCH 2/3] API: Fix comments and add test for prefix longer than bounds --- .../expressions/StrictMetricsEvaluator.java | 6 ++-- .../TestStrictMetricsEvaluator.java | 31 +++++++++---------- 2 files changed, 17 insertions(+), 20 deletions(-) diff --git a/api/src/main/java/org/apache/iceberg/expressions/StrictMetricsEvaluator.java b/api/src/main/java/org/apache/iceberg/expressions/StrictMetricsEvaluator.java index e969428e5abd..5d981e7ed139 100644 --- a/api/src/main/java/org/apache/iceberg/expressions/StrictMetricsEvaluator.java +++ b/api/src/main/java/org/apache/iceberg/expressions/StrictMetricsEvaluator.java @@ -483,8 +483,7 @@ public Boolean notStartsWith(BoundReference ref, Literal lit) { if (lowerBounds != null && lowerBounds.containsKey(id)) { CharSequence lower = Conversions.fromByteBuffer(ref.type(), lowerBounds.get(id)); - // if the lower bound, truncated to the prefix length, is strictly greater than the prefix, - // then all values are above the prefix range and none can start with it + // truncate lower bound so that its length is not greater than the length of prefix int length = Math.min(prefix.length(), lower.length()); if (comparator.compare(lower.subSequence(0, length), prefix) > 0) { return ROWS_MUST_MATCH; @@ -493,8 +492,7 @@ public Boolean notStartsWith(BoundReference ref, Literal lit) { if (upperBounds != null && upperBounds.containsKey(id)) { CharSequence upper = Conversions.fromByteBuffer(ref.type(), upperBounds.get(id)); - // if the upper bound, truncated to the prefix length, is strictly less than the prefix, - // then all values are below the prefix range and none can start with it + // truncate upper bound so that its length is not greater than the length of prefix int length = Math.min(prefix.length(), upper.length()); if (comparator.compare(upper.subSequence(0, length), prefix) < 0) { return ROWS_MUST_MATCH; diff --git a/api/src/test/java/org/apache/iceberg/expressions/TestStrictMetricsEvaluator.java b/api/src/test/java/org/apache/iceberg/expressions/TestStrictMetricsEvaluator.java index fa3e44b8f095..135fd712764d 100644 --- a/api/src/test/java/org/apache/iceberg/expressions/TestStrictMetricsEvaluator.java +++ b/api/src/test/java/org/apache/iceberg/expressions/TestStrictMetricsEvaluator.java @@ -722,7 +722,6 @@ SCHEMA, lessThanOrEqual("struct.nested_col_with_stats", INT_MAX_VALUE)) @Test public void testNotStartsWithAllNulls() { - // all_nulls column (col 4) has all null values; no value can start with any prefix boolean shouldRead = new StrictMetricsEvaluator(SCHEMA, notStartsWith("all_nulls", "a")).eval(FILE); assertThat(shouldRead).as("Should match: all null values satisfy notStartsWith").isTrue(); @@ -730,8 +729,6 @@ public void testNotStartsWithAllNulls() { @Test public void testNotStartsWithBoundsAbovePrefix() { - // STRING_FILE: required column 3 has bounds ["abc", "abd"] - // prefix "aaa" is below the lower bound truncated to 3 chars ("abc" > "aaa") boolean shouldRead = new StrictMetricsEvaluator(SCHEMA, notStartsWith("required", "aaa")).eval(STRING_FILE); assertThat(shouldRead).as("Should match: all values are above the prefix range").isTrue(); @@ -739,8 +736,6 @@ public void testNotStartsWithBoundsAbovePrefix() { @Test public void testNotStartsWithBoundsBelowPrefix() { - // STRING_FILE: required column 3 has bounds ["abc", "abd"] - // prefix "zzz" is above the upper bound truncated to 3 chars ("abd" < "zzz") boolean shouldRead = new StrictMetricsEvaluator(SCHEMA, notStartsWith("required", "zzz")).eval(STRING_FILE); assertThat(shouldRead).as("Should match: all values are below the prefix range").isTrue(); @@ -748,13 +743,10 @@ public void testNotStartsWithBoundsBelowPrefix() { @Test public void testNotStartsWithBoundsOverlapPrefix() { - // STRING_FILE: required column 3 has bounds ["abc", "abd"] - // prefix "ab" overlaps the bounds — some values could start with "ab" boolean shouldRead = new StrictMetricsEvaluator(SCHEMA, notStartsWith("required", "ab")).eval(STRING_FILE); assertThat(shouldRead).as("Should not match: bounds overlap the prefix range").isFalse(); - // prefix "abc" overlaps the lower bound of ["abc", "abd"] shouldRead = new StrictMetricsEvaluator(SCHEMA, notStartsWith("required", "abc")).eval(STRING_FILE); assertThat(shouldRead).as("Should not match: lower bound starts with the prefix").isFalse(); @@ -762,18 +754,14 @@ public void testNotStartsWithBoundsOverlapPrefix() { @Test public void testNotStartsWithWiderRange() { - // STRING_FILE_2: required column 3 has bounds ["aa", "dC"] - // prefix "e" is above the upper bound truncated to 1 char ("d" < "e") boolean shouldRead = new StrictMetricsEvaluator(SCHEMA, notStartsWith("required", "e")).eval(STRING_FILE_2); assertThat(shouldRead).as("Should match: all values are below the prefix").isTrue(); - // prefix "a" overlaps the bounds — some values start with "a" shouldRead = new StrictMetricsEvaluator(SCHEMA, notStartsWith("required", "a")).eval(STRING_FILE_2); assertThat(shouldRead).as("Should not match: lower bound starts with the prefix").isFalse(); - // prefix "c" is within the range ["aa", "dC"] shouldRead = new StrictMetricsEvaluator(SCHEMA, notStartsWith("required", "c")).eval(STRING_FILE_2); assertThat(shouldRead).as("Should not match: prefix is within the bounds range").isFalse(); @@ -781,7 +769,6 @@ public void testNotStartsWithWiderRange() { @Test public void testNotStartsWithNoStats() { - // FILE has no string bounds for column 3 ("required") boolean shouldRead = new StrictMetricsEvaluator(SCHEMA, notStartsWith("required", "a")).eval(FILE); assertThat(shouldRead).as("Should not match: no bounds available for column").isFalse(); @@ -789,15 +776,27 @@ public void testNotStartsWithNoStats() { @Test public void testNotStartsWithSomeNullsBoundsOutsidePrefix() { - // FILE_2: column 5 (some_nulls) has 10 nulls, bounds ["bbb", "eee"] - // prefix "zzz" is above the upper bound boolean shouldRead = new StrictMetricsEvaluator(SCHEMA, notStartsWith("some_nulls", "zzz")).eval(FILE_2); assertThat(shouldRead).as("Should match: all values are below the prefix").isTrue(); - // prefix "aaa" is below the lower bound shouldRead = new StrictMetricsEvaluator(SCHEMA, notStartsWith("some_nulls", "aaa")).eval(FILE_2); assertThat(shouldRead).as("Should match: all values are above the prefix").isTrue(); } + + @Test + public void testNotStartsWithPrefixLongerThanBounds() { + boolean shouldRead = + new StrictMetricsEvaluator(SCHEMA, notStartsWith("required", "aaaaaaa")).eval(STRING_FILE); + assertThat(shouldRead).as("Should match: all values are above the long prefix").isTrue(); + + shouldRead = + new StrictMetricsEvaluator(SCHEMA, notStartsWith("required", "zzzzzzz")).eval(STRING_FILE); + assertThat(shouldRead).as("Should match: all values are below the long prefix").isTrue(); + + shouldRead = + new StrictMetricsEvaluator(SCHEMA, notStartsWith("required", "abcdef")).eval(STRING_FILE); + assertThat(shouldRead).as("Should not match: prefix overlaps with bound range").isFalse(); + } } From 7ca819c35033082c20df7738116366439c7d1e45 Mon Sep 17 00:00:00 2001 From: Bharath Krishna Date: Mon, 6 Apr 2026 18:05:39 -0700 Subject: [PATCH 3/3] API: Add nested column test for notStartsWith in StrictMetricsEvaluator --- .../expressions/TestStrictMetricsEvaluator.java | 12 ++++++++++-- 1 file changed, 10 insertions(+), 2 deletions(-) diff --git a/api/src/test/java/org/apache/iceberg/expressions/TestStrictMetricsEvaluator.java b/api/src/test/java/org/apache/iceberg/expressions/TestStrictMetricsEvaluator.java index 135fd712764d..d5ecbeb65c44 100644 --- a/api/src/test/java/org/apache/iceberg/expressions/TestStrictMetricsEvaluator.java +++ b/api/src/test/java/org/apache/iceberg/expressions/TestStrictMetricsEvaluator.java @@ -73,8 +73,8 @@ public class TestStrictMetricsEvaluator { "struct", Types.StructType.of( Types.NestedField.optional(16, "nested_col_no_stats", Types.IntegerType.get()), - Types.NestedField.optional( - 17, "nested_col_with_stats", Types.IntegerType.get())))); + Types.NestedField.optional(17, "nested_col_with_stats", Types.IntegerType.get()), + Types.NestedField.optional(18, "nested_string_col", Types.StringType.get())))); private static final int INT_MIN_VALUE = 30; private static final int INT_MAX_VALUE = 79; @@ -799,4 +799,12 @@ public void testNotStartsWithPrefixLongerThanBounds() { new StrictMetricsEvaluator(SCHEMA, notStartsWith("required", "abcdef")).eval(STRING_FILE); assertThat(shouldRead).as("Should not match: prefix overlaps with bound range").isFalse(); } + + @Test + public void testNotStartsWithNestedColumn() { + boolean shouldRead = + new StrictMetricsEvaluator(SCHEMA, notStartsWith("struct.nested_string_col", "a")) + .eval(FILE); + assertThat(shouldRead).as("notStartsWith nested column should not match").isFalse(); + } }