diff --git a/api/src/main/java/org/apache/iceberg/expressions/StrictMetricsEvaluator.java b/api/src/main/java/org/apache/iceberg/expressions/StrictMetricsEvaluator.java
index c225f21da8a8..5d981e7ed139 100644
--- a/api/src/main/java/org/apache/iceberg/expressions/StrictMetricsEvaluator.java
+++ b/api/src/main/java/org/apache/iceberg/expressions/StrictMetricsEvaluator.java
@@ -22,6 +22,7 @@
 
 import java.nio.ByteBuffer;
 import java.util.Collection;
+import java.util.Comparator;
 import java.util.Map;
 import java.util.Set;
 import java.util.stream.Collectors;
@@ -29,6 +30,7 @@
 import org.apache.iceberg.DataFile;
 import org.apache.iceberg.Schema;
 import org.apache.iceberg.expressions.ExpressionVisitors.BoundExpressionVisitor;
+import org.apache.iceberg.types.Comparators;
 import org.apache.iceberg.types.Conversions;
 import org.apache.iceberg.types.Types.StructType;
 import org.apache.iceberg.util.NaNUtil;
@@ -467,8 +469,41 @@ public Boolean startsWith(BoundReference ref, Literal lit) {
 
     @Override
     public Boolean notStartsWith(BoundReference ref, Literal lit) {
-      // TODO: Handle cases that definitely cannot match, such as notStartsWith("x") when the bounds
-      // are ["a", "b"].
+      // Strict evaluation: ROWS_MUST_MATCH is returned only when the file metrics prove that no
+      // value can start with the prefix; every other case falls through to ROWS_MIGHT_NOT_MATCH.
+      int id = ref.fieldId();
+      if (isNestedColumn(id)) {
+        return ROWS_MIGHT_NOT_MATCH;
+      }
+
+      // a column holding only nulls matches: a null value does not start with any prefix
+      if (containsNullsOnly(id)) {
+        return ROWS_MUST_MATCH;
+      }
+
+      String prefix = (String) lit.value();
+      Comparator<CharSequence> comparator = Comparators.charSequences();
+
+      if (lowerBounds != null && lowerBounds.containsKey(id)) {
+        CharSequence lower = Conversions.fromByteBuffer(ref.type(), lowerBounds.get(id));
+        // compare only the leading characters of the lower bound, capped at the prefix length;
+        // if they sort after the prefix, every value sorts after any string with that prefix
+        int length = Math.min(prefix.length(), lower.length());
+        if (comparator.compare(lower.subSequence(0, length), prefix) > 0) {
+          return ROWS_MUST_MATCH;
+        }
+      }
+
+      if (upperBounds != null && upperBounds.containsKey(id)) {
+        CharSequence upper = Conversions.fromByteBuffer(ref.type(), upperBounds.get(id));
+        // compare only the leading characters of the upper bound, capped at the prefix length;
+        // if they sort before the prefix, every value sorts before any string with that prefix
+        int length = Math.min(prefix.length(), upper.length());
+        if (comparator.compare(upper.subSequence(0, length), prefix) < 0) {
+          return ROWS_MUST_MATCH;
+        }
+      }
+
       return ROWS_MIGHT_NOT_MATCH;
     }
 
diff --git a/api/src/test/java/org/apache/iceberg/expressions/TestStrictMetricsEvaluator.java b/api/src/test/java/org/apache/iceberg/expressions/TestStrictMetricsEvaluator.java
index f34cd730df77..135fd712764d 100644
--- a/api/src/test/java/org/apache/iceberg/expressions/TestStrictMetricsEvaluator.java
+++ b/api/src/test/java/org/apache/iceberg/expressions/TestStrictMetricsEvaluator.java
@@ -32,6 +32,7 @@
 import static org.apache.iceberg.expressions.Expressions.notIn;
 import static org.apache.iceberg.expressions.Expressions.notNaN;
 import static org.apache.iceberg.expressions.Expressions.notNull;
+import static org.apache.iceberg.expressions.Expressions.notStartsWith;
 import static org.apache.iceberg.expressions.Expressions.or;
 import static org.apache.iceberg.types.Conversions.toByteBuffer;
 import static org.apache.iceberg.types.Types.NestedField.optional;
@@ -172,6 +173,40 @@ public class
TestStrictMetricsEvaluator { // upper bounds ImmutableMap.of(5, toByteBuffer(StringType.get(), "bbb"))); + // String-focused file: required column 3 has no nulls and string bounds ["abc", "abd"] + private static final DataFile STRING_FILE = + new TestDataFile( + "string_file.avro", + Row.of(), + 50, + // any value counts, including nulls + ImmutableMap.of(3, 50L), + // null value counts + ImmutableMap.of(), + // nan value counts + null, + // lower bounds + ImmutableMap.of(3, toByteBuffer(StringType.get(), "abc")), + // upper bounds + ImmutableMap.of(3, toByteBuffer(StringType.get(), "abd"))); + + // String file with wider range: required column 3 has no nulls and bounds ["aa", "dC"] + private static final DataFile STRING_FILE_2 = + new TestDataFile( + "string_file_2.avro", + Row.of(), + 50, + // any value counts, including nulls + ImmutableMap.of(3, 50L), + // null value counts + ImmutableMap.of(), + // nan value counts + null, + // lower bounds + ImmutableMap.of(3, toByteBuffer(StringType.get(), "aa")), + // upper bounds + ImmutableMap.of(3, toByteBuffer(StringType.get(), "dC"))); + @Test public void testAllNulls() { boolean shouldRead = new StrictMetricsEvaluator(SCHEMA, notNull("all_nulls")).eval(FILE); @@ -684,4 +719,84 @@ SCHEMA, lessThanOrEqual("struct.nested_col_with_stats", INT_MAX_VALUE)) new StrictMetricsEvaluator(SCHEMA, notNull("struct.nested_col_with_stats")).eval(FILE); assertThat(shouldRead).as("notNull nested column should not match").isFalse(); } + + @Test + public void testNotStartsWithAllNulls() { + boolean shouldRead = + new StrictMetricsEvaluator(SCHEMA, notStartsWith("all_nulls", "a")).eval(FILE); + assertThat(shouldRead).as("Should match: all null values satisfy notStartsWith").isTrue(); + } + + @Test + public void testNotStartsWithBoundsAbovePrefix() { + boolean shouldRead = + new StrictMetricsEvaluator(SCHEMA, notStartsWith("required", "aaa")).eval(STRING_FILE); + assertThat(shouldRead).as("Should match: all values are above the prefix 
range").isTrue(); + } + + @Test + public void testNotStartsWithBoundsBelowPrefix() { + boolean shouldRead = + new StrictMetricsEvaluator(SCHEMA, notStartsWith("required", "zzz")).eval(STRING_FILE); + assertThat(shouldRead).as("Should match: all values are below the prefix range").isTrue(); + } + + @Test + public void testNotStartsWithBoundsOverlapPrefix() { + boolean shouldRead = + new StrictMetricsEvaluator(SCHEMA, notStartsWith("required", "ab")).eval(STRING_FILE); + assertThat(shouldRead).as("Should not match: bounds overlap the prefix range").isFalse(); + + shouldRead = + new StrictMetricsEvaluator(SCHEMA, notStartsWith("required", "abc")).eval(STRING_FILE); + assertThat(shouldRead).as("Should not match: lower bound starts with the prefix").isFalse(); + } + + @Test + public void testNotStartsWithWiderRange() { + boolean shouldRead = + new StrictMetricsEvaluator(SCHEMA, notStartsWith("required", "e")).eval(STRING_FILE_2); + assertThat(shouldRead).as("Should match: all values are below the prefix").isTrue(); + + shouldRead = + new StrictMetricsEvaluator(SCHEMA, notStartsWith("required", "a")).eval(STRING_FILE_2); + assertThat(shouldRead).as("Should not match: lower bound starts with the prefix").isFalse(); + + shouldRead = + new StrictMetricsEvaluator(SCHEMA, notStartsWith("required", "c")).eval(STRING_FILE_2); + assertThat(shouldRead).as("Should not match: prefix is within the bounds range").isFalse(); + } + + @Test + public void testNotStartsWithNoStats() { + boolean shouldRead = + new StrictMetricsEvaluator(SCHEMA, notStartsWith("required", "a")).eval(FILE); + assertThat(shouldRead).as("Should not match: no bounds available for column").isFalse(); + } + + @Test + public void testNotStartsWithSomeNullsBoundsOutsidePrefix() { + boolean shouldRead = + new StrictMetricsEvaluator(SCHEMA, notStartsWith("some_nulls", "zzz")).eval(FILE_2); + assertThat(shouldRead).as("Should match: all values are below the prefix").isTrue(); + + shouldRead = + new 
StrictMetricsEvaluator(SCHEMA, notStartsWith("some_nulls", "aaa")).eval(FILE_2); + assertThat(shouldRead).as("Should match: all values are above the prefix").isTrue(); + } + + @Test + public void testNotStartsWithPrefixLongerThanBounds() { + boolean shouldRead = + new StrictMetricsEvaluator(SCHEMA, notStartsWith("required", "aaaaaaa")).eval(STRING_FILE); + assertThat(shouldRead).as("Should match: all values are above the long prefix").isTrue(); + + shouldRead = + new StrictMetricsEvaluator(SCHEMA, notStartsWith("required", "zzzzzzz")).eval(STRING_FILE); + assertThat(shouldRead).as("Should match: all values are below the long prefix").isTrue(); + + shouldRead = + new StrictMetricsEvaluator(SCHEMA, notStartsWith("required", "abcdef")).eval(STRING_FILE); + assertThat(shouldRead).as("Should not match: prefix overlaps with bound range").isFalse(); + } }