Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Original file line number Diff line number Diff line change
Expand Up @@ -22,13 +22,15 @@

import java.nio.ByteBuffer;
import java.util.Collection;
import java.util.Comparator;
import java.util.Map;
import java.util.Set;
import java.util.stream.Collectors;
import org.apache.iceberg.ContentFile;
import org.apache.iceberg.DataFile;
import org.apache.iceberg.Schema;
import org.apache.iceberg.expressions.ExpressionVisitors.BoundExpressionVisitor;
import org.apache.iceberg.types.Comparators;
import org.apache.iceberg.types.Conversions;
import org.apache.iceberg.types.Types.StructType;
import org.apache.iceberg.util.NaNUtil;
Expand Down Expand Up @@ -467,8 +469,36 @@ public <T> Boolean startsWith(BoundReference<T> ref, Literal<T> lit) {

/**
 * Decides whether every row of the file is guaranteed to satisfy {@code notStartsWith(prefix)}.
 *
 * <p>Nested columns are never proven and yield {@code ROWS_MIGHT_NOT_MATCH}. A column that
 * contains only nulls yields {@code ROWS_MUST_MATCH}. Otherwise the column bounds, truncated to
 * at most the prefix length, are compared with the prefix: when the truncated lower bound sorts
 * after the prefix, or the truncated upper bound sorts before it, no value between the bounds
 * can start with the prefix.
 *
 * @param ref bound reference to the column under test
 * @param lit literal holding the prefix
 * @return {@code ROWS_MUST_MATCH} when the metrics prove that no value starts with the prefix,
 *     {@code ROWS_MIGHT_NOT_MATCH} otherwise
 */
@Override
public <T> Boolean notStartsWith(BoundReference<T> ref, Literal<T> lit) {
  int id = ref.fieldId();
  if (isNestedColumn(id)) {
    return ROWS_MIGHT_NOT_MATCH;
  }

  if (containsNullsOnly(id)) {
    return ROWS_MUST_MATCH;
  }

  // toString() instead of a String cast, so that a literal backed by some other CharSequence
  // implementation does not fail with a ClassCastException
  String prefix = lit.value().toString();
  Comparator<CharSequence> comparator = Comparators.charSequences();

  if (lowerBounds != null && lowerBounds.containsKey(id)) {
    CharSequence lower = Conversions.fromByteBuffer(ref.type(), lowerBounds.get(id));
    // truncate lower bound so that its length is not greater than the length of prefix
    int length = Math.min(prefix.length(), lower.length());
    // a truncated lower bound that sorts after the prefix puts every value past the prefix range
    if (comparator.compare(lower.subSequence(0, length), prefix) > 0) {
      return ROWS_MUST_MATCH;
    }
  }

  if (upperBounds != null && upperBounds.containsKey(id)) {
    CharSequence upper = Conversions.fromByteBuffer(ref.type(), upperBounds.get(id));
    // truncate upper bound so that its length is not greater than the length of prefix
    int length = Math.min(prefix.length(), upper.length());
    // a truncated upper bound that sorts before the prefix keeps every value below the prefix
    if (comparator.compare(upper.subSequence(0, length), prefix) < 0) {
      return ROWS_MUST_MATCH;
    }
  }

  // bounds are missing or overlap the prefix range, so nothing can be proven
  return ROWS_MIGHT_NOT_MATCH;
}

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -32,6 +32,7 @@
import static org.apache.iceberg.expressions.Expressions.notIn;
import static org.apache.iceberg.expressions.Expressions.notNaN;
import static org.apache.iceberg.expressions.Expressions.notNull;
import static org.apache.iceberg.expressions.Expressions.notStartsWith;
import static org.apache.iceberg.expressions.Expressions.or;
import static org.apache.iceberg.types.Conversions.toByteBuffer;
import static org.apache.iceberg.types.Types.NestedField.optional;
Expand Down Expand Up @@ -172,6 +173,40 @@ public class TestStrictMetricsEvaluator {
// upper bounds
ImmutableMap.of(5, toByteBuffer(StringType.get(), "bbb")));

// String-focused fixture with a narrow range: column 3 (referenced as "required" by the tests
// that use this file) has a value count of 50, an empty null-count map, and string bounds
// ["abc", "abd"].
private static final DataFile STRING_FILE =
new TestDataFile(
"string_file.avro",
Row.of(),
// NOTE(review): presumably the record count; it equals the value count below
50,
// any value counts, including nulls
ImmutableMap.of(3, 50L),
// null value counts (none recorded for any column)
ImmutableMap.of(),
// nan value counts (not provided)
null,
// lower bounds
ImmutableMap.of(3, toByteBuffer(StringType.get(), "abc")),
// upper bounds
ImmutableMap.of(3, toByteBuffer(StringType.get(), "abd")));

// String fixture with a wider range: column 3 (referenced as "required" by the tests that use
// this file) has a value count of 50, an empty null-count map, and string bounds ["aa", "dC"].
private static final DataFile STRING_FILE_2 =
new TestDataFile(
"string_file_2.avro",
Row.of(),
// NOTE(review): presumably the record count; it equals the value count below
50,
// any value counts, including nulls
ImmutableMap.of(3, 50L),
// null value counts (none recorded for any column)
ImmutableMap.of(),
// nan value counts (not provided)
null,
// lower bounds
ImmutableMap.of(3, toByteBuffer(StringType.get(), "aa")),
// upper bounds
ImmutableMap.of(3, toByteBuffer(StringType.get(), "dC")));

@Test
public void testAllNulls() {
boolean shouldRead = new StrictMetricsEvaluator(SCHEMA, notNull("all_nulls")).eval(FILE);
Expand Down Expand Up @@ -684,4 +719,84 @@ SCHEMA, lessThanOrEqual("struct.nested_col_with_stats", INT_MAX_VALUE))
new StrictMetricsEvaluator(SCHEMA, notNull("struct.nested_col_with_stats")).eval(FILE);
assertThat(shouldRead).as("notNull nested column should not match").isFalse();
}

// notStartsWith on the "all_nulls" column of FILE is expected to hold for every row.
@Test
public void testNotStartsWithAllNulls() {
  StrictMetricsEvaluator evaluator =
      new StrictMetricsEvaluator(SCHEMA, notStartsWith("all_nulls", "a"));
  boolean allRowsMatch = evaluator.eval(FILE);

  assertThat(allRowsMatch).as("Should match: all null values satisfy notStartsWith").isTrue();
}

// Prefix "aaa" sorts before the ["abc", "abd"] bounds of STRING_FILE.
@Test
public void testNotStartsWithBoundsAbovePrefix() {
  StrictMetricsEvaluator evaluator =
      new StrictMetricsEvaluator(SCHEMA, notStartsWith("required", "aaa"));
  boolean allRowsMatch = evaluator.eval(STRING_FILE);

  assertThat(allRowsMatch).as("Should match: all values are above the prefix range").isTrue();
}

// Prefix "zzz" sorts after the ["abc", "abd"] bounds of STRING_FILE.
@Test
public void testNotStartsWithBoundsBelowPrefix() {
  StrictMetricsEvaluator evaluator =
      new StrictMetricsEvaluator(SCHEMA, notStartsWith("required", "zzz"));
  boolean allRowsMatch = evaluator.eval(STRING_FILE);

  assertThat(allRowsMatch).as("Should match: all values are below the prefix range").isTrue();
}

// Prefixes that the ["abc", "abd"] bounds of STRING_FILE could start with must not be proven.
@Test
public void testNotStartsWithBoundsOverlapPrefix() {
  // prefix shared by both bounds
  StrictMetricsEvaluator sharedPrefix =
      new StrictMetricsEvaluator(SCHEMA, notStartsWith("required", "ab"));
  boolean sharedPrefixMatches = sharedPrefix.eval(STRING_FILE);
  assertThat(sharedPrefixMatches)
      .as("Should not match: bounds overlap the prefix range")
      .isFalse();

  // prefix equal to the lower bound
  StrictMetricsEvaluator lowerBoundPrefix =
      new StrictMetricsEvaluator(SCHEMA, notStartsWith("required", "abc"));
  boolean lowerBoundPrefixMatches = lowerBoundPrefix.eval(STRING_FILE);
  assertThat(lowerBoundPrefixMatches)
      .as("Should not match: lower bound starts with the prefix")
      .isFalse();
}

// Exercises the wider ["aa", "dC"] bounds of STRING_FILE_2 with prefixes past, at, and inside
// the range.
@Test
public void testNotStartsWithWiderRange() {
  // prefix sorts after the upper bound
  StrictMetricsEvaluator pastUpper =
      new StrictMetricsEvaluator(SCHEMA, notStartsWith("required", "e"));
  boolean pastUpperMatches = pastUpper.eval(STRING_FILE_2);
  assertThat(pastUpperMatches).as("Should match: all values are below the prefix").isTrue();

  // prefix is the first character of the lower bound
  StrictMetricsEvaluator atLower =
      new StrictMetricsEvaluator(SCHEMA, notStartsWith("required", "a"));
  boolean atLowerMatches = atLower.eval(STRING_FILE_2);
  assertThat(atLowerMatches)
      .as("Should not match: lower bound starts with the prefix")
      .isFalse();

  // prefix falls between the two bounds
  StrictMetricsEvaluator insideRange =
      new StrictMetricsEvaluator(SCHEMA, notStartsWith("required", "c"));
  boolean insideRangeMatches = insideRange.eval(STRING_FILE_2);
  assertThat(insideRangeMatches)
      .as("Should not match: prefix is within the bounds range")
      .isFalse();
}

// Evaluating the "required" column against FILE is expected to leave the predicate unproven.
@Test
public void testNotStartsWithNoStats() {
  StrictMetricsEvaluator evaluator =
      new StrictMetricsEvaluator(SCHEMA, notStartsWith("required", "a"));
  boolean allRowsMatch = evaluator.eval(FILE);

  assertThat(allRowsMatch).as("Should not match: no bounds available for column").isFalse();
}

// The "some_nulls" column of FILE_2 is checked with prefixes on either side of its bounds.
@Test
public void testNotStartsWithSomeNullsBoundsOutsidePrefix() {
  // prefix sorts after the bounds
  StrictMetricsEvaluator highPrefix =
      new StrictMetricsEvaluator(SCHEMA, notStartsWith("some_nulls", "zzz"));
  boolean highPrefixMatches = highPrefix.eval(FILE_2);
  assertThat(highPrefixMatches).as("Should match: all values are below the prefix").isTrue();

  // prefix sorts before the bounds
  StrictMetricsEvaluator lowPrefix =
      new StrictMetricsEvaluator(SCHEMA, notStartsWith("some_nulls", "aaa"));
  boolean lowPrefixMatches = lowPrefix.eval(FILE_2);
  assertThat(lowPrefixMatches).as("Should match: all values are above the prefix").isTrue();
}

// Prefixes longer than the three-character bounds ["abc", "abd"] of STRING_FILE.
@Test
public void testNotStartsWithPrefixLongerThanBounds() {
  // long prefix that sorts before the lower bound
  StrictMetricsEvaluator belowBounds =
      new StrictMetricsEvaluator(SCHEMA, notStartsWith("required", "aaaaaaa"));
  boolean belowBoundsMatches = belowBounds.eval(STRING_FILE);
  assertThat(belowBoundsMatches)
      .as("Should match: all values are above the long prefix")
      .isTrue();

  // long prefix that sorts after the upper bound
  StrictMetricsEvaluator aboveBounds =
      new StrictMetricsEvaluator(SCHEMA, notStartsWith("required", "zzzzzzz"));
  boolean aboveBoundsMatches = aboveBounds.eval(STRING_FILE);
  assertThat(aboveBoundsMatches)
      .as("Should match: all values are below the long prefix")
      .isTrue();

  // long prefix that extends the lower bound
  StrictMetricsEvaluator extendsLower =
      new StrictMetricsEvaluator(SCHEMA, notStartsWith("required", "abcdef"));
  boolean extendsLowerMatches = extendsLower.eval(STRING_FILE);
  assertThat(extendsLowerMatches)
      .as("Should not match: prefix overlaps with bound range")
      .isFalse();
}
}
Loading