From 582d9cb6ac32253e27ec05b447058f4b6baf033f Mon Sep 17 00:00:00 2001 From: SteNicholas Date: Tue, 10 Feb 2026 11:26:47 +0800 Subject: [PATCH] feat: support LeafFunction of StartsWith, EndsWith, Contains, Like --- cmake_modules/ThirdpartyToolchain.cmake | 2 +- include/paimon/predicate/function.h | 6 +- include/paimon/predicate/predicate_builder.h | 32 ++ src/paimon/CMakeLists.txt | 4 + src/paimon/common/predicate/contains.cpp | 24 + src/paimon/common/predicate/contains.h | 46 ++ src/paimon/common/predicate/ends_with.cpp | 26 + src/paimon/common/predicate/ends_with.h | 46 ++ src/paimon/common/predicate/equal.cpp | 4 +- src/paimon/common/predicate/equal.h | 2 +- .../common/predicate/greater_or_equal.cpp | 4 +- .../common/predicate/greater_or_equal.h | 2 +- src/paimon/common/predicate/greater_than.cpp | 4 +- src/paimon/common/predicate/greater_than.h | 2 +- src/paimon/common/predicate/in.cpp | 4 +- src/paimon/common/predicate/in.h | 2 +- src/paimon/common/predicate/is_not_null.cpp | 4 +- src/paimon/common/predicate/is_not_null.h | 2 +- src/paimon/common/predicate/is_null.cpp | 4 +- src/paimon/common/predicate/is_null.h | 2 +- src/paimon/common/predicate/leaf_function.h | 2 +- .../common/predicate/leaf_predicate.cpp | 7 +- src/paimon/common/predicate/less_or_equal.cpp | 4 +- src/paimon/common/predicate/less_or_equal.h | 2 +- src/paimon/common/predicate/less_than.cpp | 4 +- src/paimon/common/predicate/less_than.h | 2 +- src/paimon/common/predicate/like.cpp | 102 ++++ src/paimon/common/predicate/like.h | 46 ++ src/paimon/common/predicate/not_equal.cpp | 4 +- src/paimon/common/predicate/not_equal.h | 2 +- src/paimon/common/predicate/not_in.cpp | 4 +- src/paimon/common/predicate/not_in.h | 2 +- .../common/predicate/predicate_builder.cpp | 58 ++- .../common/predicate/predicate_test.cpp | 467 ++++++++++++++---- src/paimon/common/predicate/predicate_utils.h | 17 +- src/paimon/common/predicate/starts_with.cpp | 38 ++ src/paimon/common/predicate/starts_with.h | 51 ++ .../predicate/string_leaf_binary_function.h | 42 ++ src/paimon/format/orc/predicate_converter.cpp | 8 + .../format/orc/predicate_converter_test.cpp | 32 ++ .../format/parquet/predicate_converter.cpp | 29 ++ .../parquet/predicate_converter_test.cpp | 37 ++ .../parquet/predicate_pushdown_test.cpp | 32 ++ 43 files changed, 1090 insertions(+), 124 deletions(-) create mode 100644 src/paimon/common/predicate/contains.cpp create mode 100644 src/paimon/common/predicate/contains.h create mode 100644 src/paimon/common/predicate/ends_with.cpp create mode 100644 src/paimon/common/predicate/ends_with.h create mode 100644 src/paimon/common/predicate/like.cpp create mode 100644 src/paimon/common/predicate/like.h create mode 100644 src/paimon/common/predicate/starts_with.cpp create mode 100644 src/paimon/common/predicate/starts_with.h create mode 100644 src/paimon/common/predicate/string_leaf_binary_function.h diff --git a/cmake_modules/ThirdpartyToolchain.cmake b/cmake_modules/ThirdpartyToolchain.cmake index 97526305..9c15b76d 100644 --- a/cmake_modules/ThirdpartyToolchain.cmake +++ b/cmake_modules/ThirdpartyToolchain.cmake @@ -1057,7 +1057,7 @@ macro(build_arrow) -DARROW_BUILD_BENCHMARKS=OFF -DARROW_BUILD_EXAMPLES=OFF -DARROW_JEMALLOC=OFF - -DARROW_WITH_RE2=OFF + -DARROW_WITH_RE2=ON -DARROW_WITH_UTF8PROC=OFF -DARROW_ORC=OFF -DARROW_SIMD_LEVEL=NONE diff --git a/include/paimon/predicate/function.h b/include/paimon/predicate/function.h index 15279b3b..19b76732 100644 --- a/include/paimon/predicate/function.h +++ b/include/paimon/predicate/function.h @@ -37,7 +37,11 @@ class PAIMON_EXPORT Function { IN = 9, NOT_IN = 10, AND = 11, - OR = 12 + OR = 12, + STARTS_WITH = 13, + ENDS_WITH = 14, + CONTAINS = 15, + LIKE = 16 }; virtual ~Function() = default; virtual Type GetType() const = 0; diff --git a/include/paimon/predicate/predicate_builder.h b/include/paimon/predicate/predicate_builder.h index 20ecc82e..77e0ace1 100644 --- a/include/paimon/predicate/predicate_builder.h +++ b/include/paimon/predicate/predicate_builder.h @@ -133,5 +133,37 @@ class PAIMON_EXPORT PredicateBuilder { /// /// @param predicate A shared pointer to the predicate to be negated, which must not be nullptr. static Result> Not(const std::shared_ptr& predicate); + + /// Create a starts-with predicate (field like 'abc%' or field like 'abc_'). + /// + /// Tests whether the field value starts with the provided literal value. + static Result> StartsWith(int32_t field_index, + const std::string& field_name, + const FieldType& field_type, + const Literal& literal); + + /// Create an ends-with predicate (field like '%abc' or field like '_abc'). + /// + /// Tests whether the field value ends with the provided literal value. + static Result> EndsWith(int32_t field_index, + const std::string& field_name, + const FieldType& field_type, + const Literal& literal); + + /// Create a contains predicate (field like '%abc%'). + /// + /// Tests whether the field value contains the provided literal value. + static Result> Contains(int32_t field_index, + const std::string& field_name, + const FieldType& field_type, + const Literal& literal); + + /// Create a like predicate (field like literal). + /// + /// Tests whether the field value like the provided literal value. + static Result> Like(int32_t field_index, + const std::string& field_name, + const FieldType& field_type, + const Literal& literal); }; } // namespace paimon diff --git a/src/paimon/CMakeLists.txt b/src/paimon/CMakeLists.txt index 5c2d0796..1b9a9fe5 100644 --- a/src/paimon/CMakeLists.txt +++ b/src/paimon/CMakeLists.txt @@ -77,6 +77,8 @@ set(PAIMON_COMMON_SRCS common/options/time_duration.cpp common/predicate/and.cpp common/predicate/compound_predicate.cpp + common/predicate/contains.cpp + common/predicate/ends_with.cpp common/predicate/equal.cpp common/predicate/greater_or_equal.cpp common/predicate/greater_than.cpp @@ -86,6 +88,7 @@ set(PAIMON_COMMON_SRCS common/predicate/leaf_predicate.cpp common/predicate/less_or_equal.cpp common/predicate/less_than.cpp + common/predicate/like.cpp common/predicate/literal_converter.cpp common/predicate/literal.cpp common/predicate/not_equal.cpp @@ -93,6 +96,7 @@ set(PAIMON_COMMON_SRCS common/predicate/or.cpp common/predicate/predicate_builder.cpp common/predicate/predicate_utils.cpp + common/predicate/starts_with.cpp common/reader/batch_reader.cpp common/reader/concat_batch_reader.cpp common/reader/predicate_batch_reader.cpp diff --git a/src/paimon/common/predicate/contains.cpp b/src/paimon/common/predicate/contains.cpp new file mode 100644 index 00000000..535304b7 --- /dev/null +++ b/src/paimon/common/predicate/contains.cpp @@ -0,0 +1,24 @@ +/* + * Copyright 2026-present Alibaba Inc. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include "paimon/common/predicate/contains.h" + +namespace paimon { + +Result Contains::TestString(const std::string& field, const std::string& pattern) const { + return field.find(pattern) != std::string::npos; +} +} // namespace paimon diff --git a/src/paimon/common/predicate/contains.h b/src/paimon/common/predicate/contains.h new file mode 100644 index 00000000..81cb788a --- /dev/null +++ b/src/paimon/common/predicate/contains.h @@ -0,0 +1,46 @@ +/* + * Copyright 2026-present Alibaba Inc. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#pragma once + +#include + +#include "paimon/common/predicate/string_leaf_binary_function.h" +#include "paimon/result.h" + +namespace paimon { +/// A `StringLeafBinaryFunction` to eval filter like '%abc%'. +class Contains : public StringLeafBinaryFunction { + public: + static const Contains& Instance() { + static const Contains instance = Contains(); + return instance; + } + + Type GetType() const override { + return Type::CONTAINS; + } + + std::string ToString() const override { + return "Contains"; + } + + Result TestString(const std::string& field, const std::string& pattern) const override; + + private: + Contains() = default; +}; +} // namespace paimon diff --git a/src/paimon/common/predicate/ends_with.cpp b/src/paimon/common/predicate/ends_with.cpp new file mode 100644 index 00000000..864effb1 --- /dev/null +++ b/src/paimon/common/predicate/ends_with.cpp @@ -0,0 +1,26 @@ +/* + * Copyright 2026-present Alibaba Inc. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include "paimon/common/predicate/ends_with.h" + +#include "paimon/common/utils/string_utils.h" + +namespace paimon { + +Result EndsWith::TestString(const std::string& field, const std::string& pattern) const { + return StringUtils::EndsWith(field, pattern); +} +} // namespace paimon diff --git a/src/paimon/common/predicate/ends_with.h b/src/paimon/common/predicate/ends_with.h new file mode 100644 index 00000000..894f7e10 --- /dev/null +++ b/src/paimon/common/predicate/ends_with.h @@ -0,0 +1,46 @@ +/* + * Copyright 2026-present Alibaba Inc. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#pragma once + +#include + +#include "paimon/common/predicate/string_leaf_binary_function.h" +#include "paimon/result.h" + +namespace paimon { +/// A `StringLeafBinaryFunction` to eval filter like '%abc' or filter like '_abc'. +class EndsWith : public StringLeafBinaryFunction { + public: + static const EndsWith& Instance() { + static const EndsWith instance = EndsWith(); + return instance; + } + + Type GetType() const override { + return Type::ENDS_WITH; + } + + std::string ToString() const override { + return "EndsWith"; + } + + Result TestString(const std::string& field, const std::string& pattern) const override; + + private: + EndsWith() = default; +}; +} // namespace paimon diff --git a/src/paimon/common/predicate/equal.cpp b/src/paimon/common/predicate/equal.cpp index b92842c8..7c3dd9fb 100644 --- a/src/paimon/common/predicate/equal.cpp +++ b/src/paimon/common/predicate/equal.cpp @@ -21,8 +21,8 @@ namespace paimon { class LeafFunction; -const LeafFunction& Equal::Negate() const { - return NotEqual::Instance(); +const LeafFunction* Equal::Negate() const { + return &NotEqual::Instance(); } } // namespace paimon diff --git a/src/paimon/common/predicate/equal.h b/src/paimon/common/predicate/equal.h index 3d80cd7b..9f98fd8c 100644 --- a/src/paimon/common/predicate/equal.h +++ b/src/paimon/common/predicate/equal.h @@ -52,7 +52,7 @@ class Equal : public NullFalseLeafBinaryFunction { Type GetType() const override { return Type::EQUAL; } - const LeafFunction& Negate() const override; + const LeafFunction* Negate() const override; std::string ToString() const override { return "Equal"; } diff --git a/src/paimon/common/predicate/greater_or_equal.cpp b/src/paimon/common/predicate/greater_or_equal.cpp index f344cefb..2b0b0b4b 100644 --- a/src/paimon/common/predicate/greater_or_equal.cpp +++ b/src/paimon/common/predicate/greater_or_equal.cpp @@ -21,8 +21,8 @@ namespace paimon { class LeafFunction; -const LeafFunction& GreaterOrEqual::Negate() const { - return LessThan::Instance(); +const LeafFunction* GreaterOrEqual::Negate() const { + return &LessThan::Instance(); } } // namespace paimon diff --git a/src/paimon/common/predicate/greater_or_equal.h b/src/paimon/common/predicate/greater_or_equal.h index 30fc25bf..f0fdcf10 100644 --- a/src/paimon/common/predicate/greater_or_equal.h +++ b/src/paimon/common/predicate/greater_or_equal.h @@ -49,7 +49,7 @@ class GreaterOrEqual : public NullFalseLeafBinaryFunction { Type GetType() const override { return Type::GREATER_OR_EQUAL; } - const LeafFunction& Negate() const override; + const LeafFunction* Negate() const override; std::string ToString() const override { return "GreaterOrEqual"; } diff --git a/src/paimon/common/predicate/greater_than.cpp b/src/paimon/common/predicate/greater_than.cpp index b64b1281..13ca3353 100644 --- a/src/paimon/common/predicate/greater_than.cpp +++ b/src/paimon/common/predicate/greater_than.cpp @@ -26,7 +26,7 @@ const GreaterThan& GreaterThan::Instance() { return kInstance; } -const LeafFunction& GreaterThan::Negate() const { - return LessOrEqual::Instance(); +const LeafFunction* GreaterThan::Negate() const { + return &LessOrEqual::Instance(); } } // namespace paimon diff --git a/src/paimon/common/predicate/greater_than.h b/src/paimon/common/predicate/greater_than.h index 35c8e17e..2ca5ab5b 100644 --- a/src/paimon/common/predicate/greater_than.h +++ b/src/paimon/common/predicate/greater_than.h @@ -46,7 +46,7 @@ class GreaterThan : public NullFalseLeafBinaryFunction { Type GetType() const override { return Type::GREATER_THAN; } - const LeafFunction& Negate() const override; + const LeafFunction* Negate() const override; std::string ToString() const override { return "GreaterThan"; diff --git a/src/paimon/common/predicate/in.cpp b/src/paimon/common/predicate/in.cpp index 334ed32e..d59df696 100644 --- a/src/paimon/common/predicate/in.cpp +++ b/src/paimon/common/predicate/in.cpp @@ -21,8 +21,8 @@ namespace paimon { class LeafFunction; -const LeafFunction& In::Negate() const { - return NotIn::Instance(); +const LeafFunction* In::Negate() const { + return &NotIn::Instance(); } } // namespace paimon diff --git a/src/paimon/common/predicate/in.h b/src/paimon/common/predicate/in.h index 7208bb37..21b1a134 100644 --- a/src/paimon/common/predicate/in.h +++ b/src/paimon/common/predicate/in.h @@ -68,7 +68,7 @@ class In : public MultiLiteralsLeafFunction { return Type::IN; } - const LeafFunction& Negate() const override; + const LeafFunction* Negate() const override; std::string ToString() const override { return "In"; diff --git a/src/paimon/common/predicate/is_not_null.cpp b/src/paimon/common/predicate/is_not_null.cpp index 273c9ff4..de5a27a5 100644 --- a/src/paimon/common/predicate/is_not_null.cpp +++ b/src/paimon/common/predicate/is_not_null.cpp @@ -21,8 +21,8 @@ namespace paimon { class LeafFunction; -const LeafFunction& IsNotNull::Negate() const { - return IsNull::Instance(); +const LeafFunction* IsNotNull::Negate() const { + return &IsNull::Instance(); } } // namespace paimon diff --git a/src/paimon/common/predicate/is_not_null.h b/src/paimon/common/predicate/is_not_null.h index 0363983c..48669632 100644 --- a/src/paimon/common/predicate/is_not_null.h +++ b/src/paimon/common/predicate/is_not_null.h @@ -47,7 +47,7 @@ class IsNotNull : public LeafUnaryFunction { Type GetType() const override { return Type::IS_NOT_NULL; } - const LeafFunction& Negate() const override; + const LeafFunction* Negate() const override; std::string ToString() const override { return "IsNotNull"; } diff --git a/src/paimon/common/predicate/is_null.cpp b/src/paimon/common/predicate/is_null.cpp index 3cae9ffd..e9df8e12 100644 --- a/src/paimon/common/predicate/is_null.cpp +++ b/src/paimon/common/predicate/is_null.cpp @@ -21,8 +21,8 @@ namespace paimon { class LeafFunction; -const LeafFunction& IsNull::Negate() const { - return IsNotNull::Instance(); +const LeafFunction* IsNull::Negate() const { + return &IsNotNull::Instance(); } } // namespace paimon diff --git a/src/paimon/common/predicate/is_null.h b/src/paimon/common/predicate/is_null.h index 7e01935c..46c8bab6 100644 --- a/src/paimon/common/predicate/is_null.h +++ b/src/paimon/common/predicate/is_null.h @@ -47,7 +47,7 @@ class IsNull : public LeafUnaryFunction { Type GetType() const override { return Type::IS_NULL; } - const LeafFunction& Negate() const override; + const LeafFunction* Negate() const override; std::string ToString() const override { return "IsNull"; } diff --git a/src/paimon/common/predicate/leaf_function.h b/src/paimon/common/predicate/leaf_function.h index be556015..f29cd3d6 100644 --- a/src/paimon/common/predicate/leaf_function.h +++ b/src/paimon/common/predicate/leaf_function.h @@ -35,6 +35,6 @@ class LeafFunction : public Function { const std::optional& null_count, const std::vector& literals) const = 0; - virtual const LeafFunction& Negate() const = 0; + virtual const LeafFunction* Negate() const = 0; }; } // namespace paimon diff --git a/src/paimon/common/predicate/leaf_predicate.cpp b/src/paimon/common/predicate/leaf_predicate.cpp index dcd4ae17..df73cbd3 100644 --- a/src/paimon/common/predicate/leaf_predicate.cpp +++ b/src/paimon/common/predicate/leaf_predicate.cpp @@ -39,8 +39,11 @@ const Function& LeafPredicate::GetFunction() const { } std::shared_ptr LeafPredicate::Negate() const { - const auto& negate_func = leaf_function_.Negate(); - return std::make_shared(negate_func, field_index_, field_name_, field_type_, + const auto* negate_func = leaf_function_.Negate(); + if (!negate_func) { + return nullptr; + } + return std::make_shared(*negate_func, field_index_, field_name_, field_type_, literals_); } diff --git a/src/paimon/common/predicate/less_or_equal.cpp b/src/paimon/common/predicate/less_or_equal.cpp index 8690f30d..b9e031be 100644 --- a/src/paimon/common/predicate/less_or_equal.cpp +++ b/src/paimon/common/predicate/less_or_equal.cpp @@ -21,8 +21,8 @@ namespace paimon { class LeafFunction; -const LeafFunction& LessOrEqual::Negate() const { - return GreaterThan::Instance(); +const LeafFunction* LessOrEqual::Negate() const { + return &GreaterThan::Instance(); } } // namespace paimon diff --git a/src/paimon/common/predicate/less_or_equal.h b/src/paimon/common/predicate/less_or_equal.h index d2d22a6a..5b4de7df 100644 --- a/src/paimon/common/predicate/less_or_equal.h +++ b/src/paimon/common/predicate/less_or_equal.h @@ -48,7 +48,7 @@ class LessOrEqual : public NullFalseLeafBinaryFunction { Type GetType() const override { return Type::LESS_OR_EQUAL; } - const LeafFunction& Negate() const override; + const LeafFunction* Negate() const override; std::string ToString() const override { return "LessOrEqual"; diff --git a/src/paimon/common/predicate/less_than.cpp b/src/paimon/common/predicate/less_than.cpp index b6642775..34401964 100644 --- a/src/paimon/common/predicate/less_than.cpp +++ b/src/paimon/common/predicate/less_than.cpp @@ -25,8 +25,8 @@ const LessThan& LessThan::Instance() { static const LessThan kInstance{}; return kInstance; } -const LeafFunction& LessThan::Negate() const { - return GreaterOrEqual::Instance(); +const LeafFunction* LessThan::Negate() const { + return &GreaterOrEqual::Instance(); } } // namespace paimon diff --git a/src/paimon/common/predicate/less_than.h b/src/paimon/common/predicate/less_than.h index ddb5af30..87809905 100644 --- a/src/paimon/common/predicate/less_than.h +++ b/src/paimon/common/predicate/less_than.h @@ -45,7 +45,7 @@ class LessThan : public NullFalseLeafBinaryFunction { Type GetType() const override { return Type::LESS_THAN; } - const LeafFunction& Negate() const override; + const LeafFunction* Negate() const override; std::string ToString() const override { return "LessThan"; } diff --git a/src/paimon/common/predicate/like.cpp b/src/paimon/common/predicate/like.cpp new file mode 100644 index 00000000..a1a7eac9 --- /dev/null +++ b/src/paimon/common/predicate/like.cpp @@ -0,0 +1,102 @@ +/* + * Copyright 2026-present Alibaba Inc. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include "paimon/common/predicate/like.h" + +namespace paimon { + +Result Like::TestString(const std::string& field, const std::string& pattern) const { + if (pattern.empty()) { + return field.empty(); + } + std::vector pat; + std::vector is_wild; + for (size_t i = 0; i < pattern.size(); ++i) { + if (pattern[i] == '\\' && i + 1 < pattern.size()) { + pat.push_back(pattern[i + 1]); + is_wild.push_back(false); + ++i; + } else { + char c = pattern[i]; + pat.push_back(c); + is_wild.push_back(c == '_' || c == '%'); + } + } + std::vector simp_pat; + std::vector simp_wild; + for (size_t i = 0; i < pat.size(); ++i) { + if (is_wild[i] && pat[i] == '%' && !simp_pat.empty() && simp_wild.back() && + simp_pat.back() == '%') { + continue; + } + simp_pat.push_back(pat[i]); + simp_wild.push_back(is_wild[i]); + } + const size_t m = field.size(); + const size_t n = simp_pat.size(); + if (field.empty()) { + return n == 1 && simp_wild[0] && simp_pat[0] == '%'; + } + size_t min_len = 0; + for (size_t i = 0; i < n; ++i) { + if (!simp_wild[i]) { + min_len++; + } + } + if (min_len > m) { + return false; + } + constexpr size_t STACK_LIMIT = 128; + std::unique_ptr dp_storage; + bool* dp; + if (n <= STACK_LIMIT) { + dp = static_cast(alloca((n + 1) * sizeof(bool))); + } else { + dp_storage = std::make_unique(n + 1); + dp = dp_storage.get(); + } + std::fill_n(dp, n + 1, false); + dp[0] = true; + for (size_t j = 1; j <= n && simp_wild[j - 1] && simp_pat[j - 1] == '%'; ++j) { + dp[j] = true; + } + const char* f = field.data(); + for (size_t i = 0; i < m; ++i) { + const char sc = f[i]; + bool prev = dp[0]; + dp[0] = false; + bool has_match = false; + for (size_t j = 1; j <= n; ++j) { + const bool temp = dp[j]; + const char pc = simp_pat[j - 1]; + const bool wild = simp_wild[j - 1]; + if (wild && pc == '%') { + dp[j] = dp[j - 1] || dp[j]; + } else if (wild && pc == '_') { + dp[j] = prev; + } else { + dp[j] = (pc == sc) ? prev : false; + } + has_match |= dp[j]; + prev = temp; + } + if (!has_match) { + return false; + } + } + return dp[n]; +} +} // namespace paimon diff --git a/src/paimon/common/predicate/like.h b/src/paimon/common/predicate/like.h new file mode 100644 index 00000000..b5f63b3c --- /dev/null +++ b/src/paimon/common/predicate/like.h @@ -0,0 +1,46 @@ +/* + * Copyright 2026-present Alibaba Inc. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#pragma once + +#include + +#include "paimon/common/predicate/string_leaf_binary_function.h" +#include "paimon/result.h" + +namespace paimon { +/// A `StringLeafBinaryFunction` to eval filter like. +class Like : public StringLeafBinaryFunction { + public: + static const Like& Instance() { + static const Like instance = Like(); + return instance; + } + + Type GetType() const override { + return Type::LIKE; + } + + std::string ToString() const override { + return "Like"; + } + + Result TestString(const std::string& field, const std::string& pattern) const override; + + private: + Like() = default; +}; +} // namespace paimon diff --git a/src/paimon/common/predicate/not_equal.cpp b/src/paimon/common/predicate/not_equal.cpp index fe5e29c0..b4b3a441 100644 --- a/src/paimon/common/predicate/not_equal.cpp +++ b/src/paimon/common/predicate/not_equal.cpp @@ -21,8 +21,8 @@ namespace paimon { class LeafFunction; -const LeafFunction& NotEqual::Negate() const { - return Equal::Instance(); +const LeafFunction* NotEqual::Negate() const { + return &Equal::Instance(); } } // namespace paimon diff --git a/src/paimon/common/predicate/not_equal.h b/src/paimon/common/predicate/not_equal.h index 2f015c69..2572935f 100644 --- a/src/paimon/common/predicate/not_equal.h +++ b/src/paimon/common/predicate/not_equal.h @@ -50,7 +50,7 @@ class NotEqual : public NullFalseLeafBinaryFunction { Type GetType() const override { return Type::NOT_EQUAL; } - const LeafFunction& Negate() const override; + const LeafFunction* Negate() const override; std::string ToString() const override { return "NotEqual"; } diff --git a/src/paimon/common/predicate/not_in.cpp b/src/paimon/common/predicate/not_in.cpp index 2bb36c50..5d2217f7 100644 --- a/src/paimon/common/predicate/not_in.cpp +++ b/src/paimon/common/predicate/not_in.cpp @@ -21,8 +21,8 @@ namespace paimon { class LeafFunction; -const LeafFunction& NotIn::Negate() const { - return In::Instance(); +const LeafFunction* NotIn::Negate() const { + return &In::Instance(); } } // namespace paimon diff --git a/src/paimon/common/predicate/not_in.h b/src/paimon/common/predicate/not_in.h index 3fb87c4b..42ff5107 100644 --- a/src/paimon/common/predicate/not_in.h +++ b/src/paimon/common/predicate/not_in.h @@ -69,7 +69,7 @@ class NotIn : public MultiLiteralsLeafFunction { Type GetType() const override { return Type::NOT_IN; } - const LeafFunction& Negate() const override; + const LeafFunction* Negate() const override; std::string ToString() const override { return "NotIn"; } diff --git a/src/paimon/common/predicate/predicate_builder.cpp b/src/paimon/common/predicate/predicate_builder.cpp index 71e7d89a..a8b798f6 100644 --- a/src/paimon/common/predicate/predicate_builder.cpp +++ b/src/paimon/common/predicate/predicate_builder.cpp @@ -20,6 +20,8 @@ #include "paimon/common/predicate/and.h" #include "paimon/common/predicate/compound_predicate_impl.h" +#include "paimon/common/predicate/contains.h" +#include "paimon/common/predicate/ends_with.h" #include "paimon/common/predicate/equal.h" #include "paimon/common/predicate/greater_or_equal.h" #include "paimon/common/predicate/greater_than.h" @@ -29,9 +31,11 @@ #include "paimon/common/predicate/leaf_predicate_impl.h" #include "paimon/common/predicate/less_or_equal.h" #include "paimon/common/predicate/less_than.h" +#include "paimon/common/predicate/like.h" #include "paimon/common/predicate/not_equal.h" #include "paimon/common/predicate/not_in.h" #include "paimon/common/predicate/or.h" +#include "paimon/common/predicate/starts_with.h" #include "paimon/predicate/literal.h" #include "paimon/status.h" @@ -158,6 +162,58 @@ Result> PredicateBuilder::Not( if (!predicate) { return Status::Invalid("There must not be nullptr to construct a NOT predicate"); } - return predicate->Negate(); + auto negate_predicate = predicate->Negate(); + if (!negate_predicate) { + return Status::Invalid("Could not construct A NOT predicate from " + predicate->ToString()); + } + return negate_predicate; +} + +Result> PredicateBuilder::StartsWith(int32_t field_index, + const std::string& field_name, + const FieldType& field_type, + const Literal& literal) { + if (field_type != FieldType::STRING || literal.GetType() != FieldType::STRING) { + return Status::Invalid( + "There must be STRING type field and literal to construct a StartsWith predicate"); + } + return std::make_shared(StartsWith::Instance(), field_index, field_name, + field_type, std::vector({literal})); +} + +Result> PredicateBuilder::EndsWith(int32_t field_index, + const std::string& field_name, + const FieldType& field_type, + const Literal& literal) { + if (field_type != FieldType::STRING || literal.GetType() != FieldType::STRING) { + return Status::Invalid( + "There must be STRING type field and literal to construct an EndsWith predicate"); + } + return std::make_shared(EndsWith::Instance(), field_index, field_name, + field_type, std::vector({literal})); +} + +Result> PredicateBuilder::Contains(int32_t field_index, + const std::string& field_name, + const FieldType& field_type, + const Literal& literal) { + if (field_type != FieldType::STRING || literal.GetType() != FieldType::STRING) { + return Status::Invalid( + "There must be STRING type field and literal to construct a Contains predicate"); + } + return std::make_shared(Contains::Instance(), field_index, field_name, + field_type, std::vector({literal})); +} + +Result> PredicateBuilder::Like(int32_t field_index, + const std::string& field_name, + const FieldType& field_type, + const Literal& literal) { + if (field_type != FieldType::STRING || literal.GetType() != FieldType::STRING) { + return Status::Invalid( + "There must be STRING type field and literal to construct a Like predicate"); + } + return std::make_shared(Like::Instance(), field_index, field_name, + field_type, std::vector({literal})); } } // namespace paimon diff --git a/src/paimon/common/predicate/predicate_test.cpp b/src/paimon/common/predicate/predicate_test.cpp index e9e91c37..a63022f9 100644 --- a/src/paimon/common/predicate/predicate_test.cpp +++ b/src/paimon/common/predicate/predicate_test.cpp @@ -92,7 +92,7 @@ class PredicateTest : public ::testing::Test { return ret; } - BinaryRow CreateBinaryRow(const std::vector>& value) const { + BinaryRow CreateBigIntRow(const std::vector>& value) const { auto pool = GetDefaultPool(); BinaryRow row(/*arity=*/value.size()); BinaryRowWriter row_writer(&row, 0, pool.get()); @@ -106,6 +106,21 @@ class PredicateTest : public ::testing::Test { row_writer.Complete(); return row; } + + BinaryRow CreateStringRow(const std::vector>& value) const { + auto pool = GetDefaultPool(); + BinaryRow row(/*arity=*/value.size()); + BinaryRowWriter row_writer(&row, 0, pool.get()); + for (size_t i = 0; i < value.size(); ++i) { + if (value[i] == std::nullopt) { + row_writer.SetNullAt(i); + } else { + row_writer.WriteString(i, BinaryString::FromString(value[i].value(), pool.get())); + } + } + row_writer.Complete(); + return row; + } }; TEST_F(PredicateTest, TestInvalidFieldIndex) { @@ -129,7 +144,7 @@ TEST_F(PredicateTest, TestInvalidFieldIndex) { // with internal row auto arrow_schema = arrow::schema(arrow::FieldVector({arrow::field("f0", bigint_type)})); - ASSERT_NOK_WITH_MSG(predicate->Test(arrow_schema, CreateBinaryRow({4})), + ASSERT_NOK_WITH_MSG(predicate->Test(arrow_schema, CreateBigIntRow({4})), "field index 2 exceed field count 1 in row"); } @@ -162,9 +177,9 @@ TEST_F(PredicateTest, TestEqual) { FieldType::BIGINT, Literal(10l))); // with internal row auto arrow_schema = arrow::schema(arrow::FieldVector({arrow::field("f0", bigint_type)})); - ASSERT_FALSE(predicate->Test(arrow_schema, CreateBinaryRow({4})).value()); - ASSERT_TRUE(predicate->Test(arrow_schema, CreateBinaryRow({5})).value()); - ASSERT_FALSE(predicate->Test(arrow_schema, CreateBinaryRow({std::nullopt})).value()); + ASSERT_FALSE(predicate->Test(arrow_schema, CreateBigIntRow({4})).value()); + ASSERT_TRUE(predicate->Test(arrow_schema, CreateBigIntRow({5})).value()); + ASSERT_FALSE(predicate->Test(arrow_schema, CreateBigIntRow({std::nullopt})).value()); // with stats ASSERT_TRUE(StatsCheck(*predicate, 3ll, {FieldStats(0ll, 5ll, 0ll)})); @@ -192,8 +207,8 @@ TEST_F(PredicateTest, TestEqualNull) { // with internal row auto arrow_schema = arrow::schema(arrow::FieldVector({arrow::field("f0", bigint_type)})); - ASSERT_FALSE(predicate->Test(arrow_schema, CreateBinaryRow({4})).value()); - ASSERT_FALSE(predicate->Test(arrow_schema, CreateBinaryRow({std::nullopt})).value()); + ASSERT_FALSE(predicate->Test(arrow_schema, CreateBigIntRow({4})).value()); + ASSERT_FALSE(predicate->Test(arrow_schema, CreateBigIntRow({std::nullopt})).value()); // with stats ASSERT_FALSE(StatsCheck(*predicate, 3ll, {FieldStats(0ll, 5ll, 0ll)})); @@ -224,10 +239,10 @@ TEST_F(PredicateTest, TestNotEqual) { // with internal row auto arrow_schema = arrow::schema(arrow::FieldVector({arrow::field("f0", bigint_type)})); - ASSERT_TRUE(predicate->Test(arrow_schema, CreateBinaryRow({4})).value()); - ASSERT_FALSE(predicate->Test(arrow_schema, CreateBinaryRow({5})).value()); - ASSERT_FALSE(predicate->Test(arrow_schema, CreateBinaryRow({std::nullopt})).value()); - ASSERT_TRUE(predicate_negate->Test(arrow_schema, CreateBinaryRow({5})).value()); + ASSERT_TRUE(predicate->Test(arrow_schema, CreateBigIntRow({4})).value()); + ASSERT_FALSE(predicate->Test(arrow_schema, CreateBigIntRow({5})).value()); + ASSERT_FALSE(predicate->Test(arrow_schema, CreateBigIntRow({std::nullopt})).value()); + ASSERT_TRUE(predicate_negate->Test(arrow_schema, CreateBigIntRow({5})).value()); // with stats ASSERT_TRUE(StatsCheck(*predicate, 3ll, {FieldStats(0ll, 5ll, 0ll)})); @@ -255,8 +270,8 @@ TEST_F(PredicateTest, TestNotEqualNull) { ASSERT_EQ(is_valid, std::vector({0, 0})); // with internal row auto arrow_schema = arrow::schema(arrow::FieldVector({arrow::field("f0", bigint_type)})); - ASSERT_FALSE(predicate->Test(arrow_schema, CreateBinaryRow({4})).value()); - ASSERT_FALSE(predicate->Test(arrow_schema, CreateBinaryRow({std::nullopt})).value()); + ASSERT_FALSE(predicate->Test(arrow_schema, CreateBigIntRow({4})).value()); + ASSERT_FALSE(predicate->Test(arrow_schema, CreateBigIntRow({std::nullopt})).value()); // with stats ASSERT_FALSE(StatsCheck(*predicate, 3ll, {FieldStats(0ll, 5ll, 0ll)})); @@ -287,10 +302,10 @@ TEST_F(PredicateTest, TestGreater) { FieldType::BIGINT, Literal(5l))); // with internal row auto arrow_schema = arrow::schema(arrow::FieldVector({arrow::field("f0", bigint_type)})); - ASSERT_FALSE(predicate->Test(arrow_schema, CreateBinaryRow({4})).value()); - ASSERT_FALSE(predicate->Test(arrow_schema, CreateBinaryRow({5})).value()); - ASSERT_TRUE(predicate->Test(arrow_schema, CreateBinaryRow({6})).value()); - ASSERT_FALSE(predicate->Test(arrow_schema, CreateBinaryRow({std::nullopt})).value()); + ASSERT_FALSE(predicate->Test(arrow_schema, CreateBigIntRow({4})).value()); + ASSERT_FALSE(predicate->Test(arrow_schema, CreateBigIntRow({5})).value()); + ASSERT_TRUE(predicate->Test(arrow_schema, CreateBigIntRow({6})).value()); + ASSERT_FALSE(predicate->Test(arrow_schema, CreateBigIntRow({std::nullopt})).value()); // with stats ASSERT_FALSE(StatsCheck(*predicate, 3ll, {FieldStats(0ll, 4ll, 0ll)})); @@ -319,8 +334,8 @@ TEST_F(PredicateTest, TestGreaterNull) { // with internal row auto arrow_schema = arrow::schema(arrow::FieldVector({arrow::field("f0", bigint_type)})); - ASSERT_FALSE(predicate->Test(arrow_schema, CreateBinaryRow({4})).value()); - ASSERT_FALSE(predicate->Test(arrow_schema, CreateBinaryRow({std::nullopt})).value()); + ASSERT_FALSE(predicate->Test(arrow_schema, CreateBigIntRow({4})).value()); + ASSERT_FALSE(predicate->Test(arrow_schema, CreateBigIntRow({std::nullopt})).value()); // with stats ASSERT_FALSE(StatsCheck(*predicate, 3ll, {FieldStats(0ll, 4ll, 0ll)})); ASSERT_FALSE(StatsCheck(*predicate, 1ll, {FieldStats(std::nullopt, std::nullopt, 1ll)})); @@ -351,10 +366,10 @@ TEST_F(PredicateTest, TestGreaterOrEqual) { Literal(5l))); // with internal row auto arrow_schema = arrow::schema(arrow::FieldVector({arrow::field("f0", bigint_type)})); - ASSERT_FALSE(predicate->Test(arrow_schema, CreateBinaryRow({4})).value()); - ASSERT_TRUE(predicate->Test(arrow_schema, CreateBinaryRow({5})).value()); - ASSERT_TRUE(predicate->Test(arrow_schema, CreateBinaryRow({6})).value()); - ASSERT_FALSE(predicate->Test(arrow_schema, CreateBinaryRow({std::nullopt})).value()); + ASSERT_FALSE(predicate->Test(arrow_schema, CreateBigIntRow({4})).value()); + ASSERT_TRUE(predicate->Test(arrow_schema, CreateBigIntRow({5})).value()); + ASSERT_TRUE(predicate->Test(arrow_schema, CreateBigIntRow({6})).value()); + ASSERT_FALSE(predicate->Test(arrow_schema, CreateBigIntRow({std::nullopt})).value()); // with stats ASSERT_FALSE(StatsCheck(*predicate, 3ll, {FieldStats(0ll, 4ll, 0ll)})); @@ -383,8 +398,8 @@ TEST_F(PredicateTest, TestGreaterOrEqualNull) { // with internal row auto arrow_schema = arrow::schema(arrow::FieldVector({arrow::field("f0", bigint_type)})); - ASSERT_FALSE(predicate->Test(arrow_schema, CreateBinaryRow({4})).value()); - ASSERT_FALSE(predicate->Test(arrow_schema, CreateBinaryRow({std::nullopt})).value()); + ASSERT_FALSE(predicate->Test(arrow_schema, CreateBigIntRow({4})).value()); + ASSERT_FALSE(predicate->Test(arrow_schema, CreateBigIntRow({std::nullopt})).value()); // with stats ASSERT_FALSE(StatsCheck(*predicate, 3ll, {FieldStats(0ll, 4ll, 0ll)})); ASSERT_FALSE(StatsCheck(*predicate, 1ll, {FieldStats(std::nullopt, std::nullopt, 1ll)})); @@ -415,10 +430,10 @@ TEST_F(PredicateTest, TestLess) { // with internal row auto arrow_schema = arrow::schema(arrow::FieldVector({arrow::field("f0", bigint_type)})); - ASSERT_TRUE(predicate->Test(arrow_schema, CreateBinaryRow({4})).value()); - ASSERT_FALSE(predicate->Test(arrow_schema, CreateBinaryRow({5})).value()); - ASSERT_FALSE(predicate->Test(arrow_schema, CreateBinaryRow({6})).value()); - ASSERT_FALSE(predicate->Test(arrow_schema, CreateBinaryRow({std::nullopt})).value()); + ASSERT_TRUE(predicate->Test(arrow_schema, CreateBigIntRow({4})).value()); + ASSERT_FALSE(predicate->Test(arrow_schema, CreateBigIntRow({5})).value()); + ASSERT_FALSE(predicate->Test(arrow_schema, CreateBigIntRow({6})).value()); + ASSERT_FALSE(predicate->Test(arrow_schema, CreateBigIntRow({std::nullopt})).value()); // with stats ASSERT_FALSE(StatsCheck(*predicate, 3ll, {FieldStats(6ll, 7ll, 0ll)})); ASSERT_FALSE(StatsCheck(*predicate, 3ll, {FieldStats(5ll, 7ll, 0ll)})); @@ -445,8 +460,8 @@ TEST_F(PredicateTest, TestLessNull) { // with internal row auto arrow_schema = arrow::schema(arrow::FieldVector({arrow::field("f0", bigint_type)})); - ASSERT_FALSE(predicate->Test(arrow_schema, CreateBinaryRow({4})).value()); - ASSERT_FALSE(predicate->Test(arrow_schema, CreateBinaryRow({std::nullopt})).value()); + ASSERT_FALSE(predicate->Test(arrow_schema, CreateBigIntRow({4})).value()); + ASSERT_FALSE(predicate->Test(arrow_schema, CreateBigIntRow({std::nullopt})).value()); // with stats ASSERT_FALSE(StatsCheck(*predicate, 3ll, {FieldStats(6ll, 7ll, 0ll)})); ASSERT_FALSE(StatsCheck(*predicate, 1ll, {FieldStats(std::nullopt, std::nullopt, 1ll)})); @@ -477,10 +492,10 @@ TEST_F(PredicateTest, TestLessOrEqual) { // with internal row auto arrow_schema = arrow::schema(arrow::FieldVector({arrow::field("f0", bigint_type)})); - ASSERT_TRUE(predicate->Test(arrow_schema, CreateBinaryRow({4})).value()); - ASSERT_TRUE(predicate->Test(arrow_schema, CreateBinaryRow({5})).value()); - ASSERT_FALSE(predicate->Test(arrow_schema, CreateBinaryRow({6})).value()); - ASSERT_FALSE(predicate->Test(arrow_schema, CreateBinaryRow({std::nullopt})).value()); + ASSERT_TRUE(predicate->Test(arrow_schema, CreateBigIntRow({4})).value()); + ASSERT_TRUE(predicate->Test(arrow_schema, CreateBigIntRow({5})).value()); + ASSERT_FALSE(predicate->Test(arrow_schema, CreateBigIntRow({6})).value()); + ASSERT_FALSE(predicate->Test(arrow_schema, CreateBigIntRow({std::nullopt})).value()); // with stats ASSERT_FALSE(StatsCheck(*predicate, 3ll, {FieldStats(6ll, 7ll, 0ll)})); ASSERT_TRUE(StatsCheck(*predicate, 3ll, {FieldStats(5ll, 7ll, 0ll)})); @@ -507,8 +522,8 @@ TEST_F(PredicateTest, TestLessOrEqualNull) { // with internal row auto arrow_schema = arrow::schema(arrow::FieldVector({arrow::field("f0", bigint_type)})); - ASSERT_FALSE(predicate->Test(arrow_schema, CreateBinaryRow({4})).value()); - ASSERT_FALSE(predicate->Test(arrow_schema, CreateBinaryRow({std::nullopt})).value()); + ASSERT_FALSE(predicate->Test(arrow_schema, CreateBigIntRow({4})).value()); + ASSERT_FALSE(predicate->Test(arrow_schema, CreateBigIntRow({std::nullopt})).value()); // with stats ASSERT_FALSE(StatsCheck(*predicate, 3ll, {FieldStats(6ll, 7ll, 0ll)})); ASSERT_FALSE(StatsCheck(*predicate, 1ll, {FieldStats(std::nullopt, std::nullopt, 1ll)})); @@ -536,8 +551,8 @@ TEST_F(PredicateTest, TestIsNull) { // with internal row auto arrow_schema = arrow::schema(arrow::FieldVector({arrow::field("f0", bigint_type)})); - ASSERT_FALSE(predicate->Test(arrow_schema, CreateBinaryRow({4})).value()); - ASSERT_TRUE(predicate->Test(arrow_schema, CreateBinaryRow({std::nullopt})).value()); + ASSERT_FALSE(predicate->Test(arrow_schema, CreateBigIntRow({4})).value()); + ASSERT_TRUE(predicate->Test(arrow_schema, CreateBigIntRow({std::nullopt})).value()); // with stats ASSERT_FALSE(StatsCheck(*predicate, 3ll, {FieldStats(6ll, 7ll, 0ll)})); ASSERT_TRUE(StatsCheck(*predicate, 3ll, {FieldStats(5ll, 7ll, 1ll)})); @@ -565,8 +580,8 @@ TEST_F(PredicateTest, TestIsNotNull) { // with internal row auto arrow_schema = arrow::schema(arrow::FieldVector({arrow::field("f0", bigint_type)})); - ASSERT_TRUE(predicate->Test(arrow_schema, CreateBinaryRow({4})).value()); - ASSERT_FALSE(predicate->Test(arrow_schema, CreateBinaryRow({std::nullopt})).value()); + ASSERT_TRUE(predicate->Test(arrow_schema, CreateBigIntRow({4})).value()); + ASSERT_FALSE(predicate->Test(arrow_schema, CreateBigIntRow({std::nullopt})).value()); // with stats ASSERT_TRUE(StatsCheck(*predicate, 3ll, {FieldStats(6ll, 7ll, 0ll)})); ASSERT_TRUE(StatsCheck(*predicate, 3ll, {FieldStats(5ll, 7ll, 1ll)})); @@ -598,10 +613,10 @@ TEST_F(PredicateTest, TestIn) { // with internal row auto arrow_schema = arrow::schema(arrow::FieldVector({arrow::field("f0", bigint_type)})); - ASSERT_TRUE(predicate->Test(arrow_schema, CreateBinaryRow({1})).value()); - ASSERT_FALSE(predicate->Test(arrow_schema, CreateBinaryRow({2})).value()); - ASSERT_TRUE(predicate->Test(arrow_schema, CreateBinaryRow({3})).value()); - ASSERT_FALSE(predicate->Test(arrow_schema, CreateBinaryRow({std::nullopt})).value()); + ASSERT_TRUE(predicate->Test(arrow_schema, CreateBigIntRow({1})).value()); + ASSERT_FALSE(predicate->Test(arrow_schema, CreateBigIntRow({2})).value()); + ASSERT_TRUE(predicate->Test(arrow_schema, CreateBigIntRow({3})).value()); + ASSERT_FALSE(predicate->Test(arrow_schema, CreateBigIntRow({std::nullopt})).value()); // with stats ASSERT_TRUE(StatsCheck(*predicate, 3ll, {FieldStats(0ll, 5ll, 0ll)})); ASSERT_FALSE(StatsCheck(*predicate, 3ll, {FieldStats(6ll, 7ll, 0ll)})); @@ -630,10 +645,10 @@ TEST_F(PredicateTest, TestInNull) { // with internal row auto arrow_schema = arrow::schema(arrow::FieldVector({arrow::field("f0", bigint_type)})); - ASSERT_TRUE(predicate->Test(arrow_schema, CreateBinaryRow({1})).value()); - ASSERT_FALSE(predicate->Test(arrow_schema, CreateBinaryRow({2})).value()); - ASSERT_TRUE(predicate->Test(arrow_schema, CreateBinaryRow({3})).value()); - ASSERT_FALSE(predicate->Test(arrow_schema, CreateBinaryRow({std::nullopt})).value()); + ASSERT_TRUE(predicate->Test(arrow_schema, CreateBigIntRow({1})).value()); + ASSERT_FALSE(predicate->Test(arrow_schema, CreateBigIntRow({2})).value()); + ASSERT_TRUE(predicate->Test(arrow_schema, CreateBigIntRow({3})).value()); + ASSERT_FALSE(predicate->Test(arrow_schema, CreateBigIntRow({std::nullopt})).value()); // with stats ASSERT_TRUE(StatsCheck(*predicate, 3ll, {FieldStats(0ll, 5ll, 0ll)})); ASSERT_FALSE(StatsCheck(*predicate, 3ll, {FieldStats(6ll, 7ll, 0ll)})); @@ -665,10 +680,10 @@ TEST_F(PredicateTest, TestNotIn) { // with internal row auto arrow_schema = arrow::schema(arrow::FieldVector({arrow::field("f0", bigint_type)})); - ASSERT_FALSE(predicate->Test(arrow_schema, CreateBinaryRow({1})).value()); - ASSERT_TRUE(predicate->Test(arrow_schema, CreateBinaryRow({2})).value()); - ASSERT_FALSE(predicate->Test(arrow_schema, CreateBinaryRow({3})).value()); - ASSERT_FALSE(predicate->Test(arrow_schema, CreateBinaryRow({std::nullopt})).value()); + ASSERT_FALSE(predicate->Test(arrow_schema, CreateBigIntRow({1})).value()); + ASSERT_TRUE(predicate->Test(arrow_schema, CreateBigIntRow({2})).value()); + ASSERT_FALSE(predicate->Test(arrow_schema, CreateBigIntRow({3})).value()); + ASSERT_FALSE(predicate->Test(arrow_schema, CreateBigIntRow({std::nullopt})).value()); // with stats ASSERT_FALSE(StatsCheck(*predicate, 3ll, {FieldStats(1ll, 1ll, 0ll)})); ASSERT_FALSE(StatsCheck(*predicate, 3ll, {FieldStats(3ll, 3ll, 0ll)})); @@ -700,10 +715,10 @@ TEST_F(PredicateTest, TestNotInNull) { // with internal row auto arrow_schema = arrow::schema(arrow::FieldVector({arrow::field("f0", bigint_type)})); - ASSERT_FALSE(predicate->Test(arrow_schema, CreateBinaryRow({1})).value()); - ASSERT_FALSE(predicate->Test(arrow_schema, CreateBinaryRow({2})).value()); - ASSERT_FALSE(predicate->Test(arrow_schema, CreateBinaryRow({3})).value()); - ASSERT_FALSE(predicate->Test(arrow_schema, CreateBinaryRow({std::nullopt})).value()); + ASSERT_FALSE(predicate->Test(arrow_schema, CreateBigIntRow({1})).value()); + ASSERT_FALSE(predicate->Test(arrow_schema, CreateBigIntRow({2})).value()); + ASSERT_FALSE(predicate->Test(arrow_schema, CreateBigIntRow({3})).value()); + ASSERT_FALSE(predicate->Test(arrow_schema, CreateBigIntRow({std::nullopt})).value()); // with stats ASSERT_FALSE(StatsCheck(*predicate, 3ll, {FieldStats(1ll, 1ll, 0ll)})); ASSERT_FALSE(StatsCheck(*predicate, 3ll, {FieldStats(3ll, 3ll, 0ll)})); @@ -741,10 +756,10 @@ TEST_F(PredicateTest, TestLargeIn) { // with internal row auto arrow_schema = arrow::schema(arrow::FieldVector({arrow::field("f0", bigint_type)})); - ASSERT_TRUE(predicate->Test(arrow_schema, CreateBinaryRow({1})).value()); - ASSERT_FALSE(predicate->Test(arrow_schema, CreateBinaryRow({2})).value()); - ASSERT_TRUE(predicate->Test(arrow_schema, CreateBinaryRow({3})).value()); - ASSERT_FALSE(predicate->Test(arrow_schema, CreateBinaryRow({std::nullopt})).value()); + ASSERT_TRUE(predicate->Test(arrow_schema, CreateBigIntRow({1})).value()); + ASSERT_FALSE(predicate->Test(arrow_schema, CreateBigIntRow({2})).value()); + ASSERT_TRUE(predicate->Test(arrow_schema, CreateBigIntRow({3})).value()); + ASSERT_FALSE(predicate->Test(arrow_schema, CreateBigIntRow({std::nullopt})).value()); // with stats ASSERT_TRUE(StatsCheck(*predicate, 3ll, {FieldStats(0ll, 5ll, 0ll)})); ASSERT_FALSE(StatsCheck(*predicate, 3ll, {FieldStats(6ll, 7ll, 0ll)})); @@ -781,10 +796,10 @@ TEST_F(PredicateTest, TestLargeInNull) { // with internal row auto arrow_schema = arrow::schema(arrow::FieldVector({arrow::field("f0", bigint_type)})); - ASSERT_TRUE(predicate->Test(arrow_schema, CreateBinaryRow({1})).value()); - ASSERT_FALSE(predicate->Test(arrow_schema, CreateBinaryRow({2})).value()); - ASSERT_TRUE(predicate->Test(arrow_schema, CreateBinaryRow({3})).value()); - ASSERT_FALSE(predicate->Test(arrow_schema, CreateBinaryRow({std::nullopt})).value()); + ASSERT_TRUE(predicate->Test(arrow_schema, CreateBigIntRow({1})).value()); + ASSERT_FALSE(predicate->Test(arrow_schema, CreateBigIntRow({2})).value()); + ASSERT_TRUE(predicate->Test(arrow_schema, CreateBigIntRow({3})).value()); + ASSERT_FALSE(predicate->Test(arrow_schema, CreateBigIntRow({std::nullopt})).value()); // with stats ASSERT_TRUE(StatsCheck(*predicate, 3ll, {FieldStats(0ll, 5ll, 0ll)})); ASSERT_FALSE(StatsCheck(*predicate, 3ll, {FieldStats(6ll, 7ll, 0ll)})); @@ -820,10 +835,10 @@ TEST_F(PredicateTest, TestLargeNotIn) { // with internal row auto arrow_schema = arrow::schema(arrow::FieldVector({arrow::field("f0", bigint_type)})); - ASSERT_FALSE(predicate->Test(arrow_schema, CreateBinaryRow({1})).value()); - ASSERT_TRUE(predicate->Test(arrow_schema, CreateBinaryRow({2})).value()); - ASSERT_FALSE(predicate->Test(arrow_schema, CreateBinaryRow({3})).value()); - ASSERT_FALSE(predicate->Test(arrow_schema, CreateBinaryRow({std::nullopt})).value()); + ASSERT_FALSE(predicate->Test(arrow_schema, CreateBigIntRow({1})).value()); + ASSERT_TRUE(predicate->Test(arrow_schema, CreateBigIntRow({2})).value()); + ASSERT_FALSE(predicate->Test(arrow_schema, CreateBigIntRow({3})).value()); + ASSERT_FALSE(predicate->Test(arrow_schema, CreateBigIntRow({std::nullopt})).value()); // with stats ASSERT_FALSE(StatsCheck(*predicate, 3ll, {FieldStats(1ll, 1ll, 0ll)})); ASSERT_FALSE(StatsCheck(*predicate, 3ll, {FieldStats(3ll, 3ll, 0ll)})); @@ -863,10 +878,10 @@ TEST_F(PredicateTest, TestLargeNotInNull) { // with internal row auto arrow_schema = arrow::schema(arrow::FieldVector({arrow::field("f0", bigint_type)})); - ASSERT_FALSE(predicate->Test(arrow_schema, CreateBinaryRow({1})).value()); - ASSERT_FALSE(predicate->Test(arrow_schema, CreateBinaryRow({2})).value()); - ASSERT_FALSE(predicate->Test(arrow_schema, CreateBinaryRow({3})).value()); - ASSERT_FALSE(predicate->Test(arrow_schema, CreateBinaryRow({std::nullopt})).value()); + ASSERT_FALSE(predicate->Test(arrow_schema, CreateBigIntRow({1})).value()); + ASSERT_FALSE(predicate->Test(arrow_schema, CreateBigIntRow({2})).value()); + ASSERT_FALSE(predicate->Test(arrow_schema, CreateBigIntRow({3})).value()); + ASSERT_FALSE(predicate->Test(arrow_schema, CreateBigIntRow({std::nullopt})).value()); // with stats ASSERT_FALSE(StatsCheck(*predicate, 3ll, {FieldStats(1ll, 1ll, 0ll)})); ASSERT_FALSE(StatsCheck(*predicate, 3ll, {FieldStats(3ll, 3ll, 0ll)})); @@ -910,10 +925,10 @@ TEST_F(PredicateTest, TestAnd) { // with internal row auto arrow_schema = arrow::schema( arrow::FieldVector({arrow::field("f0", bigint_type), arrow::field("f1", bigint_type)})); - ASSERT_FALSE(predicate->Test(arrow_schema, CreateBinaryRow({4, 5})).value()); - ASSERT_FALSE(predicate->Test(arrow_schema, CreateBinaryRow({3, 6})).value()); - ASSERT_TRUE(predicate->Test(arrow_schema, CreateBinaryRow({3, 5})).value()); - ASSERT_FALSE(predicate->Test(arrow_schema, CreateBinaryRow({std::nullopt, 5})).value()); + ASSERT_FALSE(predicate->Test(arrow_schema, CreateBigIntRow({4, 5})).value()); + ASSERT_FALSE(predicate->Test(arrow_schema, CreateBigIntRow({3, 6})).value()); + ASSERT_TRUE(predicate->Test(arrow_schema, CreateBigIntRow({3, 5})).value()); + ASSERT_FALSE(predicate->Test(arrow_schema, CreateBigIntRow({std::nullopt, 5})).value()); // with stats ASSERT_TRUE( StatsCheck(*predicate, 3ll, {FieldStats(3ll, 6ll, 0ll), FieldStats(4ll, 6ll, 0ll)})); @@ -956,10 +971,10 @@ TEST_F(PredicateTest, TestOr) { // with internal row auto arrow_schema = arrow::schema( arrow::FieldVector({arrow::field("f0", bigint_type), arrow::field("f1", bigint_type)})); - ASSERT_FALSE(predicate->Test(arrow_schema, CreateBinaryRow({4, 6})).value()); - ASSERT_TRUE(predicate->Test(arrow_schema, CreateBinaryRow({3, 6})).value()); - ASSERT_TRUE(predicate->Test(arrow_schema, CreateBinaryRow({3, 5})).value()); - ASSERT_TRUE(predicate->Test(arrow_schema, CreateBinaryRow({std::nullopt, 5})).value()); + ASSERT_FALSE(predicate->Test(arrow_schema, CreateBigIntRow({4, 6})).value()); + ASSERT_TRUE(predicate->Test(arrow_schema, CreateBigIntRow({3, 6})).value()); + ASSERT_TRUE(predicate->Test(arrow_schema, CreateBigIntRow({3, 5})).value()); + ASSERT_TRUE(predicate->Test(arrow_schema, CreateBigIntRow({std::nullopt, 5})).value()); // with stats ASSERT_TRUE( StatsCheck(*predicate, 3ll, {FieldStats(3ll, 6ll, 0ll), FieldStats(4ll, 6ll, 0ll)})); @@ -999,10 +1014,10 @@ TEST_F(PredicateTest, TestBetween) { // with internal row auto arrow_schema = arrow::schema(arrow::FieldVector({arrow::field("f0", bigint_type)})); - ASSERT_FALSE(predicate->Test(arrow_schema, CreateBinaryRow({1})).value()); - ASSERT_TRUE(predicate->Test(arrow_schema, CreateBinaryRow({4})).value()); - ASSERT_FALSE(predicate->Test(arrow_schema, CreateBinaryRow({std::nullopt})).value()); - ASSERT_TRUE(predicate_negate->Test(arrow_schema, CreateBinaryRow({1})).value()); + ASSERT_FALSE(predicate->Test(arrow_schema, CreateBigIntRow({1})).value()); + ASSERT_TRUE(predicate->Test(arrow_schema, CreateBigIntRow({4})).value()); + ASSERT_FALSE(predicate->Test(arrow_schema, CreateBigIntRow({std::nullopt})).value()); + ASSERT_TRUE(predicate_negate->Test(arrow_schema, CreateBigIntRow({1})).value()); // with stats ASSERT_TRUE(StatsCheck(*predicate, 3ll, {FieldStats(1ll, 10ll, 0ll)})); @@ -1031,14 +1046,292 @@ TEST_F(PredicateTest, TestBetweenNull) { // with internal row auto arrow_schema = arrow::schema(arrow::FieldVector({arrow::field("f0", bigint_type)})); - ASSERT_FALSE(predicate->Test(arrow_schema, CreateBinaryRow({4})).value()); - ASSERT_FALSE(predicate->Test(arrow_schema, CreateBinaryRow({std::nullopt})).value()); + ASSERT_FALSE(predicate->Test(arrow_schema, CreateBigIntRow({4})).value()); + ASSERT_FALSE(predicate->Test(arrow_schema, CreateBigIntRow({std::nullopt})).value()); // with stats ASSERT_FALSE(StatsCheck(*predicate, 3ll, {FieldStats(1ll, 10ll, 0ll)})); ASSERT_FALSE(StatsCheck(*predicate, 1ll, {FieldStats(std::nullopt, std::nullopt, 1ll)})); } +TEST_F(PredicateTest, TestStartsWith) { + auto string_type = arrow::utf8(); + ASSERT_OK_AND_ASSIGN( + const auto predicate_base, + PredicateBuilder::StartsWith(/*field_index=*/0, /*field_name=*/"f0", FieldType::STRING, + Literal(FieldType::STRING, "aab", 3))); + const auto predicate = std::dynamic_pointer_cast(predicate_base); + ASSERT_TRUE(predicate); + auto f0 = arrow::ipc::internal::json::ArrayFromJSON(string_type, + R"(["ccddee", "bbccdd", "aabbcc", null])") + .ValueOrDie(); + auto f1 = arrow::ipc::internal::json::ArrayFromJSON( + string_type, R"(["gghhii", "ffgghh", "eeffgg", "ddeeff"])") + .ValueOrDie(); + std::shared_ptr src_type = + arrow::struct_({arrow::field("f0", string_type), arrow::field("f1", string_type)}); + + std::shared_ptr struct_array = + arrow::StructArray::Make({f0, f1}, src_type->fields()).ValueOrDie(); + + ASSERT_OK_AND_ASSIGN(auto is_valid, predicate->Test(*struct_array)); + ASSERT_EQ(is_valid, std::vector({0, 0, 1, 0})); + + ASSERT_EQ(predicate->Negate(), nullptr); + // with internal row + auto arrow_schema = arrow::schema(arrow::FieldVector({arrow::field("f0", string_type)})); + ASSERT_FALSE(predicate->Test(arrow_schema, CreateStringRow({"ccddee"})).value()); + ASSERT_FALSE(predicate->Test(arrow_schema, CreateStringRow({"bbccdd"})).value()); + ASSERT_TRUE(predicate->Test(arrow_schema, CreateStringRow({"aabbcc"})).value()); + ASSERT_FALSE(predicate->Test(arrow_schema, CreateStringRow({std::nullopt})).value()); +} + +TEST_F(PredicateTest, TestStartsWithNull) { + const auto string_type = arrow::utf8(); + ASSERT_OK_AND_ASSIGN( + const auto predicate_base, + PredicateBuilder::StartsWith( + /*field_index=*/0, /*field_name=*/"f0", FieldType::STRING, Literal(FieldType::STRING))); + const auto predicate = std::dynamic_pointer_cast(predicate_base); + ASSERT_TRUE(predicate); + auto f0 = + arrow::ipc::internal::json::ArrayFromJSON(string_type, R"(["bbccdd", null])").ValueOrDie(); + auto f1 = arrow::ipc::internal::json::ArrayFromJSON(string_type, R"(["ffgghh", "ccddee"])") + .ValueOrDie(); + std::shared_ptr src_type = + arrow::struct_({arrow::field("f0", string_type), arrow::field("f1", string_type)}); + + std::shared_ptr struct_array = + arrow::StructArray::Make({f0, f1}, src_type->fields()).ValueOrDie(); + + ASSERT_OK_AND_ASSIGN(auto is_valid, predicate->Test(*struct_array)); + ASSERT_EQ(is_valid, std::vector({0, 0})); + + // with internal row + auto arrow_schema = arrow::schema(arrow::FieldVector({arrow::field("f0", string_type)})); + ASSERT_FALSE(predicate->Test(arrow_schema, CreateStringRow({"bbccdd"})).value()); + ASSERT_FALSE(predicate->Test(arrow_schema, CreateStringRow({std::nullopt})).value()); +} + +TEST_F(PredicateTest, TestEndsWith) { + auto string_type = arrow::utf8(); + ASSERT_OK_AND_ASSIGN( + const auto predicate_base, + PredicateBuilder::EndsWith(/*field_index=*/0, /*field_name=*/"f0", FieldType::STRING, + Literal(FieldType::STRING, "bcc", 3))); + const auto predicate = std::dynamic_pointer_cast(predicate_base); + ASSERT_TRUE(predicate); + auto f0 = arrow::ipc::internal::json::ArrayFromJSON(string_type, + R"(["ccddee", "bbccdd", "aabbcc", null])") + .ValueOrDie(); + auto f1 = arrow::ipc::internal::json::ArrayFromJSON( + string_type, R"(["gghhii", "ffgghh", "eeffgg", "ddeeff"])") + .ValueOrDie(); + std::shared_ptr src_type = + arrow::struct_({arrow::field("f0", string_type), arrow::field("f1", string_type)}); + + std::shared_ptr struct_array = + arrow::StructArray::Make({f0, f1}, src_type->fields()).ValueOrDie(); + + ASSERT_OK_AND_ASSIGN(auto is_valid, predicate->Test(*struct_array)); + ASSERT_EQ(is_valid, std::vector({0, 0, 1, 0})); + + ASSERT_EQ(predicate->Negate(), nullptr); + // with internal row + auto arrow_schema = arrow::schema(arrow::FieldVector({arrow::field("f0", string_type)})); + ASSERT_FALSE(predicate->Test(arrow_schema, CreateStringRow({"ccddee"})).value()); + ASSERT_FALSE(predicate->Test(arrow_schema, CreateStringRow({"bbccdd"})).value()); + ASSERT_TRUE(predicate->Test(arrow_schema, CreateStringRow({"aabbcc"})).value()); + ASSERT_FALSE(predicate->Test(arrow_schema, CreateStringRow({std::nullopt})).value()); +} + +TEST_F(PredicateTest, TestEndsWithNull) { + const auto string_type = arrow::utf8(); + ASSERT_OK_AND_ASSIGN( + const auto predicate_base, + PredicateBuilder::EndsWith( + /*field_index=*/0, /*field_name=*/"f0", FieldType::STRING, Literal(FieldType::STRING))); + const auto predicate = std::dynamic_pointer_cast(predicate_base); + ASSERT_TRUE(predicate); + auto f0 = + arrow::ipc::internal::json::ArrayFromJSON(string_type, R"(["bbccdd", null])").ValueOrDie(); + auto f1 = arrow::ipc::internal::json::ArrayFromJSON(string_type, R"(["ffgghh", "ccddee"])") + .ValueOrDie(); + std::shared_ptr src_type = + arrow::struct_({arrow::field("f0", string_type), arrow::field("f1", string_type)}); + + std::shared_ptr struct_array = + arrow::StructArray::Make({f0, f1}, src_type->fields()).ValueOrDie(); + + ASSERT_OK_AND_ASSIGN(auto is_valid, predicate->Test(*struct_array)); + ASSERT_EQ(is_valid, std::vector({0, 0})); + + // with internal row + auto arrow_schema = arrow::schema(arrow::FieldVector({arrow::field("f0", string_type)})); + ASSERT_FALSE(predicate->Test(arrow_schema, CreateStringRow({"bbccdd"})).value()); + ASSERT_FALSE(predicate->Test(arrow_schema, CreateStringRow({std::nullopt})).value()); +} + +TEST_F(PredicateTest, TestContains) { + auto string_type = arrow::utf8(); + ASSERT_OK_AND_ASSIGN( + const auto predicate_base, + PredicateBuilder::Contains(/*field_index=*/0, /*field_name=*/"f0", FieldType::STRING, + Literal(FieldType::STRING, "cde", 3))); + const auto predicate = std::dynamic_pointer_cast(predicate_base); + ASSERT_TRUE(predicate); + auto f0 = arrow::ipc::internal::json::ArrayFromJSON(string_type, + R"(["ghijkl", "defghi", "abcdef", null])") + .ValueOrDie(); + auto f1 = arrow::ipc::internal::json::ArrayFromJSON( + string_type, R"(["stuvwx", "pqrstu", "mnopqr", "jklmno"])") + .ValueOrDie(); + std::shared_ptr src_type = + arrow::struct_({arrow::field("f0", string_type), arrow::field("f1", string_type)}); + + std::shared_ptr struct_array = + arrow::StructArray::Make({f0, f1}, src_type->fields()).ValueOrDie(); + + ASSERT_OK_AND_ASSIGN(auto is_valid, predicate->Test(*struct_array)); + ASSERT_EQ(is_valid, std::vector({0, 0, 1, 0})); + + ASSERT_EQ(predicate->Negate(), nullptr); + // with internal row + auto arrow_schema = arrow::schema(arrow::FieldVector({arrow::field("f0", string_type)})); + ASSERT_FALSE(predicate->Test(arrow_schema, CreateStringRow({"ghijkl"})).value()); + ASSERT_FALSE(predicate->Test(arrow_schema, CreateStringRow({"defghi"})).value()); + ASSERT_TRUE(predicate->Test(arrow_schema, CreateStringRow({"abcdef"})).value()); + ASSERT_FALSE(predicate->Test(arrow_schema, CreateStringRow({std::nullopt})).value()); +} + +TEST_F(PredicateTest, TestContainsNull) { + const auto string_type = arrow::utf8(); + ASSERT_OK_AND_ASSIGN( + const auto predicate_base, + PredicateBuilder::Contains( + /*field_index=*/0, /*field_name=*/"f0", FieldType::STRING, Literal(FieldType::STRING))); + const auto predicate = std::dynamic_pointer_cast(predicate_base); + ASSERT_TRUE(predicate); + auto f0 = + arrow::ipc::internal::json::ArrayFromJSON(string_type, R"(["defghi", null])").ValueOrDie(); + auto f1 = arrow::ipc::internal::json::ArrayFromJSON(string_type, R"(["pqrstu", "jklmno"])") + .ValueOrDie(); + std::shared_ptr src_type = + arrow::struct_({arrow::field("f0", string_type), arrow::field("f1", string_type)}); + + std::shared_ptr struct_array = + arrow::StructArray::Make({f0, f1}, src_type->fields()).ValueOrDie(); + + ASSERT_OK_AND_ASSIGN(auto is_valid, predicate->Test(*struct_array)); + ASSERT_EQ(is_valid, std::vector({0, 0})); + + // with internal row + auto arrow_schema = arrow::schema(arrow::FieldVector({arrow::field("f0", string_type)})); + ASSERT_FALSE(predicate->Test(arrow_schema, CreateStringRow({"defghi"})).value()); + ASSERT_FALSE(predicate->Test(arrow_schema, CreateStringRow({std::nullopt})).value()); +} + +TEST_F(PredicateTest, TestLike) { + ASSERT_OK_AND_ASSIGN(auto predicate_base, + PredicateBuilder::Like( + /*field_index=*/0, /*field_name=*/"f0", FieldType::STRING, + Literal(FieldType::STRING, "a.c", 3))); + auto predicate = std::dynamic_pointer_cast(predicate_base); + ASSERT_TRUE(predicate); + ASSERT_EQ(predicate->Negate(), nullptr); + // with internal row + auto arrow_schema = arrow::schema(arrow::FieldVector({arrow::field("f0", arrow::utf8())})); + ASSERT_FALSE(predicate->Test(arrow_schema, CreateStringRow({"abc"})).value()); + ASSERT_TRUE(predicate->Test(arrow_schema, CreateStringRow({"a.c"})).value()); + + ASSERT_OK_AND_ASSIGN(predicate_base, + PredicateBuilder::Like( + /*field_index=*/0, /*field_name=*/"f0", FieldType::STRING, + Literal(FieldType::STRING, "a.*d", 4))); + predicate = std::dynamic_pointer_cast(predicate_base); + ASSERT_FALSE(predicate->Test(arrow_schema, CreateStringRow({"abcd"})).value()); + + ASSERT_OK_AND_ASSIGN(predicate_base, + PredicateBuilder::Like( + /*field_index=*/0, /*field_name=*/"f0", FieldType::STRING, + Literal(FieldType::STRING, "%c.e", 4))); + predicate = std::dynamic_pointer_cast(predicate_base); + ASSERT_FALSE(predicate->Test(arrow_schema, CreateStringRow({"abcde"})).value()); + + ASSERT_OK_AND_ASSIGN(predicate_base, + PredicateBuilder::Like( + /*field_index=*/0, /*field_name=*/"f0", FieldType::STRING, + Literal(FieldType::STRING, "a\\_c", 4))); + predicate = std::dynamic_pointer_cast(predicate_base); + ASSERT_FALSE(predicate->Test(arrow_schema, CreateStringRow({"a-c"})).value()); + ASSERT_TRUE(predicate->Test(arrow_schema, CreateStringRow({"a_c"})).value()); + + ASSERT_OK_AND_ASSIGN(predicate_base, + PredicateBuilder::Like( + /*field_index=*/0, /*field_name=*/"f0", FieldType::STRING, + Literal(FieldType::STRING, "start%", 6))); + predicate = std::dynamic_pointer_cast(predicate_base); + ASSERT_TRUE(predicate->Test(arrow_schema, CreateStringRow({"startX"})).value()); + ASSERT_FALSE(predicate->Test(arrow_schema, CreateStringRow({"not_startX"})).value()); + + ASSERT_OK_AND_ASSIGN(predicate_base, + PredicateBuilder::Like( + /*field_index=*/0, /*field_name=*/"f0", FieldType::STRING, + Literal(FieldType::STRING, "%middle%", 8))); + predicate = std::dynamic_pointer_cast(predicate_base); + ASSERT_TRUE(predicate->Test(arrow_schema, CreateStringRow({"xxmiddleyy"})).value()); + ASSERT_FALSE(predicate->Test(arrow_schema, CreateStringRow({"xxmidxdleyy"})).value()); + + ASSERT_OK_AND_ASSIGN(predicate_base, + PredicateBuilder::Like( + /*field_index=*/0, /*field_name=*/"f0", FieldType::STRING, + Literal(FieldType::STRING, "%end", 4))); + predicate = std::dynamic_pointer_cast(predicate_base); + ASSERT_TRUE(predicate->Test(arrow_schema, CreateStringRow({"xxend"})).value()); + ASSERT_FALSE(predicate->Test(arrow_schema, CreateStringRow({"xxendyy"})).value()); + + ASSERT_OK_AND_ASSIGN(predicate_base, + PredicateBuilder::Like( + /*field_index=*/0, /*field_name=*/"f0", FieldType::STRING, + Literal(FieldType::STRING, "equal", 5))); + predicate = std::dynamic_pointer_cast(predicate_base); + ASSERT_TRUE(predicate->Test(arrow_schema, CreateStringRow({"equal"})).value()); + ASSERT_FALSE(predicate->Test(arrow_schema, CreateStringRow({"equalxx"})).value()); + + ASSERT_OK_AND_ASSIGN(predicate_base, + PredicateBuilder::Like( + /*field_index=*/0, /*field_name=*/"f0", FieldType::STRING, + Literal(FieldType::STRING, "st_rt%", 6))); + predicate = std::dynamic_pointer_cast(predicate_base); + ASSERT_TRUE(predicate->Test(arrow_schema, CreateStringRow({"startxx"})).value()); + ASSERT_TRUE(predicate->Test(arrow_schema, CreateStringRow({"stbrtxx"})).value()); + ASSERT_FALSE(predicate->Test(arrow_schema, CreateStringRow({"xxstbrtxx"})).value()); + + ASSERT_OK_AND_ASSIGN(predicate_base, + PredicateBuilder::Like( + /*field_index=*/0, /*field_name=*/"f0", FieldType::STRING, + Literal(FieldType::STRING, "abc%def%", 8))); + predicate = std::dynamic_pointer_cast(predicate_base); + ASSERT_TRUE(predicate->Test(arrow_schema, CreateStringRow({"abchahadefxx"})).value()); + ASSERT_FALSE(predicate->Test(arrow_schema, CreateStringRow({"abchahadafxx"})).value()); +} + +TEST_F(PredicateTest, TestCompound) { + ASSERT_OK_AND_ASSIGN( + const auto startswith_predicate, + PredicateBuilder::StartsWith(/*field_index=*/0, /*field_name=*/"f0", FieldType::STRING, + Literal(FieldType::STRING, "aab", 3))); + ASSERT_OK_AND_ASSIGN( + const auto endswith_predicate, + PredicateBuilder::EndsWith(/*field_index=*/0, /*field_name=*/"f0", FieldType::STRING, + Literal(FieldType::STRING, "bcc", 3))); + ASSERT_OK_AND_ASSIGN(const auto compound_predicate, + PredicateBuilder::And({startswith_predicate, endswith_predicate})); + ASSERT_NOK_WITH_MSG( + PredicateBuilder::Not(compound_predicate), + "Could not construct A NOT predicate from And([StartsWith(f0, aab), EndsWith(f0, bcc)])"); +} + TEST_F(PredicateTest, TestPredicateToString) { { auto predicate = PredicateBuilder::Equal(/*field_index=*/0, /*field_name=*/"f0", diff --git a/src/paimon/common/predicate/predicate_utils.h b/src/paimon/common/predicate/predicate_utils.h index 397f6871..b5e030b2 100644 --- a/src/paimon/common/predicate/predicate_utils.h +++ b/src/paimon/common/predicate/predicate_utils.h @@ -97,8 +97,23 @@ class PAIMON_EXPORT PredicateUtils { return visitor->VisitIn(predicate->Literals()); case Function::Type::NOT_IN: return visitor->VisitNotIn(predicate->Literals()); + case Function::Type::STARTS_WITH: { + assert(predicate->Literals().size() == 1); + return visitor->VisitStartsWith(predicate->Literals()[0]); + } + case Function::Type::ENDS_WITH: { + assert(predicate->Literals().size() == 1); + return visitor->VisitEndsWith(predicate->Literals()[0]); + } + case Function::Type::CONTAINS: { + assert(predicate->Literals().size() == 1); + return visitor->VisitContains(predicate->Literals()[0]); + } + case Function::Type::LIKE: { + assert(predicate->Literals().size() == 1); + return visitor->VisitLike(predicate->Literals()[0]); + } default: - // TODO(xinyu.lxy): support StartsWith/EndsWith/Contains return Status::Invalid(fmt::format("invalid {} function in leaf predicate", predicate->GetFunction().ToString())); } diff --git a/src/paimon/common/predicate/starts_with.cpp b/src/paimon/common/predicate/starts_with.cpp new file mode 100644 index 00000000..1fbc5bb5 --- /dev/null +++ b/src/paimon/common/predicate/starts_with.cpp @@ -0,0 +1,38 @@ +/* + * Copyright 2026-present Alibaba Inc. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include "paimon/common/predicate/starts_with.h" + +#include "paimon/common/utils/string_utils.h" + +namespace paimon { + +Result StartsWith::TestString(const std::string& field, const std::string& pattern) const { + return StringUtils::StartsWith(field, pattern); +} + +Result StartsWith::Test(int64_t row_count, const Literal& min_value, const Literal& max_value, + const std::optional& null_count, + const Literal& pattern_literal) const { + const auto min_str = min_value.GetValue(); + const auto max_str = max_value.GetValue(); + const auto pattern_str = pattern_literal.GetValue(); + PAIMON_ASSIGN_OR_RAISE(const auto min_test, TestString(min_str, pattern_str)); + PAIMON_ASSIGN_OR_RAISE(const auto max_test, TestString(max_str, pattern_str)); + return (min_test || min_str.compare(pattern_str) <= 0) && + (max_test || max_str.compare(pattern_str) >= 0); +} +} // namespace paimon diff --git a/src/paimon/common/predicate/starts_with.h b/src/paimon/common/predicate/starts_with.h new file mode 100644 index 00000000..220f8259 --- /dev/null +++ b/src/paimon/common/predicate/starts_with.h @@ -0,0 +1,51 @@ +/* + * Copyright 2026-present Alibaba Inc. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#pragma once + +#include + +#include "paimon/common/predicate/string_leaf_binary_function.h" +#include "paimon/common/utils/string_utils.h" +#include "paimon/result.h" + +namespace paimon { +/// A `StringLeafBinaryFunction` to eval filter like 'abc%' or filter like 'abc_'. +class StartsWith : public StringLeafBinaryFunction { + public: + static const StartsWith& Instance() { + static const StartsWith instance = StartsWith(); + return instance; + } + + Type GetType() const override { + return Type::STARTS_WITH; + } + + std::string ToString() const override { + return "StartsWith"; + } + + Result TestString(const std::string& field, const std::string& pattern) const override; + + Result Test(int64_t row_count, const Literal& min_value, const Literal& max_value, + const std::optional& null_count, + const Literal& pattern_literal) const override; + + private: + StartsWith() = default; +}; +} // namespace paimon diff --git a/src/paimon/common/predicate/string_leaf_binary_function.h b/src/paimon/common/predicate/string_leaf_binary_function.h new file mode 100644 index 00000000..0406456b --- /dev/null +++ b/src/paimon/common/predicate/string_leaf_binary_function.h @@ -0,0 +1,42 @@ +/* + * Copyright 2026-present Alibaba Inc. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#pragma once + +#include "paimon/common/predicate/null_false_leaf_binary_function.h" + +namespace paimon { +class LeafFunction; + +class StringLeafBinaryFunction : public NullFalseLeafBinaryFunction { + public: + virtual Result TestString(const std::string& field, const std::string& pattern) const = 0; + + Result Test(const Literal& field, const Literal& pattern_literal) const override { + return TestString(field.GetValue(), pattern_literal.GetValue()); + } + + Result Test(int64_t row_count, const Literal& min_value, const Literal& max_value, + const std::optional& null_count, + const Literal& literal) const override { + return true; + } + + const LeafFunction* Negate() const override { + return nullptr; + } +}; +} // namespace paimon diff --git a/src/paimon/format/orc/predicate_converter.cpp b/src/paimon/format/orc/predicate_converter.cpp index 39fa5553..c470d11d 100644 --- a/src/paimon/format/orc/predicate_converter.cpp +++ b/src/paimon/format/orc/predicate_converter.cpp @@ -203,6 +203,14 @@ Status PredicateConverter::ConvertLeaf( builder->end(); break; } + case Function::Type::STARTS_WITH: + case Function::Type::ENDS_WITH: + case Function::Type::CONTAINS: + case Function::Type::LIKE: + // SearchArgument does not support predicates including StartsWith, EndsWith, Contains + // and Like that should skip. + builder->literal(::orc::TruthValue::YES); + break; default: return Status::Invalid( fmt::format("invalid predicate type {}", static_cast(function_type))); diff --git a/src/paimon/format/orc/predicate_converter_test.cpp b/src/paimon/format/orc/predicate_converter_test.cpp index c71b9640..d3f64933 100644 --- a/src/paimon/format/orc/predicate_converter_test.cpp +++ b/src/paimon/format/orc/predicate_converter_test.cpp @@ -111,6 +111,38 @@ TEST(PredicateConverterTest, TestSimple) { ASSERT_OK_AND_ASSIGN(auto search_arg, PredicateConverter::Convert(*orc_type, predicate)); ASSERT_EQ("leaf-0 = (column(id=1) in [1, 3, 5]), expr = leaf-0", search_arg->toString()); } + { + ASSERT_OK_AND_ASSIGN( + const auto predicate, + PredicateBuilder::StartsWith(/*field_index=*/0, /*field_name=*/"f0", FieldType::STRING, + Literal(FieldType::STRING, "aab", 3))); + ASSERT_OK_AND_ASSIGN(auto search_arg, PredicateConverter::Convert(*orc_type, predicate)); + ASSERT_EQ("expr = YES", search_arg->toString()); + } + { + ASSERT_OK_AND_ASSIGN( + const auto predicate, + PredicateBuilder::EndsWith(/*field_index=*/0, /*field_name=*/"f0", FieldType::STRING, + Literal(FieldType::STRING, "bcc", 3))); + ASSERT_OK_AND_ASSIGN(auto search_arg, PredicateConverter::Convert(*orc_type, predicate)); + ASSERT_EQ("expr = YES", search_arg->toString()); + } + { + ASSERT_OK_AND_ASSIGN( + const auto predicate, + PredicateBuilder::Contains(/*field_index=*/0, /*field_name=*/"f0", FieldType::STRING, + Literal(FieldType::STRING, "abc", 3))); + ASSERT_OK_AND_ASSIGN(auto search_arg, PredicateConverter::Convert(*orc_type, predicate)); + ASSERT_EQ("expr = YES", search_arg->toString()); + } + { + ASSERT_OK_AND_ASSIGN( + const auto predicate, + PredicateBuilder::Like(/*field_index=*/0, /*field_name=*/"f0", FieldType::STRING, + Literal(FieldType::STRING, "abc", 3))); + ASSERT_OK_AND_ASSIGN(auto search_arg, PredicateConverter::Convert(*orc_type, predicate)); + ASSERT_EQ("expr = YES", search_arg->toString()); + } { auto predicate = PredicateBuilder::NotIn(/*field_index=*/0, /*field_name=*/"f0", FieldType::BIGINT, diff --git a/src/paimon/format/parquet/predicate_converter.cpp b/src/paimon/format/parquet/predicate_converter.cpp index 0b46b76a..680c5c0b 100644 --- a/src/paimon/format/parquet/predicate_converter.cpp +++ b/src/paimon/format/parquet/predicate_converter.cpp @@ -19,6 +19,7 @@ #include #include +#include "arrow/compute/api.h" #include "arrow/compute/expression.h" #include "arrow/scalar.h" #include "arrow/type_fwd.h" @@ -206,6 +207,34 @@ Result PredicateConverter::ConvertLeaf( } return arrow::compute::and_(sub_exprs); } + case Function::Type::STARTS_WITH: { + PAIMON_RETURN_NOT_OK(CheckLiteralNotEmpty(literals, function, field_name)); + auto options = std::make_shared( + literals[0].GetValue()); + return arrow::compute::call("starts_with", {arrow::compute::field_ref(field_name)}, + options); + } + case Function::Type::ENDS_WITH: { + PAIMON_RETURN_NOT_OK(CheckLiteralNotEmpty(literals, function, field_name)); + auto options = std::make_shared( + literals[0].GetValue()); + return arrow::compute::call("ends_with", {arrow::compute::field_ref(field_name)}, + options); + } + case Function::Type::CONTAINS: { + PAIMON_RETURN_NOT_OK(CheckLiteralNotEmpty(literals, function, field_name)); + auto options = std::make_shared( + literals[0].GetValue()); + return arrow::compute::call("match_substring", {arrow::compute::field_ref(field_name)}, + options); + } + case Function::Type::LIKE: { + PAIMON_RETURN_NOT_OK(CheckLiteralNotEmpty(literals, function, field_name)); + auto options = std::make_shared( + literals[0].GetValue()); + return arrow::compute::call("match_like", {arrow::compute::field_ref(field_name)}, + options); + } default: return Status::Invalid( fmt::format("invalid predicate type {}", static_cast(function_type))); diff --git a/src/paimon/format/parquet/predicate_converter_test.cpp b/src/paimon/format/parquet/predicate_converter_test.cpp index 69dfd2e7..6a76585f 100644 --- a/src/paimon/format/parquet/predicate_converter_test.cpp +++ b/src/paimon/format/parquet/predicate_converter_test.cpp @@ -126,6 +126,43 @@ TEST(PredicateConverterTest, TestSimple) { predicate, /*predicate_node_count_limit=*/100)); ASSERT_EQ("(((f0 != 1) and (f0 != 3)) and (f0 != 5))", expression.ToString()); } + { + ASSERT_OK_AND_ASSIGN( + const auto predicate, + PredicateBuilder::StartsWith(/*field_index=*/0, /*field_name=*/"f0", FieldType::STRING, + Literal(FieldType::STRING, "aab", 3))); + ASSERT_OK_AND_ASSIGN(auto expression, PredicateConverter::Convert( + predicate, /*predicate_node_count_limit=*/100)); + ASSERT_EQ("starts_with(f0, {pattern=\"aab\", ignore_case=false})", expression.ToString()); + } + { + ASSERT_OK_AND_ASSIGN( + const auto predicate, + PredicateBuilder::EndsWith(/*field_index=*/0, /*field_name=*/"f0", FieldType::STRING, + Literal(FieldType::STRING, "bcc", 3))); + ASSERT_OK_AND_ASSIGN(auto expression, PredicateConverter::Convert( + predicate, /*predicate_node_count_limit=*/100)); + ASSERT_EQ("ends_with(f0, {pattern=\"bcc\", ignore_case=false})", expression.ToString()); + } + { + ASSERT_OK_AND_ASSIGN( + const auto predicate, + PredicateBuilder::Contains(/*field_index=*/0, /*field_name=*/"f0", FieldType::STRING, + Literal(FieldType::STRING, "abc", 3))); + ASSERT_OK_AND_ASSIGN(auto expression, PredicateConverter::Convert( + predicate, /*predicate_node_count_limit=*/100)); + ASSERT_EQ("match_substring(f0, {pattern=\"abc\", ignore_case=false})", + expression.ToString()); + } + { + ASSERT_OK_AND_ASSIGN( + const auto predicate, + PredicateBuilder::Like(/*field_index=*/0, /*field_name=*/"f0", FieldType::STRING, + Literal(FieldType::STRING, "abc", 3))); + ASSERT_OK_AND_ASSIGN(auto expression, PredicateConverter::Convert( + predicate, /*predicate_node_count_limit=*/100)); + ASSERT_EQ("match_like(f0, {pattern=\"abc\", ignore_case=false})", expression.ToString()); + } { // support decimal precision and scale mismatches between literal and data auto predicate = PredicateBuilder::In(/*field_index=*/7, /*field_name=*/"f7", diff --git a/src/paimon/format/parquet/predicate_pushdown_test.cpp b/src/paimon/format/parquet/predicate_pushdown_test.cpp index 3c108fcf..338e9494 100644 --- a/src/paimon/format/parquet/predicate_pushdown_test.cpp +++ b/src/paimon/format/parquet/predicate_pushdown_test.cpp @@ -320,6 +320,38 @@ TEST_F(PredicatePushdownTest, TestStringData) { Literal(FieldType::STRING, "zooooooo", 8)); CheckResult(read_schema, predicate, /*expected_array=*/nullptr); } + { + // f0 like 'ba%', has data + ASSERT_OK_AND_ASSIGN(const auto predicate, + PredicateBuilder::StartsWith( + /*field_index=*/0, /*field_name=*/"f0", FieldType::STRING, + Literal(FieldType::STRING, "ba", 2))); + CheckResult(read_schema, predicate, /*expected_array=*/expected_array); + } + { + // f0 like '%ta', has data + ASSERT_OK_AND_ASSIGN(const auto predicate, + PredicateBuilder::EndsWith( + /*field_index=*/0, /*field_name=*/"f0", FieldType::STRING, + Literal(FieldType::STRING, "ta", 2))); + CheckResult(read_schema, predicate, /*expected_array=*/expected_array); + } + { + // f0 like '%me%', has data + ASSERT_OK_AND_ASSIGN(const auto predicate, + PredicateBuilder::Contains( + /*field_index=*/0, /*field_name=*/"f0", FieldType::STRING, + Literal(FieldType::STRING, "me", 2))); + CheckResult(read_schema, predicate, /*expected_array=*/expected_array); + } + { + // f0 like 'me', no data + ASSERT_OK_AND_ASSIGN(const auto predicate, + PredicateBuilder::Like( + /*field_index=*/0, /*field_name=*/"f0", FieldType::STRING, + Literal(FieldType::STRING, "me", 2))); + CheckResult(read_schema, predicate, /*expected_array=*/expected_array); + } } TEST_F(PredicatePushdownTest, TestBinaryData) {