From be6f0228bc9086e8b6549a8992cee1e550f09535 Mon Sep 17 00:00:00 2001 From: "logan.riggs@gmail.com" Date: Thu, 14 May 2026 20:54:32 +0000 Subject: [PATCH 1/4] Add regexp_extract optional third parameter function version. --- cpp/src/gandiva/function_registry_string.cc | 6 ++++ cpp/src/gandiva/gdv_string_function_stubs.cc | 23 ++++++++++++++ cpp/src/gandiva/regex_functions_holder.cc | 4 +-- .../gandiva/regex_functions_holder_test.cc | 31 ++++++++++++++++--- 4 files changed, 58 insertions(+), 6 deletions(-) diff --git a/cpp/src/gandiva/function_registry_string.cc b/cpp/src/gandiva/function_registry_string.cc index be57ce4f4768..507c08b7cd7f 100644 --- a/cpp/src/gandiva/function_registry_string.cc +++ b/cpp/src/gandiva/function_registry_string.cc @@ -257,6 +257,12 @@ std::vector GetStringFunctionRegistry() { NativeFunction::kNeedsFunctionHolder | NativeFunction::kCanReturnErrors), + NativeFunction("regexp_extract", {}, DataTypeVector{utf8(), utf8()}, + utf8(), kResultNullIfNull, "gdv_fn_regexp_extract_utf8_utf8", + NativeFunction::kNeedsContext | + NativeFunction::kNeedsFunctionHolder | + NativeFunction::kCanReturnErrors), + NativeFunction("regexp_extract", {}, DataTypeVector{utf8(), utf8(), int32()}, utf8(), kResultNullIfNull, "gdv_fn_regexp_extract_utf8_utf8_int32", NativeFunction::kNeedsContext | diff --git a/cpp/src/gandiva/gdv_string_function_stubs.cc b/cpp/src/gandiva/gdv_string_function_stubs.cc index d271834fb478..80919de902e5 100644 --- a/cpp/src/gandiva/gdv_string_function_stubs.cc +++ b/cpp/src/gandiva/gdv_string_function_stubs.cc @@ -70,6 +70,16 @@ const char* gdv_fn_regexp_replace_utf8_utf8( out_length); } +const char* gdv_fn_regexp_extract_utf8_utf8(int64_t ptr, int64_t holder_ptr, + const char* data, int32_t data_len, + const char* /*pattern*/, + int32_t /*pattern_len*/, + int32_t* out_length) { + gandiva::ExecutionContext* context = reinterpret_cast(ptr); + gandiva::ExtractHolder* holder = reinterpret_cast(holder_ptr); + return (*holder)(context, data, data_len, 1, out_length); +} + const char* gdv_fn_regexp_extract_utf8_utf8_int32(int64_t ptr, int64_t holder_ptr, const char* data, int32_t data_len, const char* /*pattern*/, @@ -855,6 +865,19 @@ arrow::Status ExportedStringFunctions::AddMappings(Engine* engine) const { "gdv_fn_regexp_extract_utf8_utf8_int32", types->i8_ptr_type() /*return_type*/, args, reinterpret_cast(gdv_fn_regexp_extract_utf8_utf8_int32)); + // gdv_fn_regexp_extract_utf8_utf8 + args = {types->i64_type(), // int64_t ptr + types->i64_type(), // int64_t holder_ptr + types->i8_ptr_type(), // const char* data + types->i32_type(), // int data_len + types->i8_ptr_type(), // const char* pattern + types->i32_type(), // int pattern_len + types->i32_ptr_type()}; // int32_t* out_length + + engine->AddGlobalMappingForFunc( + "gdv_fn_regexp_extract_utf8_utf8", types->i8_ptr_type() /*return_type*/, args, + reinterpret_cast(gdv_fn_regexp_extract_utf8_utf8)); + // gdv_fn_castVARCHAR_int32_int64 args = {types->i64_type(), // int64_t execution_context types->i32_type(), // int32_t value diff --git a/cpp/src/gandiva/regex_functions_holder.cc b/cpp/src/gandiva/regex_functions_holder.cc index 6c0c3d40f127..334c640833e1 100644 --- a/cpp/src/gandiva/regex_functions_holder.cc +++ b/cpp/src/gandiva/regex_functions_holder.cc @@ -212,8 +212,8 @@ void ReplaceHolder::return_error(ExecutionContext* context, std::string& data, } Result> ExtractHolder::Make(const FunctionNode& node) { - ARROW_RETURN_IF(node.children().size() != 3, - Status::Invalid("'extract' function requires three parameters")); + ARROW_RETURN_IF(node.children().size() != 2 && node.children().size() != 3, + Status::Invalid("'extract' function requires two or three parameters")); auto literal = dynamic_cast(node.children().at(1).get()); ARROW_RETURN_IF( diff --git a/cpp/src/gandiva/regex_functions_holder_test.cc b/cpp/src/gandiva/regex_functions_holder_test.cc index 4d7b0fd3192d..4875b0eeeed0 100644 --- a/cpp/src/gandiva/regex_functions_holder_test.cc +++ b/cpp/src/gandiva/regex_functions_holder_test.cc @@ -604,24 +604,47 @@ TEST_F(TestExtractHolder, TestExtractInvalidPattern) { execution_context_.Reset(); } -TEST_F(TestExtractHolder, TestErrorWhileBuildingHolder) { - // Create function with incorrect number of params +TEST_F(TestExtractHolder, TestDefaultIndexExtract) { + // 2-arg form defaults to index 1 (first capture group) auto field = std::make_shared(arrow::field("in", arrow::utf8())); auto pattern_node = std::make_shared( arrow::utf8(), LiteralHolder(R"((\w+) (\w+))"), false); auto function_node = FunctionNode("regexp_extract", {field, pattern_node}, arrow::utf8()); + EXPECT_OK_AND_ASSIGN(auto extract_holder, ExtractHolder::Make(function_node)); + + std::string input_string = "John Doe"; + int32_t out_length = 0; + + auto& extract = *extract_holder; + const char* ret = + extract(&execution_context_, input_string.c_str(), + static_cast(input_string.length()), 1, &out_length); + EXPECT_EQ(std::string(ret, out_length), "John"); + + input_string = "Ringo Beast"; + ret = extract(&execution_context_, input_string.c_str(), + static_cast(input_string.length()), 1, &out_length); + EXPECT_EQ(std::string(ret, out_length), "Ringo"); +} + +TEST_F(TestExtractHolder, TestErrorWhileBuildingHolder) { + // Create function with incorrect number of params (one arg) + auto field = std::make_shared(arrow::field("in", arrow::utf8())); + NodeVector one_arg = {field}; + auto function_node = FunctionNode("regexp_extract", one_arg, arrow::utf8()); + auto extract_holder = ExtractHolder::Make(function_node); EXPECT_RAISES_WITH_MESSAGE_THAT( - Invalid, ::testing::HasSubstr("'extract' function requires three parameters"), + Invalid, ::testing::HasSubstr("'extract' function requires two or three parameters"), extract_holder.status()); execution_context_.Reset(); // Create function with non-utf8 literal parameter as pattern field = std::make_shared(arrow::field("in", arrow::utf8())); - pattern_node = std::make_shared(arrow::int32(), LiteralHolder(2), false); + auto pattern_node = std::make_shared(arrow::int32(), LiteralHolder(2), false); auto index_node = std::make_shared(arrow::field("idx", arrow::int32())); function_node = FunctionNode("regexp_extract", {field, pattern_node, index_node}, arrow::utf8()); From f09a2c3fc71e5801d93ecf94e14e221863ab1c94 Mon Sep 17 00:00:00 2001 From: "logan.riggs@gmail.com" Date: Thu, 14 May 2026 23:07:21 +0000 Subject: [PATCH 2/4] lint --- cpp/src/gandiva/function_registry_string.cc | 4 ++-- cpp/src/gandiva/gdv_string_function_stubs.cc | 8 ++++---- cpp/src/gandiva/regex_functions_holder_test.cc | 11 ++++++----- 3 files changed, 12 insertions(+), 11 deletions(-) diff --git a/cpp/src/gandiva/function_registry_string.cc b/cpp/src/gandiva/function_registry_string.cc index 507c08b7cd7f..4f063d8f4726 100644 --- a/cpp/src/gandiva/function_registry_string.cc +++ b/cpp/src/gandiva/function_registry_string.cc @@ -257,8 +257,8 @@ std::vector GetStringFunctionRegistry() { NativeFunction::kNeedsFunctionHolder | NativeFunction::kCanReturnErrors), - NativeFunction("regexp_extract", {}, DataTypeVector{utf8(), utf8()}, - utf8(), kResultNullIfNull, "gdv_fn_regexp_extract_utf8_utf8", + NativeFunction("regexp_extract", {}, DataTypeVector{utf8(), utf8()}, utf8(), + kResultNullIfNull, "gdv_fn_regexp_extract_utf8_utf8", NativeFunction::kNeedsContext | NativeFunction::kNeedsFunctionHolder | NativeFunction::kCanReturnErrors), diff --git a/cpp/src/gandiva/gdv_string_function_stubs.cc b/cpp/src/gandiva/gdv_string_function_stubs.cc index 80919de902e5..7cfbecf7735a 100644 --- a/cpp/src/gandiva/gdv_string_function_stubs.cc +++ b/cpp/src/gandiva/gdv_string_function_stubs.cc @@ -71,10 +71,10 @@ const char* gdv_fn_regexp_replace_utf8_utf8( } const char* gdv_fn_regexp_extract_utf8_utf8(int64_t ptr, int64_t holder_ptr, - const char* data, int32_t data_len, - const char* /*pattern*/, - int32_t /*pattern_len*/, - int32_t* out_length) { + const char* data, int32_t data_len, + const char* /*pattern*/, + int32_t /*pattern_len*/, + int32_t* out_length) { gandiva::ExecutionContext* context = reinterpret_cast(ptr); gandiva::ExtractHolder* holder = reinterpret_cast(holder_ptr); return (*holder)(context, data, data_len, 1, out_length); diff --git a/cpp/src/gandiva/regex_functions_holder_test.cc b/cpp/src/gandiva/regex_functions_holder_test.cc index 4875b0eeeed0..e591757d9444 100644 --- a/cpp/src/gandiva/regex_functions_holder_test.cc +++ b/cpp/src/gandiva/regex_functions_holder_test.cc @@ -618,9 +618,8 @@ TEST_F(TestExtractHolder, TestDefaultIndexExtract) { int32_t out_length = 0; auto& extract = *extract_holder; - const char* ret = - extract(&execution_context_, input_string.c_str(), - static_cast(input_string.length()), 1, &out_length); + const char* ret = extract(&execution_context_, input_string.c_str(), + static_cast(input_string.length()), 1, &out_length); EXPECT_EQ(std::string(ret, out_length), "John"); input_string = "Ringo Beast"; @@ -637,14 +636,16 @@ TEST_F(TestExtractHolder, TestErrorWhileBuildingHolder) { auto extract_holder = ExtractHolder::Make(function_node); EXPECT_RAISES_WITH_MESSAGE_THAT( - Invalid, ::testing::HasSubstr("'extract' function requires two or three parameters"), + Invalid, + ::testing::HasSubstr("'extract' function requires two or three parameters"), extract_holder.status()); execution_context_.Reset(); // Create function with non-utf8 literal parameter as pattern field = std::make_shared(arrow::field("in", arrow::utf8())); - auto pattern_node = std::make_shared(arrow::int32(), LiteralHolder(2), false); + auto pattern_node = + std::make_shared(arrow::int32(), LiteralHolder(2), false); auto index_node = std::make_shared(arrow::field("idx", arrow::int32())); function_node = FunctionNode("regexp_extract", {field, pattern_node, index_node}, arrow::utf8()); From 149fb3e4e216ec51cf3ac78a675514a4a4f7e499 Mon Sep 17 00:00:00 2001 From: "logan.riggs@gmail.com" Date: Thu, 14 May 2026 23:27:35 +0000 Subject: [PATCH 3/4] expanded testing --- .../gandiva/regex_functions_holder_test.cc | 57 +++++++++++++++++++ 1 file changed, 57 insertions(+) diff --git a/cpp/src/gandiva/regex_functions_holder_test.cc b/cpp/src/gandiva/regex_functions_holder_test.cc index e591757d9444..ee8ea16b2499 100644 --- a/cpp/src/gandiva/regex_functions_holder_test.cc +++ b/cpp/src/gandiva/regex_functions_holder_test.cc @@ -678,3 +678,60 @@ TEST_F(TestExtractHolder, TestErrorWhileBuildingHolder) { } } // namespace gandiva + +extern "C" const char* gdv_fn_regexp_extract_utf8_utf8(int64_t ptr, int64_t holder_ptr, + const char* data, int32_t data_len, + const char* pattern, + int32_t pattern_len, + int32_t* out_length); + +TEST(TestRegexpExtractStub, TestDefaultIndexStub) { + gandiva::ExecutionContext ctx; + auto ctx_ptr = reinterpret_cast(&ctx); + + EXPECT_OK_AND_ASSIGN(auto holder, gandiva::ExtractHolder::Make(R"((\w+) (\w+))")); + auto holder_ptr = reinterpret_cast(holder.get()); + + std::string pattern = R"((\w+) (\w+))"; + int32_t out_length = 0; + + std::string input = "John Doe"; + const char* ret = gdv_fn_regexp_extract_utf8_utf8( + ctx_ptr, holder_ptr, input.c_str(), static_cast(input.size()), + pattern.c_str(), static_cast(pattern.size()), &out_length); + EXPECT_EQ(std::string(ret, out_length), "John"); + + input = "Ringo Beast"; + ret = gdv_fn_regexp_extract_utf8_utf8( + ctx_ptr, holder_ptr, input.c_str(), static_cast(input.size()), + pattern.c_str(), static_cast(pattern.size()), &out_length); + EXPECT_EQ(std::string(ret, out_length), "Ringo"); + + // no match returns empty string + input = "--- ---"; + ret = gdv_fn_regexp_extract_utf8_utf8( + ctx_ptr, holder_ptr, input.c_str(), static_cast(input.size()), + pattern.c_str(), static_cast(pattern.size()), &out_length); + EXPECT_EQ(out_length, 0); +} + +extern "C" const char* gdv_fn_regexp_extract_utf8_utf8_int32( + int64_t ptr, int64_t holder_ptr, const char* data, int32_t data_len, + const char* pattern, int32_t pattern_len, int32_t extract_index, int32_t* out_length); + +TEST(TestRegexpExtractStub, TestIndexStub) { + gandiva::ExecutionContext ctx; + auto ctx_ptr = reinterpret_cast(&ctx); + + EXPECT_OK_AND_ASSIGN(auto holder, gandiva::ExtractHolder::Make(R"((\w+) (\w+))")); + auto holder_ptr = reinterpret_cast(holder.get()); + + std::string pattern = R"((\w+) (\w+))"; + int32_t out_length = 0; + + std::string input = "John Doe"; + const char* ret = gdv_fn_regexp_extract_utf8_utf8_int32( + ctx_ptr, holder_ptr, input.c_str(), static_cast(input.size()), + pattern.c_str(), static_cast(pattern.size()), 2, &out_length); + EXPECT_EQ(std::string(ret, out_length), "Doe"); +} From 1ac300d12694ad95796911d8b4ceb677a9ec68d8 Mon Sep 17 00:00:00 2001 From: "logan.riggs@gmail.com" Date: Fri, 15 May 2026 00:16:04 +0000 Subject: [PATCH 4/4] More tests --- .../gandiva/regex_functions_holder_test.cc | 52 +++++++++++++++++++ 1 file changed, 52 insertions(+) diff --git a/cpp/src/gandiva/regex_functions_holder_test.cc b/cpp/src/gandiva/regex_functions_holder_test.cc index ee8ea16b2499..a78206bb8479 100644 --- a/cpp/src/gandiva/regex_functions_holder_test.cc +++ b/cpp/src/gandiva/regex_functions_holder_test.cc @@ -604,6 +604,58 @@ TEST_F(TestExtractHolder, TestExtractInvalidPattern) { execution_context_.Reset(); } +TEST_F(TestExtractHolder, TestEmptyInput) { + EXPECT_OK_AND_ASSIGN(auto extract_holder, ExtractHolder::Make(R"((\w+))")); + auto& extract = *extract_holder; + int32_t out_length = 0; + + const char* ret = extract(&execution_context_, "", 0, 0, &out_length); + EXPECT_EQ(std::string(ret, out_length), ""); + EXPECT_FALSE(execution_context_.has_error()); +} + +TEST_F(TestExtractHolder, TestOptionalGroup) { + // (a)?(b): group 1 is optional; when input is "b" it doesn't participate + EXPECT_OK_AND_ASSIGN(auto extract_holder, ExtractHolder::Make(R"((a)?(b))")); + auto& extract = *extract_holder; + int32_t out_length = 0; + + std::string input = "b"; + const char* ret = extract(&execution_context_, input.c_str(), + static_cast(input.size()), 1, &out_length); + EXPECT_EQ(std::string(ret, out_length), ""); + EXPECT_FALSE(execution_context_.has_error()); + + ret = extract(&execution_context_, input.c_str(), static_cast(input.size()), 2, + &out_length); + EXPECT_EQ(std::string(ret, out_length), "b"); + + input = "ab"; + ret = extract(&execution_context_, input.c_str(), static_cast(input.size()), 1, + &out_length); + EXPECT_EQ(std::string(ret, out_length), "a"); +} + +TEST_F(TestExtractHolder, TestNoUserGroups) { + // Pattern with no user capturing groups — only the outer wrapper group exists. + // Index 0 returns the full match; index 1 is out of range. + EXPECT_OK_AND_ASSIGN(auto extract_holder, ExtractHolder::Make(R"(\d+)")); + auto& extract = *extract_holder; + int32_t out_length = 0; + + std::string input = "abc123def"; + const char* ret = extract(&execution_context_, input.c_str(), + static_cast(input.size()), 0, &out_length); + EXPECT_EQ(std::string(ret, out_length), "123"); + EXPECT_FALSE(execution_context_.has_error()); + + ret = extract(&execution_context_, input.c_str(), static_cast(input.size()), 1, + &out_length); + EXPECT_EQ(out_length, 0); + EXPECT_TRUE(execution_context_.has_error()); + execution_context_.Reset(); +} + TEST_F(TestExtractHolder, TestDefaultIndexExtract) { // 2-arg form defaults to index 1 (first capture group) auto field = std::make_shared(arrow::field("in", arrow::utf8()));