From 2d8120ae51af87f1048660b3bf32466710c6ed59 Mon Sep 17 00:00:00 2001
From: "lisizhuo.lsz"
Date: Thu, 12 Feb 2026 02:43:57 +0000
Subject: [PATCH] feat: fix IsNullOrWhitespaceOnly utility in JiebaTokenizer

---
 src/paimon/global_index/lucene/jieba_analyzer.cpp | 8 ++------
 src/paimon/global_index/lucene/jieba_analyzer.h   | 2 --
 2 files changed, 2 insertions(+), 8 deletions(-)

diff --git a/src/paimon/global_index/lucene/jieba_analyzer.cpp b/src/paimon/global_index/lucene/jieba_analyzer.cpp
index 3c6485f8..55853692 100644
--- a/src/paimon/global_index/lucene/jieba_analyzer.cpp
+++ b/src/paimon/global_index/lucene/jieba_analyzer.cpp
@@ -15,6 +15,7 @@
  */
 
 #include "paimon/global_index/lucene/jieba_analyzer.h"
+#include "paimon/common/utils/string_utils.h"
 #include "paimon/global_index/lucene/lucene_utils.h"
 
 namespace paimon::lucene {
@@ -82,11 +83,6 @@ void JiebaTokenizer::CutWithMode(const std::string& tokenize_mode, const cppjieb
     }
 }
 
-bool JiebaTokenizer::IsWhitespaceOnly(const std::string& term) {
-    return term.empty() ||
-           std::all_of(term.begin(), term.end(), [](unsigned char c) { return std::isspace(c); });
-}
-
 void JiebaTokenizer::Normalize(const std::unordered_set& stop_words,
                                std::vector* input_ptr,
                                std::vector* output_ptr) {
@@ -95,7 +91,7 @@ void JiebaTokenizer::Normalize(const std::unordered_set& stop_words
     output.clear();
     output.reserve(input.size());
     for (auto& term : input) {
-        if (IsWhitespaceOnly(term)) {
+        if (StringUtils::IsNullOrWhitespaceOnly(term)) {
             continue;
         }
         // remove stop words
diff --git a/src/paimon/global_index/lucene/jieba_analyzer.h b/src/paimon/global_index/lucene/jieba_analyzer.h
index 31a174bb..5f6da9a4 100644
--- a/src/paimon/global_index/lucene/jieba_analyzer.h
+++ b/src/paimon/global_index/lucene/jieba_analyzer.h
@@ -60,8 +60,6 @@ class JiebaTokenizer : public Lucene::Tokenizer {
  private:
     void InnerReset();
 
-    static bool IsWhitespaceOnly(const std::string& term);
-
  private:
     JiebaTokenizerContext context_;
     size_t term_index_ = 0;