From 07d5c2d4236447f8670c4731d6b09c8a2e678855 Mon Sep 17 00:00:00 2001 From: lovit Date: Wed, 11 Mar 2026 02:09:55 +0900 Subject: [PATCH] =?UTF-8?q?refactor(tokenizer):=20=5Ftokenize=20char-by-ch?= =?UTF-8?q?ar=20=EB=A3=A8=ED=94=84=EB=A5=BC=20re.sub=EC=9C=BC=EB=A1=9C=20?= =?UTF-8?q?=EA=B5=90=EC=B2=B4?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit - 파이프라인마다 findall + char-by-char 순회 → pattern.sub(lambda m: f" {m.group()} ", s) - 코드 라인 수 절반 이하로 감소 (34줄 → 13줄) - % 포매팅 제거 (f-string으로 자연 대체, #282 해소) - 빈 문자열 가드 추가 (if not words: return []) - 타입 어노테이션 추가 Co-Authored-By: Claude Sonnet 4.6 --- soynlp/tokenizer/tokenizer.py | 28 +++------------------------- 1 file changed, 3 insertions(+), 25 deletions(-) diff --git a/soynlp/tokenizer/tokenizer.py b/soynlp/tokenizer/tokenizer.py index f48deb9..44b969b 100644 --- a/soynlp/tokenizer/tokenizer.py +++ b/soynlp/tokenizer/tokenizer.py @@ -117,36 +117,14 @@ def tokenize(self, sentence, return_words=True): tokens = [token.word for token in tokens] return tokens - def _tokenize(self, s, offset=0, eojeol_id=0): + def _tokenize(self, s: str, offset: int = 0, eojeol_id: int = 0) -> list[Token]: for pattern in self.pipelines: - founds = pattern.findall(s) - if not founds: - continue - found = founds.pop(0) - len_found = len(found) - - s_ = "" - begin = 0 - for i, char in enumerate(s): - if begin > i: - continue - if s[i : i + len_found] == found: - s_ += f" {s[i : i + len_found]} " - begin = i + len_found - if not founds: - s_ += s[begin:] - break - else: - found = founds.pop(0) - len_found = len(found) - continue - s_ += char - s = s_ + s = pattern.sub(lambda m: f" {m.group()} ", s) words = self.doublewhite_pattern.sub(" ", s).strip().split() if not words: return [] r = len(words[0]) - tokens = [Token(words[0], 0 + offset, r + offset, 1, r, eojeol_id)] + tokens = [Token(words[0], offset, offset + r, 1, r, eojeol_id)] begin = tokens[0].end for word in words[1:]: r = len(word)