From 0434ac90984424a869814e5a9dc1fe1774f7171c Mon Sep 17 00:00:00 2001 From: lovit Date: Wed, 11 Mar 2026 02:05:16 +0900 Subject: [PATCH] =?UTF-8?q?fix(tokenizer):=20=EC=88=AB=EC=9E=90=20?= =?UTF-8?q?=ED=8C=A8=ED=84=B4=EC=9D=B4=20=EB=B2=84=EC=A0=84=20=EB=B2=88?= =?UTF-8?q?=ED=98=B8(3.1.1)=EB=A5=BC=20=EC=9E=98=EB=AA=BB=20=EB=B6=84?= =?UTF-8?q?=EB=A6=AC=ED=95=98=EB=8A=94=20=EB=B2=84=EA=B7=B8=20=EC=88=98?= =?UTF-8?q?=EC=A0=95?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit [-+]?\d*[\.]?[\d]+|[-+]?\d+ → [-+]?\d+(?:\.\d+)* 소수점이 연속으로 등장하는 버전 번호 형식(3.1.1, 1.2.3.4)을 하나의 토큰으로 처리. 단위 테스트 추가. Closes #279, closes #7 Co-Authored-By: Claude Sonnet 4.6 --- soynlp/tokenizer/tokenizer.py | 2 +- tests/unit/test_tokenizers.py | 9 +++++++++ 2 files changed, 10 insertions(+), 1 deletion(-) diff --git a/soynlp/tokenizer/tokenizer.py b/soynlp/tokenizer/tokenizer.py index d24e42b..f48deb9 100644 --- a/soynlp/tokenizer/tokenizer.py +++ b/soynlp/tokenizer/tokenizer.py @@ -66,7 +66,7 @@ def __init__(self, pipelines=None): def _default_pipelines(self): return [ - re.compile(r"[-+]?\d*[\.]?[\d]+|[-+]?\d+", re.UNICODE), # number + re.compile(r"[-+]?\d+(?:\.\d+)*", re.UNICODE), # number (int, decimal, version: 3.1.1) re.compile(r"[가-힣]+", re.UNICODE), # Korean re.compile(r"[ㄱ-ㅎ]+", re.UNICODE), # jaum re.compile(r"[ㅏ-ㅣ]+", re.UNICODE), # moum diff --git a/tests/unit/test_tokenizers.py b/tests/unit/test_tokenizers.py index d809b2f..fefe765 100644 --- a/tests/unit/test_tokenizers.py +++ b/tests/unit/test_tokenizers.py @@ -10,6 +10,15 @@ def test_regex_tokenizer_empty_string(): assert tokenizer.tokenize(" ") == [] +def test_regex_tokenizer_version_number(): + tokenizer = RegexTokenizer() + # 버전 번호(3.1.1)가 하나의 토큰으로 분리되어야 한다 (issue #7) + assert tokenizer.tokenize("다음에는3.1.1장입니다") == ["다음에는", "3.1.1", "장입니다"] + assert tokenizer.tokenize("v1.2.3.4릴리스") == ["v", "1.2.3.4", "릴리스"] + assert tokenizer.tokenize("3.14") == ["3.14"] + assert tokenizer.tokenize("42") == 
["42"] + + def test_regex_tokenizer(): sentence = "abc123가나다 alphabet!!3.14한글 hank`s report" expected_words = ["abc", "123", "가나다", "alphabet", "!!", "3.14", "한글", "hank`s", "report"]