Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 1 addition & 1 deletion soynlp/tokenizer/tokenizer.py
Original file line number Diff line number Diff line change
Expand Up @@ -66,7 +66,7 @@ def __init__(self, pipelines=None):

def _default_pipelines(self):
return [
re.compile(r"[-+]?\d*[\.]?[\d]+|[-+]?\d+", re.UNICODE), # number
re.compile(r"[-+]?\d+(?:\.\d+)*", re.UNICODE), # number (int, decimal, version: 3.1.1)
re.compile(r"[가-힣]+", re.UNICODE), # Korean
re.compile(r"[ㄱ-ㅎ]+", re.UNICODE), # jaum
re.compile(r"[ㅏ-ㅣ]+", re.UNICODE), # moum
Expand Down
9 changes: 9 additions & 0 deletions tests/unit/test_tokenizers.py
Original file line number Diff line number Diff line change
Expand Up @@ -10,6 +10,15 @@ def test_regex_tokenizer_empty_string():
assert tokenizer.tokenize(" ") == []


def test_regex_tokenizer_version_number():
    """Check that dotted numbers are emitted as single tokens.

    Regression test for issue #7: a version number such as ``3.1.1`` must
    come out as one token. A plain decimal and a plain integer are checked
    alongside it.
    """
    tokenizer = RegexTokenizer()
    # (input sentence, expected token list) pairs, checked in order.
    cases = (
        ("다음에는3.1.1장입니다", ["다음에는", "3.1.1", "장입니다"]),
        ("v1.2.3.4릴리스", ["v", "1.2.3.4", "릴리스"]),
        ("3.14", ["3.14"]),
        ("42", ["42"]),
    )
    for sentence, expected_tokens in cases:
        assert tokenizer.tokenize(sentence) == expected_tokens


def test_regex_tokenizer():
sentence = "abc123가나다 alphabet!!3.14한글 hank`s report"
expected_words = ["abc", "123", "가나다", "alphabet", "!!", "3.14", "한글", "hank`s", "report"]
Expand Down
Loading