From 2c2a1efd47cecaeccf8e3283e63a28296bcaecd9 Mon Sep 17 00:00:00 2001 From: phoneee Date: Sun, 29 Mar 2026 18:09:29 +0700 Subject: [PATCH 1/3] fix: catch TypeError instead of ValueError in sent_tokenize --- pythainlp/tokenize/core.py | 2 +- tests/core/test_tokenize.py | 10 ++++++++++ 2 files changed, 11 insertions(+), 1 deletion(-) diff --git a/pythainlp/tokenize/core.py b/pythainlp/tokenize/core.py index ff133c68b..64d716c0a 100644 --- a/pythainlp/tokenize/core.py +++ b/pythainlp/tokenize/core.py @@ -488,7 +488,7 @@ def sent_tokenize( if isinstance(text, list): try: original_text = "".join(text) - except ValueError: + except TypeError: return [] else: original_text = str(text) diff --git a/tests/core/test_tokenize.py b/tests/core/test_tokenize.py index 0b5befd3a..af49a1e3a 100644 --- a/tests/core/test_tokenize.py +++ b/tests/core/test_tokenize.py @@ -333,6 +333,16 @@ def test_sent_tokenize(self): ) with self.assertRaises(ValueError): sent_tokenize("ฉันไป กิน", engine="XX") # engine does not exist + # Reproduce: list with non-string items should return [] + # instead of raising TypeError (str.join raises TypeError, not ValueError) + self.assertEqual( + sent_tokenize(["สวัสดี", 123], engine="whitespace+newline"), + [], + ) + self.assertEqual( + sent_tokenize(["สวัสดี", None], engine="whitespace+newline"), + [], + ) def test_subword_tokenize(self): self.assertEqual(subword_tokenize(None), []) # type: ignore[arg-type] From e45846558532281484e174b96cd95b320267be0b Mon Sep 17 00:00:00 2001 From: Arthit Suriyawongkul Date: Fri, 1 May 2026 07:24:47 +0100 Subject: [PATCH 2/3] Update tests/core/test_tokenize.py --- tests/core/test_tokenize.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/core/test_tokenize.py b/tests/core/test_tokenize.py index af49a1e3a..86c9ec133 100644 --- a/tests/core/test_tokenize.py +++ b/tests/core/test_tokenize.py @@ -336,7 +336,7 @@ def test_sent_tokenize(self): # Reproduce: list with non-string items should return [] # instead of raising TypeError (str.join raises TypeError, not ValueError) self.assertEqual( - sent_tokenize(["สวัสดี", 123], engine="whitespace+newline"), + sent_tokenize(["สวัสดี", 123], engine="whitespace+newline"), # type: ignore [], ) self.assertEqual( From 628bb2872d4b4c2beb7bade1d123bcb87b739f0b Mon Sep 17 00:00:00 2001 From: Arthit Suriyawongkul Date: Fri, 1 May 2026 07:24:56 +0100 Subject: [PATCH 3/3] Update tests/core/test_tokenize.py --- tests/core/test_tokenize.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/core/test_tokenize.py b/tests/core/test_tokenize.py index 86c9ec133..4f1884f0d 100644 --- a/tests/core/test_tokenize.py +++ b/tests/core/test_tokenize.py @@ -340,7 +340,7 @@ def test_sent_tokenize(self): [], ) self.assertEqual( - sent_tokenize(["สวัสดี", None], engine="whitespace+newline"), + sent_tokenize(["สวัสดี", None], engine="whitespace+newline"), # type: ignore [], )