diff --git a/CHANGELOG.md b/CHANGELOG.md index 330d57cde..978055405 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -17,6 +17,12 @@ and this project adheres to - Full release notes: - Commit history: +## [Unreleased] + +### Changed + +- Improve guardrails in `check_sara()` and `nighit()` + ## [5.3.4] - 2026-04-02 ### Fixed diff --git a/pythainlp/khavee/core.py b/pythainlp/khavee/core.py index 71a0dde2d..79c9673b6 100644 --- a/pythainlp/khavee/core.py +++ b/pythainlp/khavee/core.py @@ -217,7 +217,7 @@ def check_sara(self, word: str) -> str: sara.append("เอือ") if not sara: - return "Can't find Sara in this word" + return "" return sara[0] diff --git a/tests/core/test_khavee.py b/tests/core/test_khavee.py index 0a89559e6..b49ea6c2c 100644 --- a/tests/core/test_khavee.py +++ b/tests/core/test_khavee.py @@ -258,3 +258,14 @@ def test_เอือ_sara(self): def test_returns_string(self): self.assertIsInstance(self.kv.check_sara("เริง"), str) + + def test_empty_string_returns_empty(self): + self.assertEqual(self.kv.check_sara(""), "") + + def test_empty_string_after_removing_karun_returns_empty(self): + self.assertEqual(self.kv.check_sara("ก์"), "") + + def test_empty_string_after_removing_tone_marks_returns_empty(self): + self.assertEqual( + self.kv.check_sara("\u0e48"), "" + ) # The string contains only Thai Mai Ek tone mark diff --git a/tests/core/test_morpheme.py b/tests/core/test_morpheme.py index 9e81b2b28..6a20b728d 100644 --- a/tests/core/test_morpheme.py +++ b/tests/core/test_morpheme.py @@ -10,15 +10,26 @@ class MorphemeTestCase(unittest.TestCase): def test_nighit(self): self.assertEqual(nighit("สํ", "คีต"), "สังคีต") + self.assertEqual( + nighit("สํ", "คีต "), "สังคีต" + ) # w2 has trailing space, should still work + self.assertEqual( + nighit("สํ ", "คีต"), "สังคีต" + ) # w1 has trailing space, should still work self.assertEqual(nighit("สํ", "จร"), "สัญจร") self.assertEqual(nighit("สํ", "ฐาน"), "สัณฐาน") self.assertEqual(nighit("สํ", "นิษฐาน"), "สันนิษฐาน") 
self.assertEqual(nighit("สํ", "ปทา"), "สัมปทา") self.assertEqual(nighit("สํ", "โยค"), "สังโยค") + self.assertEqual(nighit("", "คีต"), "คีต") # w1 is empty, should return w2 + self.assertEqual(nighit("สํ", ""), "สํ") # w2 is empty, should return w1 + with self.assertRaises(NotImplementedError): nighit("abc", "คีต") # w1 does not end with ํ and len > 2 with self.assertRaises(NotImplementedError): nighit("สํ", "มาร") # consonant ม is not in any supported group + with self.assertRaises(ValueError): + nighit("สํ", "123") # w2 does not contain any Thai consonant def test_is_native_thai(self): self.assertFalse(is_native_thai(None)) # type: ignore[arg-type]