From 152bcc5def607dc8b2fe38f52d5f66fe8bcf0e47 Mon Sep 17 00:00:00 2001 From: phoneee Date: Sun, 29 Mar 2026 16:40:25 +0700 Subject: [PATCH 1/5] fix: guard nighit, check_sara, check_marttra against empty input nighit() crashed with IndexError when w2 had no consonants. check_sara("") and check_marttra("") crashed accessing word[-1]. Add input validation with clear error messages. --- pythainlp/khavee/core.py | 6 ++++++ pythainlp/morpheme/word_formation.py | 7 ++++++- 2 files changed, 12 insertions(+), 1 deletion(-) diff --git a/pythainlp/khavee/core.py b/pythainlp/khavee/core.py index f940e8922..dc426b322 100644 --- a/pythainlp/khavee/core.py +++ b/pythainlp/khavee/core.py @@ -54,6 +54,9 @@ def check_sara(self, word: str) -> str: sara = [] countoa = 0 + if not word: + return "" + # In case of การันย์ if "์" in word[-1]: word = word[:-2] @@ -253,6 +256,9 @@ def check_marttra(self, word: str) -> str: word = self.handle_karun_sound_silence(word) word = remove_tonemark(word) + if not word: + return "" + # Check for ำ at the end (represents "am" sound, ends with m) if word[-1] == "ำ": return "กม" diff --git a/pythainlp/morpheme/word_formation.py b/pythainlp/morpheme/word_formation.py index 63453a25e..567a3e2c0 100644 --- a/pythainlp/morpheme/word_formation.py +++ b/pythainlp/morpheme/word_formation.py @@ -38,7 +38,12 @@ def nighit(w1: str, w2: str) -> str: newword = [] newword.append(list_w1[0]) newword.append("ั") - consonant_start = [i for i in list_w2 if i in set(thai_consonants)][0] + consonants_in_w2 = [i for i in list_w2 if i in set(thai_consonants)] + if not consonants_in_w2: + raise ValueError( + f"w2 '{w2}' contains no Thai consonants." + ) + consonant_start = consonants_in_w2[0] if consonant_start in ["ก", "ช", "ค", "ข", "ง"]: newword.append("ง") elif consonant_start in ["จ", "ฉ", "ช", "ฌ"]: From e8ba54a3e0363d29023d2f1f7a79c37b89ac2085 Mon Sep 17 00:00:00 2001 From: Arthit Suriyawongkul Date: Sun, 29 Mar 2026 22:08:23 +0100 Subject: [PATCH 2/5] Update pythainlp/morpheme/word_formation.py Co-authored-by: Copilot <175728472+Copilot@users.noreply.github.com> --- pythainlp/morpheme/word_formation.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pythainlp/morpheme/word_formation.py b/pythainlp/morpheme/word_formation.py index 567a3e2c0..45b161fe2 100644 --- a/pythainlp/morpheme/word_formation.py +++ b/pythainlp/morpheme/word_formation.py @@ -41,7 +41,7 @@ def nighit(w1: str, w2: str) -> str: consonants_in_w2 = [i for i in list_w2 if i in set(thai_consonants)] if not consonants_in_w2: raise ValueError( - f"w2 '{w2}' contains no Thai consonants." + f"w2 {w2!r} contains no Thai consonants." ) consonant_start = consonants_in_w2[0] if consonant_start in ["ก", "ช", "ค", "ข", "ง"]: From 8c05e8d55f5fb700b8b68b32287c736065f4e250 Mon Sep 17 00:00:00 2001 From: Arthit Suriyawongkul Date: Fri, 1 May 2026 08:01:36 +0100 Subject: [PATCH 3/5] Apply suggestion from @Copilot Co-authored-by: Copilot <175728472+Copilot@users.noreply.github.com> --- pythainlp/khavee/core.py | 3 +++ 1 file changed, 3 insertions(+) diff --git a/pythainlp/khavee/core.py b/pythainlp/khavee/core.py index dc426b322..7e2f78f80 100644 --- a/pythainlp/khavee/core.py +++ b/pythainlp/khavee/core.py @@ -60,6 +60,9 @@ def check_sara(self, word: str) -> str: # In case of การันย์ if "์" in word[-1]: word = word[:-2] + # After removing the karun, the word may become empty (e.g. "ก์") + if not word: + return "" # In case of สระเดี่ยว for i in word: From 087fafd990e4c8be65b097c072ba4e68efc21083 Mon Sep 17 00:00:00 2001 From: Arthit Suriyawongkul Date: Fri, 1 May 2026 08:09:47 +0100 Subject: [PATCH 4/5] Optimize consonant filtering in word formation --- pythainlp/morpheme/word_formation.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/pythainlp/morpheme/word_formation.py b/pythainlp/morpheme/word_formation.py index 45b161fe2..76ca0f096 100644 --- a/pythainlp/morpheme/word_formation.py +++ b/pythainlp/morpheme/word_formation.py @@ -38,7 +38,8 @@ def nighit(w1: str, w2: str) -> str: newword = [] newword.append(list_w1[0]) newword.append("ั") - consonants_in_w2 = [i for i in list_w2 if i in set(thai_consonants)] + _consonants = set(thai_consonants) + consonants_in_w2 = [i for i in list_w2 if i in _consonants] if not consonants_in_w2: raise ValueError( f"w2 {w2!r} contains no Thai consonants." From be43a48153127f428eee5bc5f598873d0016e283 Mon Sep 17 00:00:00 2001 From: Arthit Suriyawongkul Date: Fri, 1 May 2026 08:32:12 +0100 Subject: [PATCH 5/5] Implement input validation for nighit function Added type checks and string handling for inputs. --- pythainlp/morpheme/word_formation.py | 8 ++++++++ 1 file changed, 8 insertions(+) diff --git a/pythainlp/morpheme/word_formation.py b/pythainlp/morpheme/word_formation.py index 76ca0f096..45817dc31 100644 --- a/pythainlp/morpheme/word_formation.py +++ b/pythainlp/morpheme/word_formation.py @@ -31,6 +31,14 @@ def nighit(w1: str, w2: str) -> str: assert nighit("สํ","ปทา")=="สัมปทา" assert nighit("สํ","โยค")=="สังโยค" """ + if not isinstance(w1, str) or not isinstance(w2, str): + raise TypeError("Both w1 and w2 must be strings.") + w1 = w1.strip() + w2 = w2.strip() + if not w1: + return w2 + if not w2: + return w1 if not str(w1).endswith("ํ") and len(w1) != 2: raise NotImplementedError(f"The function doesn't support {w1}.") list_w1 = list(w1)