From 61bcc8a5b6cf2b666499c7574307ba842a8a7a21 Mon Sep 17 00:00:00 2001
From: "copilot-swe-agent[bot]" <198982749+Copilot@users.noreply.github.com>
Date: Tue, 7 Apr 2026 08:39:00 +0000
Subject: [PATCH 1/3] Initial plan


From 0d15964ff341139628d2eeda5a7d1de5fddf3185 Mon Sep 17 00:00:00 2001
From: "copilot-swe-agent[bot]" <198982749+Copilot@users.noreply.github.com>
Date: Tue, 7 Apr 2026 08:46:32 +0000
Subject: [PATCH 2/3] Add unittest suite and GitHub Actions test workflow

Agent-Logs-Url: https://github.com/PyThaiNLP/LEKCut/sessions/6266cef0-f9f2-420d-abda-2c42afcb0a4a

Co-authored-by: wannaphong <8536487+wannaphong@users.noreply.github.com>
---
 .github/workflows/tests.yml |  44 +++++++++++
 lekcut/attacut.py           |   4 +
 lekcut/deepcut.py           |   2 +
 tests/__init__.py           |   0
 tests/test_lekcut.py        | 153 ++++++++++++++++++++++++++++++++++++
 5 files changed, 203 insertions(+)
 create mode 100644 .github/workflows/tests.yml
 create mode 100644 tests/__init__.py
 create mode 100644 tests/test_lekcut.py

diff --git a/.github/workflows/tests.yml b/.github/workflows/tests.yml
new file mode 100644
index 0000000..259d887
--- /dev/null
+++ b/.github/workflows/tests.yml
@@ -0,0 +1,44 @@
+name: Tests
+
+on:
+  push:
+    branches:
+      - main
+  pull_request:
+    branches:
+      - main
+
+concurrency:
+  group: >-
+    ${{ github.workflow }}-${{
+      github.event.pull_request.head.repo.full_name || github.repository
+    }}-${{ github.head_ref || github.ref_name }}
+  cancel-in-progress: true
+
+jobs:
+  test:
+    name: Run unit tests
+    runs-on: ubuntu-latest
+    strategy:
+      matrix:
+        python-version: ["3.9", "3.10", "3.11", "3.12"]
+
+    steps:
+      - name: Checkout source code
+        uses: actions/checkout@v4
+
+      - name: Set up Python ${{ matrix.python-version }}
+        uses: actions/setup-python@v5
+        with:
+          python-version: ${{ matrix.python-version }}
+
+      - name: Install dependencies
+        run: |
+          pip install --upgrade pip
+          pip install -r requirements.txt
+
+      - name: Install package
+        run: pip install -e .
+
+      - name: Run tests
+        run: python -m unittest discover -s tests -v
diff --git a/lekcut/attacut.py b/lekcut/attacut.py
index 9d22a81..4937cad 100644
--- a/lekcut/attacut.py
+++ b/lekcut/attacut.py
@@ -203,6 +203,8 @@ def _make_feature(self, txt: str):
         return characters, features
 
     def tokenize(self, text: str) -> List[str]:
+        if not text:
+            return []
         tokens, features = self._make_feature(text)
         logits = self.model.run(None, {"input": features})[0]
         preds = (_sigmoid(logits) > 0.5).astype(int)
@@ -232,6 +234,8 @@ def _make_feature(self, txt: str):
         return characters, features
 
     def tokenize(self, text: str) -> List[str]:
+        if not text:
+            return []
         tokens, features = self._make_feature(text)
         logits = self.model.run(None, {"input": features})[0]
         preds = (_sigmoid(logits) > 0.5).astype(int)
diff --git a/lekcut/deepcut.py b/lekcut/deepcut.py
index 1c09243..8f14195 100644
--- a/lekcut/deepcut.py
+++ b/lekcut/deepcut.py
@@ -138,6 +138,8 @@ def load_model(self, path: str, providers: List[str]=None):
             self.model = ort.InferenceSession(self.path, providers=providers)
     
     def tokenize(self, text: str) -> List[str]:
+        if not text:
+            return []
         self.x_char, self.x_type = create_feature_array(text, n_pad=self.n_pad)
         self.x_char = self.x_char.astype(np.float32)
         self.x_type= self.x_type.astype(np.float32)
diff --git a/tests/__init__.py b/tests/__init__.py
new file mode 100644
index 0000000..e69de29
diff --git a/tests/test_lekcut.py b/tests/test_lekcut.py
new file mode 100644
index 0000000..6296e80
--- /dev/null
+++ b/tests/test_lekcut.py
@@ -0,0 +1,153 @@
+# -*- coding: utf-8 -*-
+"""Unit tests for the LEKCut Thai word tokenization library."""
+import unittest
+
+from lekcut import word_tokenize
+
+
+class TestWordTokenizeDeepcut(unittest.TestCase):
+    """Tests for the default deepcut model."""
+
+    def test_basic_tokenization(self):
+        result = word_tokenize("ทดสอบการตัดคำ", model="deepcut")
+        self.assertIsInstance(result, list)
+        self.assertTrue(len(result) > 0)
+        self.assertEqual("".join(result), "ทดสอบการตัดคำ")
+
+    def test_known_output(self):
+        result = word_tokenize("ทดสอบการตัดคำ", model="deepcut")
+        self.assertEqual(result, ["ทดสอบ", "การ", "ตัด", "คำ"])
+
+    def test_empty_string(self):
+        result = word_tokenize("", model="deepcut")
+        self.assertIsInstance(result, list)
+
+    def test_single_word(self):
+        result = word_tokenize("สวัสดี", model="deepcut")
+        self.assertIsInstance(result, list)
+        self.assertEqual("".join(result), "สวัสดี")
+
+    def test_with_spaces(self):
+        result = word_tokenize("สวัสดี ครับ", model="deepcut")
+        self.assertIsInstance(result, list)
+        self.assertEqual("".join(result), "สวัสดี ครับ")
+
+    def test_default_model(self):
+        """word_tokenize defaults to deepcut."""
+        result = word_tokenize("ทดสอบการตัดคำ")
+        self.assertEqual(result, ["ทดสอบ", "การ", "ตัด", "คำ"])
+
+
+class TestWordTokenizeAttacutSC(unittest.TestCase):
+    """Tests for the attacut-sc model."""
+
+    def test_basic_tokenization(self):
+        result = word_tokenize("ทดสอบการตัดคำ", model="attacut-sc")
+        self.assertIsInstance(result, list)
+        self.assertTrue(len(result) > 0)
+        self.assertEqual("".join(result), "ทดสอบการตัดคำ")
+
+    def test_empty_string(self):
+        result = word_tokenize("", model="attacut-sc")
+        self.assertIsInstance(result, list)
+
+    def test_output_joins_to_input(self):
+        text = "ภาษาไทยสวยงาม"
+        result = word_tokenize(text, model="attacut-sc")
+        self.assertEqual("".join(result), text)
+
+
+class TestWordTokenizeAttacutC(unittest.TestCase):
+    """Tests for the attacut-c model."""
+
+    def test_basic_tokenization(self):
+        result = word_tokenize("ทดสอบการตัดคำ", model="attacut-c")
+        self.assertIsInstance(result, list)
+        self.assertTrue(len(result) > 0)
+        self.assertEqual("".join(result), "ทดสอบการตัดคำ")
+
+    def test_empty_string(self):
+        result = word_tokenize("", model="attacut-c")
+        self.assertIsInstance(result, list)
+
+    def test_output_joins_to_input(self):
+        text = "ภาษาไทยสวยงาม"
+        result = word_tokenize(text, model="attacut-c")
+        self.assertEqual("".join(result), text)
+
+
+class TestWordTokenizeOskut(unittest.TestCase):
+    """Tests for the oskut model."""
+
+    def test_basic_tokenization(self):
+        result = word_tokenize("ทดสอบการตัดคำ", model="oskut")
+        self.assertIsInstance(result, list)
+        self.assertTrue(len(result) > 0)
+        self.assertEqual("".join(result), "ทดสอบการตัดคำ")
+
+    def test_empty_string(self):
+        result = word_tokenize("", model="oskut")
+        self.assertIsInstance(result, list)
+        self.assertEqual(result, [])
+
+    def test_output_joins_to_input(self):
+        text = "ภาษาไทยสวยงาม"
+        result = word_tokenize(text, model="oskut")
+        self.assertEqual("".join(result), text)
+
+
+class TestWordTokenizeSefrWs1000(unittest.TestCase):
+    """Tests for the sefr-ws1000 model."""
+
+    def test_basic_tokenization(self):
+        result = word_tokenize("ทดสอบการตัดคำ", model="sefr-ws1000")
+        self.assertIsInstance(result, list)
+        self.assertTrue(len(result) > 0)
+        self.assertEqual("".join(result), "ทดสอบการตัดคำ")
+
+    def test_output_joins_to_input(self):
+        text = "ภาษาไทยสวยงาม"
+        result = word_tokenize(text, model="sefr-ws1000")
+        self.assertEqual("".join(result), text)
+
+
+class TestWordTokenizeSefrTnhc(unittest.TestCase):
+    """Tests for the sefr-tnhc model."""
+
+    def test_basic_tokenization(self):
+        result = word_tokenize("ทดสอบการตัดคำ", model="sefr-tnhc")
+        self.assertIsInstance(result, list)
+        self.assertTrue(len(result) > 0)
+        self.assertEqual("".join(result), "ทดสอบการตัดคำ")
+
+    def test_output_joins_to_input(self):
+        text = "ภาษาไทยสวยงาม"
+        result = word_tokenize(text, model="sefr-tnhc")
+        self.assertEqual("".join(result), text)
+
+
+class TestWordTokenizeSefrBest(unittest.TestCase):
+    """Tests for the sefr-best model."""
+
+    def test_basic_tokenization(self):
+        result = word_tokenize("ทดสอบการตัดคำ", model="sefr-best")
+        self.assertIsInstance(result, list)
+        self.assertTrue(len(result) > 0)
+        self.assertEqual("".join(result), "ทดสอบการตัดคำ")
+
+    def test_output_joins_to_input(self):
+        text = "ภาษาไทยสวยงาม"
+        result = word_tokenize(text, model="sefr-best")
+        self.assertEqual("".join(result), text)
+
+
+class TestWordTokenizeErrorHandling(unittest.TestCase):
+    """Tests for error handling in word_tokenize."""
+
+    def test_unsupported_model_raises(self):
+        with self.assertRaises(NotImplementedError):
+            word_tokenize("ทดสอบ", model="unknown-model")
+
+
+if __name__ == "__main__":
+    unittest.main()

From 620956fd4ae495c2ff4526f3061c06279b96a2c4 Mon Sep 17 00:00:00 2001
From: "copilot-swe-agent[bot]" <198982749+Copilot@users.noreply.github.com>
Date: Tue, 7 Apr 2026 08:51:43 +0000
Subject: [PATCH 3/3] Address code review: strengthen empty-string assertions,
 add missing sefr empty-string tests, add workflow permissions

Agent-Logs-Url: https://github.com/PyThaiNLP/LEKCut/sessions/6266cef0-f9f2-420d-abda-2c42afcb0a4a

Co-authored-by: wannaphong <8536487+wannaphong@users.noreply.github.com>
---
 .github/workflows/tests.yml |  3 +++
 tests/test_lekcut.py        | 18 +++++++++++++++---
 2 files changed, 18 insertions(+), 3 deletions(-)

diff --git a/.github/workflows/tests.yml b/.github/workflows/tests.yml
index 259d887..ed1fe26 100644
--- a/.github/workflows/tests.yml
+++ b/.github/workflows/tests.yml
@@ -8,6 +8,9 @@ on:
     branches:
       - main
 
+permissions:
+  contents: read
+
 concurrency:
   group: >-
     ${{ github.workflow }}-${{
diff --git a/tests/test_lekcut.py b/tests/test_lekcut.py
index 6296e80..34d6f97 100644
--- a/tests/test_lekcut.py
+++ b/tests/test_lekcut.py
@@ -20,7 +20,7 @@ def test_known_output(self):
 
     def test_empty_string(self):
         result = word_tokenize("", model="deepcut")
-        self.assertIsInstance(result, list)
+        self.assertEqual(result, [])
 
     def test_single_word(self):
         result = word_tokenize("สวัสดี", model="deepcut")
@@ -49,7 +49,7 @@ def test_basic_tokenization(self):
 
     def test_empty_string(self):
         result = word_tokenize("", model="attacut-sc")
-        self.assertIsInstance(result, list)
+        self.assertEqual(result, [])
 
     def test_output_joins_to_input(self):
         text = "ภาษาไทยสวยงาม"
@@ -68,7 +68,7 @@ def test_basic_tokenization(self):
 
     def test_empty_string(self):
         result = word_tokenize("", model="attacut-c")
-        self.assertIsInstance(result, list)
+        self.assertEqual(result, [])
 
     def test_output_joins_to_input(self):
         text = "ภาษาไทยสวยงาม"
@@ -105,6 +105,10 @@ def test_basic_tokenization(self):
         self.assertTrue(len(result) > 0)
         self.assertEqual("".join(result), "ทดสอบการตัดคำ")
 
+    def test_empty_string(self):
+        result = word_tokenize("", model="sefr-ws1000")
+        self.assertEqual(result, [])
+
     def test_output_joins_to_input(self):
         text = "ภาษาไทยสวยงาม"
         result = word_tokenize(text, model="sefr-ws1000")
@@ -120,6 +124,10 @@ def test_basic_tokenization(self):
         self.assertTrue(len(result) > 0)
         self.assertEqual("".join(result), "ทดสอบการตัดคำ")
 
+    def test_empty_string(self):
+        result = word_tokenize("", model="sefr-tnhc")
+        self.assertEqual(result, [])
+
     def test_output_joins_to_input(self):
         text = "ภาษาไทยสวยงาม"
         result = word_tokenize(text, model="sefr-tnhc")
@@ -135,6 +143,10 @@ def test_basic_tokenization(self):
         self.assertTrue(len(result) > 0)
         self.assertEqual("".join(result), "ทดสอบการตัดคำ")
 
+    def test_empty_string(self):
+        result = word_tokenize("", model="sefr-best")
+        self.assertEqual(result, [])
+
     def test_output_joins_to_input(self):
         text = "ภาษาไทยสวยงาม"
         result = word_tokenize(text, model="sefr-best")