diff --git a/.github/workflows/pypi-publish.yml b/.github/workflows/pypi-publish.yml index 95ca9c901..1ed198a78 100644 --- a/.github/workflows/pypi-publish.yml +++ b/.github/workflows/pypi-publish.yml @@ -52,7 +52,7 @@ jobs: build: ${{ steps.check_build_trigger.outputs.build }} steps: - name: Checkout source code - uses: actions/checkout@v6 + uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683 # v4.2.2 with: ref: ${{ github.event.pull_request.head.sha }} - id: check_build_trigger @@ -71,10 +71,10 @@ jobs: steps: - name: Checkout - uses: actions/checkout@v6 + uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683 # v4.2.2 - name: Set up Python ${{ matrix.python-version }} - uses: actions/setup-python@v6 + uses: actions/setup-python@42375524e23c412d93fb67b49958b491fce71c38 # v5.4.0 with: python-version: ${{ matrix.python-version }} @@ -82,31 +82,73 @@ jobs: run: | pip install --upgrade build pip twine - - name: Build source distribution and wheels - run: python -m build + - name: Build source distribution + run: python -m build --sdist # was: python -m build - name: Check distributions run: twine check dist/* - name: Store distributions - uses: actions/upload-artifact@v7 + uses: actions/upload-artifact@4cec3d8aa04e39d1a68397de0c4cd6fb9dce8ec1 # v4.6.1 with: + name: dist-sdist # explicit name for downstream retrieval path: dist + build_wheels: + name: Build binary wheels (${{ matrix.os }}) + needs: [check_build_trigger] + if: needs.check_build_trigger.outputs.build + runs-on: ${{ matrix.os }} + strategy: + fail-fast: false + matrix: + os: + - ubuntu-latest # → manylinux_2_17_x86_64 + - windows-latest # → win_amd64 + - macos-13 # → macosx_13_*_x86_64 (Intel) + - macos-14 # → macosx_14_*_arm64 (Apple Silicon) + + steps: + - name: Checkout + uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683 # v4.2.2 + + - name: Build wheels + uses: pypa/cibuildwheel@fa04202e88ea28b84d5d4d20696ee8dfc0119436 # v2.23.0 + # All config is read from 
[tool.cibuildwheel] in pyproject.toml: + # build/skip selectors, test command, per-platform archs + + - name: Validate wheels + run: | + pip install twine + twine check ./wheelhouse/*.whl + + - name: Upload wheel artifacts + uses: actions/upload-artifact@4cec3d8aa04e39d1a68397de0c4cd6fb9dce8ec1 # v4.6.1 + with: + name: cibw-wheels-${{ matrix.os }} + path: ./wheelhouse/*.whl + publish_pypi: name: Publish to PyPI runs-on: ubuntu-latest - needs: [build] + needs: [build, build_wheels] # was: needs: [build] if: github.event_name == 'release' && github.event.action == 'published' steps: - - name: Retrieve distributions - uses: actions/download-artifact@v7 + - name: Retrieve sdist + uses: actions/download-artifact@cc203385981b70ca67e1cc392babf9cc229d5806 # v4.1.9 with: - name: artifact + name: dist-sdist # matches renamed artifact path: dist + + - name: Retrieve binary wheels + uses: actions/download-artifact@cc203385981b70ca67e1cc392babf9cc229d5806 # v4.1.9 + with: + pattern: cibw-wheels-* # globs all 4 matrix artifacts + path: dist + merge-multiple: true # flatten: cibw-wheels-os1/a.whl → dist/a.whl + - name: Publish to PyPI - uses: pypa/gh-action-pypi-publish@release/v1 - if: github.event_name == 'release' && github.event.action == 'published' + uses: pypa/gh-action-pypi-publish@ed0c53931b1dc9bd32cbe73a98c7f6766f8a527e # release/v1 with: skip-existing: true user: __token__ diff --git a/.gitignore b/.gitignore index d502765ce..b80ad2f10 100644 --- a/.gitignore +++ b/.gitignore @@ -116,6 +116,10 @@ dmypy.json # Cython debug symbols cython_debug/ + +# Cython-generated C source files (anywhere in the package tree) +pythainlp/**/*.c + notebooks/iso_11940-dev.ipynb # vscode devcontainer diff --git a/pyproject.toml b/pyproject.toml index 308fda4ae..2f88d1486 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -3,7 +3,7 @@ # SPDX-License-Identifier: Apache-2.0 [build-system] -requires = ["hatchling"] +requires = ["hatchling", "hatch-cython>=0.5.0", "cython>=3.0"] build-backend 
= "hatchling.build" [project] @@ -233,6 +233,8 @@ noauto-onnx = [ # Cython-based dependencies - for tests.noauto_cython noauto-cython = [ "phunspell>=0.1.6", + "hatch-cython>=0.5.0", + "cython>=3.0", ] # Network-dependent tests - for tests.noauto_network @@ -311,6 +313,22 @@ include = [ "README.md", ] +[tool.hatch.build.hooks.cython] +dependencies = ["cython>=3.0"] +optional = true + +[tool.hatch.build.hooks.cython.options] +# Compile only .pyx files in pythainlp/_ext — do NOT compile .py files. +# Without compile_py=false, hatch-cython would compile every .py file in +# the package into a Cython extension, which is not what we want. +compile_py = false + +# hatch-cython internally invokes setuptools' build_ext. Restrict package +# discovery to pythainlp only so setuptools doesn't error on the flat layout +# (multiple top-level directories: build_tools, fuzz, notebooks, pythainlp). +[tool.setuptools.packages.find] +include = ["pythainlp*"] + [tool.bumpversion] current_version = "5.3.4" commit = true @@ -497,6 +515,10 @@ module = [ ] ignore_missing_imports = true +[[tool.mypy.overrides]] +module = ["pythainlp._ext.*"] +ignore_missing_imports = true + [tool.pylint.main] disable = [ "import-error", @@ -507,3 +529,29 @@ disable = [ "too-many-branches", "too-many-statements", ] + +# --------------------------------------------------------------------------- +# cibuildwheel — binary wheel build matrix +# Docs: https://cibuildwheel.readthedocs.io/en/stable/options/ +# --------------------------------------------------------------------------- +[tool.cibuildwheel] +# CPython 3.9–3.13 (stable; matches requires-python = ">=3.9") +build = "cp39-* cp310-* cp311-* cp312-* cp313-*" +skip = "pp* *-musllinux_*" # PyPy and Alpine excluded (complex toolchain, deferred) + +# Smoke test: must be the top-level "test-command" key (no "test" sub-table exists). +# After wheel install, verify _thai_fast loaded as a compiled .so/.pyd +# (not a pure-Python fallback). No test deps required. 
+# Note: pythainlp/_ext/_thai_fast has NO .py fallback — ImportError here +# means compilation failed silently, which also fails this step explicitly. +test-command = "python -c \"import pythainlp._ext._thai_fast as m; assert m.__file__.endswith(('.so', '.pyd')), 'NOT compiled: ' + m.__file__; print('CIBW OK:', m.__file__)\"" + +[tool.cibuildwheel.linux] +manylinux-x86_64-image = "manylinux2014" # glibc >= 2.17 (RHEL 7+ / Ubuntu 18.04+) +archs = "x86_64" # linux aarch64 deferred — QEMU adds ~20 min/version on GitHub runners + +[tool.cibuildwheel.macos] +archs = "auto" # macos-13 runner = Intel (auto → x86_64); macos-14 runner = ARM (auto → arm64) + +[tool.cibuildwheel.windows] +archs = "AMD64" diff --git a/pythainlp/_ext/__init__.py b/pythainlp/_ext/__init__.py new file mode 100644 index 000000000..838267c6f --- /dev/null +++ b/pythainlp/_ext/__init__.py @@ -0,0 +1,9 @@ +# SPDX-FileCopyrightText: 2016-2026 PyThaiNLP Project +# SPDX-FileType: SOURCE +# SPDX-License-Identifier: Apache-2.0 +"""Optional Cython-compiled extensions for performance-critical functions. + +These extensions are built at install time when a C compiler and Cython are +available. If unavailable (e.g., PyPy, no compiler), the pure Python +implementations in pythainlp.util are used as fallback. +""" diff --git a/pythainlp/_ext/_normalize_fast.pyi b/pythainlp/_ext/_normalize_fast.pyi new file mode 100644 index 000000000..d91915366 --- /dev/null +++ b/pythainlp/_ext/_normalize_fast.pyi @@ -0,0 +1,7 @@ +# SPDX-FileCopyrightText: 2016-2026 PyThaiNLP Project +# SPDX-FileType: SOURCE +# SPDX-License-Identifier: Apache-2.0 +"""Type stubs for pythainlp._ext._normalize_fast Cython extension.""" + +def remove_tonemark(text: str) -> str: ... +def remove_dup_spaces(text: str) -> str: ... 
diff --git a/pythainlp/_ext/_normalize_fast.pyx b/pythainlp/_ext/_normalize_fast.pyx new file mode 100644 index 000000000..89ba9d7a7 --- /dev/null +++ b/pythainlp/_ext/_normalize_fast.pyx @@ -0,0 +1,116 @@ +# SPDX-FileCopyrightText: 2016-2026 PyThaiNLP Project +# SPDX-FileType: SOURCE +# SPDX-License-Identifier: Apache-2.0 +# Directive comments must precede all code, including the module docstring; +# Cython ignores "# cython:" lines that appear after the first statement. +# cython: language_level=3, boundscheck=False, wraparound=False +"""Cython-optimized text normalization functions. + +Provides faster implementations of remove_tonemark and remove_dup_spaces +using C-level typed memory views and byte filtering. + +These functions are API-compatible with their equivalents in +pythainlp.util.normalize and can be used as faster drop-in replacements +when explicitly imported. +""" + +import re as _re + +from pythainlp import thai_tonemarks as _tonemarks_str + +# Frozenset of tone mark characters for O(1) membership test. +# Must contain single-char strings (not ints): when Cython converts a +# Py_UCS4 value via the `in` operator it produces chr(c), not an integer. +cdef frozenset _TONE_SET = frozenset(_tonemarks_str) + +# Use the same regex pattern as normalize.py to keep newline behaviour +# identical (collapses sequences of spaces+newlines into a single newline) +_RE_REMOVE_NEWLINES = _re.compile(r"[ \n]*\n[ \n]*") + + +cpdef str remove_tonemark(object text): + """Remove Thai tone marks from text using UTF-8 byte-level filtering. + + Thai tone marks occupy the Unicode range U+0E48-U+0E4B, which encodes + in UTF-8 as the three-byte sequence 0xE0 0xB9 {0x88-0x8B}. Filtering + at the byte level using typed memory views avoids per-character Python + object creation and outperforms repeated str.replace() calls on long texts. 
+ + :param text: input text (str or str-like object) + :type text: str + :return: text with all Thai tone marks removed + :rtype: str + """ + cdef str _text = str(text) + if not _text: + return _text + + # Fast path: bail out early if none of the four tone marks are present + cdef Py_UCS4 c + cdef bint found = False + for c in _text: + if c in _TONE_SET: + found = True + break + if not found: + return _text + + # Encode once to UTF-8 bytes; use memoryview for C-level access. + # IMPORTANT: the byte pattern below is hard-coded for the four Thai tone + # marks U+0E48–U+0E4B (encoding: 0xE0 0xB9 {0x88–0x8B}). If + # pythainlp.thai_tonemarks is ever extended beyond those four codepoints + # this filter will silently miss any additions; update the scan range + # in the while-loop accordingly. + cdef bytes src_bytes = _text.encode("utf-8") + cdef const unsigned char[:] src = src_bytes + cdef Py_ssize_t n = len(src) + + # Pre-allocate output buffer (same size as input; result is always smaller) + cdef bytearray dst_arr = bytearray(n) + cdef unsigned char[:] dst = dst_arr + cdef Py_ssize_t i = 0 + cdef Py_ssize_t j = 0 + cdef unsigned char b0 + + while i < n: + b0 = src[i] + # All Thai tone marks share first two bytes 0xE0 0xB9 + if b0 == 0xE0 and i + 2 < n and src[i + 1] == 0xB9: + if 0x88 <= src[i + 2] <= 0x8B: + i += 3 # skip tone-mark sequence + continue + dst[j] = b0 + j += 1 + i += 1 + + return bytes(dst_arr[:j]).decode("utf-8") + + +cpdef str remove_dup_spaces(object text): + """Remove duplicate ASCII spaces and collapse newlines; strip result. 
+ + Behaviorally identical to pythainlp.util.normalize.remove_dup_spaces: + - Only ASCII space (0x20) runs are collapsed (not tabs or other whitespace) + - Newline normalisation is delegated to the same compiled regex + + :param text: input text (str or str-like object) + :type text: str + :return: text without duplicate spaces, with newlines normalised and + leading/trailing whitespace stripped + :rtype: str + """ + cdef str _text = str(text) + cdef list out = [] + cdef Py_UCS4 c + cdef bint prev_space = False + for c in _text: + if c == 32: # ASCII space 0x20 + if not prev_space: + out.append(" ") + prev_space = True + else: + out.append(chr(c)) + prev_space = False + result = "".join(out) + result = _RE_REMOVE_NEWLINES.sub("\n", result) + return result.strip() diff --git a/pythainlp/_ext/_thai_fast.pyi b/pythainlp/_ext/_thai_fast.pyi new file mode 100644 index 000000000..186feb2e8 --- /dev/null +++ b/pythainlp/_ext/_thai_fast.pyi @@ -0,0 +1,11 @@ +# SPDX-FileCopyrightText: 2016-2026 PyThaiNLP Project +# SPDX-FileType: SOURCE +# SPDX-License-Identifier: Apache-2.0 +"""Type stubs for pythainlp._ext._thai_fast Cython extension.""" + +def is_thai_char(ch: str) -> bool: ... +def is_thai(text: str, ignore_chars: str = ...) -> bool: ... +def count_thai( + text: str, + ignore_chars: str = ..., # defaults to whitespace + digits + punctuation +) -> float: ... diff --git a/pythainlp/_ext/_thai_fast.pyx b/pythainlp/_ext/_thai_fast.pyx new file mode 100644 index 000000000..8186f22c2 --- /dev/null +++ b/pythainlp/_ext/_thai_fast.pyx @@ -0,0 +1,105 @@ +# SPDX-FileCopyrightText: 2016-2026 PyThaiNLP Project +# SPDX-FileType: SOURCE +# SPDX-License-Identifier: Apache-2.0 +# Directive comments must precede all code, including the module docstring; +# Cython ignores "# cython:" lines that appear after the first statement. +# cython: language_level=3, boundscheck=False, wraparound=False +"""Cython-optimized Thai character classification functions. + +Provides faster implementations of is_thai_char, is_thai, and count_thai +by eliminating Python dispatch overhead and using C-level type declarations +for the inner character iteration loops. 
+ +These functions are API-compatible with their equivalents in +pythainlp.util.thai and are loaded as transparent replacements when the +Cython extension is available. +""" + +import string as _string + +cdef unsigned int _TH_FIRST = 0x0E00 # U+0E00: first Thai character +cdef unsigned int _TH_LAST = 0x0E7F # U+0E7F: last Thai character + + +cpdef bint is_thai_char(object ch): + """Return True if ch is a single Thai Unicode character. + + :param ch: input character (str or str-like object; must be exactly one character) + :type ch: str + :return: True if ch is a Thai character, otherwise False. + :rtype: bool + + .. note:: + Unlike the pure-Python implementation (which raises ``TypeError`` + for empty or multi-character strings via ``ord()``), this + implementation returns ``False`` for any input whose length is + not exactly 1. + """ + cdef str _ch = str(ch) + if len(_ch) != 1: + return False + cdef Py_UCS4 c = _ch[0] + return _TH_FIRST <= c <= _TH_LAST + + +cpdef bint is_thai(object text, object ignore_chars="."): + """Return True if every non-ignored character in text is Thai. 
+ + :param text: input text (str or str-like object) + :type text: str + :param ignore_chars: characters to ignore during validation; + ``None`` is treated the same as ``""`` (no characters ignored) + :type ignore_chars: str or None + :return: True if text consists only of Thai and ignored characters + :rtype: bool + """ + cdef str _text = str(text) + # Mirror the Python version: treat None/empty as "ignore nothing" + if not ignore_chars: + ignore_chars = "" + cdef str _ic = ignore_chars + cdef Py_UCS4 c + for c in _text: + if c not in _ic and not (_TH_FIRST <= c <= _TH_LAST): + return False + return True + + +# Match the default ignore_chars used by the Python count_thai implementation +_DEFAULT_IGNORE_CHARS: str = ( + _string.whitespace + _string.digits + _string.punctuation +) + + +cpdef double count_thai(object text, str ignore_chars=_DEFAULT_IGNORE_CHARS): + """Return proportion of Thai characters in text (0.0–100.0). + + :param text: input text (str or str-like object); non-str values (including None) return 0.0 + to match the behaviour of the pure-Python implementation + :type text: str + :param ignore_chars: characters to exclude from the denominator, + defaults to whitespace, digits, and punctuation marks + :type ignore_chars: str + :return: percentage of Thai characters in the non-ignored portion + :rtype: float + """ + # Matches Python version: non-str or falsy input → 0.0 + if not text or not isinstance(text, str): + return 0.0 + cdef str _text = text + # Normalise: treat empty string as no ignore chars (matches Python version) + if not ignore_chars: + ignore_chars = "" + cdef Py_UCS4 c + cdef Py_ssize_t num_thai = 0 + cdef Py_ssize_t num_ignore = 0 + cdef Py_ssize_t total = len(_text) + for c in _text: + if c in ignore_chars: + num_ignore += 1 + elif _TH_FIRST <= c <= _TH_LAST: + num_thai += 1 + cdef Py_ssize_t denom = total - num_ignore + if denom == 0: + return 0.0 + return (num_thai / denom) * 100.0 diff --git a/pythainlp/util/normalize.py 
b/pythainlp/util/normalize.py index 02138420f..f65adcb43 100644 --- a/pythainlp/util/normalize.py +++ b/pythainlp/util/normalize.py @@ -145,8 +145,7 @@ def remove_tonemark(text: str) -> str: 'สองพันหนึงรอยสีสิบเจ็ดลานสีแสนแปดหมืนสามพันหกรอยสีสิบเจ็ด' """ for ch in tonemarks: - while ch in text: - text = text.replace(ch, "") + text = text.replace(ch, "") return text @@ -386,3 +385,14 @@ def maiyamok(sent: Union[str, list[str]]) -> list[str]: "5.2", ) return expand_maiyamok(sent) + + +# Keep underscore-prefixed aliases of the pure-Python implementations so +# they remain importable for benchmarking and testing. +_py_remove_tonemark = remove_tonemark +_py_remove_dup_spaces = remove_dup_spaces + +# Note: Cython overrides for remove_tonemark and remove_dup_spaces are NOT +# loaded here — Python's str.replace() bulk C operations outperform the +# Cython encode→byte-filter→decode approach. The Cython implementations +# remain in pythainlp._ext._normalize_fast for reference and testing. diff --git a/pythainlp/util/thai.py b/pythainlp/util/thai.py index 77a198168..4696aa0be 100644 --- a/pythainlp/util/thai.py +++ b/pythainlp/util/thai.py @@ -408,3 +408,31 @@ def analyze_thai_text(text: str) -> dict[str, int]: results[char] += 1 return dict(results) + + +# Keep references to the pure-Python implementations before the Cython +# override below so they remain importable for benchmarking and testing. +_py_is_thai_char = is_thai_char +_py_is_thai = is_thai +_py_count_thai = count_thai + +# Load Cython-compiled fast implementations when available. +# Falls back silently to the Python implementations above on PyPy, +# systems without a C compiler, or when hatch-cython was not used at build time. 
+try: + from pythainlp._ext._thai_fast import count_thai as _fast_count_thai + from pythainlp._ext._thai_fast import is_thai as _fast_is_thai + from pythainlp._ext._thai_fast import is_thai_char as _fast_is_thai_char +except ImportError: + pass +else: + count_thai = _fast_count_thai + is_thai = _fast_is_thai + + def _is_thai_char_fast(ch: str) -> bool: + # ord(ch) raises the same TypeError as the pure-Python implementation + # for empty strings or strings of length != 1, preserving behavior. + _ = ord(ch) + return _fast_is_thai_char(ch) + + is_thai_char = _is_thai_char_fast diff --git a/scripts/bench_full_evidence.py b/scripts/bench_full_evidence.py new file mode 100644 index 000000000..15296490a --- /dev/null +++ b/scripts/bench_full_evidence.py @@ -0,0 +1,319 @@ +#!/usr/bin/env python3 +# SPDX-FileCopyrightText: 2016-2026 PyThaiNLP Project +# SPDX-License-Identifier: Apache-2.0 +""" +Comprehensive benchmark + cProfile evidence for Phase 1 Cython extensions. + +Generates: + 1. Environment details + 2. Multi-scale comparison (small / medium / large) + 3. cProfile hotspot analysis (before / after) + 4. Dataset description + +Usage: + PYTHONPATH=. python3 scripts/bench_full_evidence.py +""" + +import cProfile +import io +import platform +import pstats +import sys +import timeit +from collections.abc import Callable +from typing import Optional + + +# --------------------------------------------------------------------------- +# 1. 
Environment +# --------------------------------------------------------------------------- +def print_env() -> None: + print("=" * 72) + print("ENVIRONMENT") + print("=" * 72) + print(f" OS : {platform.system()} {platform.release()}") + print(f" Architecture : {platform.machine()}") + print(f" CPU : {_get_cpu_model()}") + print(f" Python : {sys.version}") + print(f" pythainlp : {_get_pythainlp_version()}") + cython_ver = _get_cython_status() + print(f" Cython ext : {cython_ver}") + print() + + +def _get_cpu_model() -> str: + try: + with open("/proc/cpuinfo") as f: + for line in f: + if line.startswith("model name"): + return line.split(":", 1)[1].strip() + except OSError: + return platform.processor() or "unknown" + return platform.processor() or "unknown" + + +def _get_pythainlp_version() -> str: + try: + import pythainlp + + return pythainlp.__version__ + except Exception: + return "unknown" + + +def _get_cython_status() -> str: + try: + from pythainlp._ext import _thai_fast, _normalize_fast # noqa: F401 # pyright: ignore[reportUnusedImport] + + return "loaded (compiled)" + except ImportError: + return "NOT available (pure Python mode)" + + +# --------------------------------------------------------------------------- +# 2. 
Dataset +# --------------------------------------------------------------------------- +# Thai Wikipedia-style sample text (real Thai prose) +_SAMPLE_SHORT = "สวัสดีครับ" # 10 chars +_SAMPLE_MEDIUM = "ภาษาไทยเป็นภาษาที่มีวรรณยุกต์ ทำให้การออกเสียงมีความซับซ้อน" * 5 # ~310 chars +_SAMPLE_LONG = ( + "ประเทศไทยมีชื่อเรียกอย่างเป็นทางการว่า ราชอาณาจักรไทย " + "เป็นรัฐที่ตั้งอยู่ในภูมิภาคเอเชียตะวันออกเฉียงใต้ " + "มีพรมแดนทางทิศตะวันออกติดลาวและกัมพูชา ทิศใต้ติดอ่าวไทยและมาเลเซีย " + "ทิศตะวันตกติดทะเลอันดามันและพม่า ทิศเหนือติดพม่าและลาว " + "โดยมีแม่น้ำโขงกั้นเป็นบางช่วง " +) * 50 # ~6,000+ chars + +_SAMPLE_HUGE = _SAMPLE_LONG * 10 # ~60,000+ chars + +_TONE_SHORT = "คำว่า ต้น ไม้ แล้ว ก็ น้ำ" # ~25 chars with tonemarks +_TONE_LONG = ( + "น้ำตกเจ็ดสาวน้อย เป็นน้ำตกที่สวยงามมาก ตั้งอยู่ในอุทยานแห่งชาติ " + "เขื่อนศรีนครินทร์ จังหวัดกาญจนบุรี ล้อมรอบด้วยป่าดิบชื้น " + "ต้นไม้ใหญ่ น้ำตกไหลจากหน้าผาสูง สร้างความชุ่มเย็นให้กับบริเวณรอบข้าง " +) * 40 # ~6,000+ chars + + +def print_dataset() -> None: + print("=" * 72) + print("DATASET") + print("=" * 72) + print(" Real Thai prose, constructed from Thai Wikipedia-style text.") + print(f" Short : {len(_SAMPLE_SHORT):>8,} chars (single greeting)") + print(f" Medium : {len(_SAMPLE_MEDIUM):>8,} chars (paragraph)") + print(f" Long : {len(_SAMPLE_LONG):>8,} chars (article)") + print(f" Huge : {len(_SAMPLE_HUGE):>8,} chars (corpus batch)") + print(f" Tone-S : {len(_TONE_SHORT):>8,} chars (short with tonemarks)") + print(f" Tone-L : {len(_TONE_LONG):>8,} chars (long with tonemarks)") + print() + + +# --------------------------------------------------------------------------- +# 3. 
Benchmark helpers +# --------------------------------------------------------------------------- +def bench( + label: str, + func_py: Callable[..., object], + func_cy: Optional[Callable[..., object]], + args: tuple, + number: int = 50_000, +) -> dict: + """Benchmark a single function, return result dict.""" + # Python + timer_py = timeit.Timer(lambda: func_py(*args)) + times_py = timer_py.repeat(repeat=5, number=number) + best_py = min(times_py) + + # Cython + if func_cy is not None: + timer_cy = timeit.Timer(lambda: func_cy(*args)) + times_cy = timer_cy.repeat(repeat=5, number=number) + best_cy = min(times_cy) + speedup = best_py / best_cy + else: + best_cy = None + speedup = None + + return { + "label": label, + "py_time": best_py, + "cy_time": best_cy, + "speedup": speedup, + "number": number, + } + + +def print_table(title: str, rows: list[dict]) -> None: + print(f"\n{'─' * 72}") + print(f" {title}") + print(f"{'─' * 72}") + print( + f" {'Function':<35} {'Python':>10} {'Cython':>10} {'Speedup':>10}" + ) + print(f" {'─' * 67}") + for row in rows: + cy_str = ( + f"{row['cy_time']:.4f}s" if row["cy_time"] is not None else "N/A" + ) + sp_str = ( + f"{row['speedup']:.1f}x" if row["speedup"] is not None else "—" + ) + print( + f" {row['label']:<35} {row['py_time']:>9.4f}s {cy_str:>10} {sp_str:>10}" + ) + print() + + +# --------------------------------------------------------------------------- +# 4. 
cProfile analysis +# --------------------------------------------------------------------------- +def profile_function( + func: Callable[..., object], args: tuple, repeat: int = 100_000 +) -> str: + """Profile func with cProfile; return the top-15 cumulative hotspots.""" + pr = cProfile.Profile() + pr.enable() + for _ in range(repeat): + func(*args) + pr.disable() + + stream = io.StringIO() + ps = pstats.Stats(pr, stream=stream) + ps.sort_stats("cumulative") + ps.print_stats(15) + return stream.getvalue() + + +# --------------------------------------------------------------------------- +# Main +# --------------------------------------------------------------------------- +def main() -> None: + print_env() + print_dataset() + + # Import Python baselines + from pythainlp.util.thai import ( + _py_count_thai, + _py_is_thai, + _py_is_thai_char, + ) + from pythainlp.util.normalize import _py_remove_tonemark + + # Import Cython (may be None) + try: + from pythainlp._ext._thai_fast import ( + count_thai as cy_count_thai, + is_thai as cy_is_thai, + is_thai_char as cy_is_thai_char, + ) + from pythainlp._ext._normalize_fast import ( + remove_tonemark as cy_remove_tonemark, + ) + + have_ext = True + except ImportError: + cy_is_thai_char = None + cy_is_thai = None + cy_count_thai = None + cy_remove_tonemark = None + have_ext = False + + if not have_ext: + print("⚠ Cython extensions NOT available. 
Showing Python-only.\n") + + # ── Multi-Scale: is_thai_char ────────────────────────────────────── + rows_itc = [] + rows_itc.append( + bench( + "is_thai_char (1M calls)", + _py_is_thai_char, + cy_is_thai_char, + ("ก",), + number=1_000_000, + ) + ) + print_table("is_thai_char — Single Character Check", rows_itc) + + # ── Multi-Scale: is_thai ─────────────────────────────────────────── + rows_it = [] + for label, text, n in [ + ("is_thai (short, 10 ch)", _SAMPLE_SHORT, 500_000), + ("is_thai (medium, ~310 ch)", _SAMPLE_MEDIUM, 100_000), + ("is_thai (long, ~6K ch)", _SAMPLE_LONG, 10_000), + ("is_thai (huge, ~60K ch)", _SAMPLE_HUGE, 1_000), + ]: + rows_it.append(bench(label, _py_is_thai, cy_is_thai, (text,), n)) + print_table("is_thai — Small-Scale vs Big-Scale", rows_it) + + # ── Multi-Scale: count_thai ──────────────────────────────────────── + rows_ct = [] + for label, text, n in [ + ("count_thai (short, 10 ch)", _SAMPLE_SHORT, 500_000), + ("count_thai (medium, ~310 ch)", _SAMPLE_MEDIUM, 50_000), + ("count_thai (long, ~6K ch)", _SAMPLE_LONG, 5_000), + ("count_thai (huge, ~60K ch)", _SAMPLE_HUGE, 500), + ]: + rows_ct.append(bench(label, _py_count_thai, cy_count_thai, (text,), n)) + print_table("count_thai — Small-Scale vs Big-Scale", rows_ct) + + # ── Multi-Scale: remove_tonemark ─────────────────────────────────── + rows_rt = [] + for label, text, n in [ + ("remove_tonemark (short, ~25 ch)", _TONE_SHORT, 500_000), + ("remove_tonemark (long, ~6K ch)", _TONE_LONG, 5_000), + ]: + rows_rt.append( + bench(label, _py_remove_tonemark, cy_remove_tonemark, (text,), n) + ) + print_table("remove_tonemark — Small-Scale vs Big-Scale", rows_rt) + + # ── cProfile Hotspot Analysis ────────────────────────────────────── + print("=" * 72) + print("cPROFILE HOTSPOT ANALYSIS") + print("=" * 72) + print( + " Profiling count_thai on long text (~6K chars) × 100K calls" + ) + print(" to show where time is spent before/after Cython.\n") + + print("── BEFORE (Pure Python count_thai) ──") + 
profile_out = profile_function( + _py_count_thai, + (_SAMPLE_LONG,), + repeat=100_000, + ) + print(profile_out) + + if cy_count_thai is not None: + print("── AFTER (Cython count_thai) ──") + profile_out = profile_function( + cy_count_thai, + (_SAMPLE_LONG,), + repeat=100_000, + ) + print(profile_out) + + print("── BEFORE (Pure Python remove_tonemark) ──") + profile_out = profile_function( + _py_remove_tonemark, + (_TONE_LONG,), + repeat=50_000, + ) + print(profile_out) + + if cy_remove_tonemark is not None: + print("── AFTER (Cython remove_tonemark) ──") + profile_out = profile_function( + cy_remove_tonemark, + (_TONE_LONG,), + repeat=50_000, + ) + print(profile_out) + + print("=" * 72) + print("BENCHMARK COMPLETE") + print("=" * 72) + + +if __name__ == "__main__": + main() diff --git a/tests/_noauto_loader.py b/tests/_noauto_loader.py new file mode 100644 index 000000000..880b58f44 --- /dev/null +++ b/tests/_noauto_loader.py @@ -0,0 +1,28 @@ +# SPDX-FileCopyrightText: 2016-2026 PyThaiNLP Project +# SPDX-FileType: SOURCE +# SPDX-License-Identifier: Apache-2.0 +"""Shared loader factory for noauto test suites.""" + +from collections.abc import Callable +from unittest import TestLoader, TestSuite + + +def make_load_tests( + test_packages: list[str], +) -> Callable[[TestLoader, TestSuite, str], TestSuite]: + """Return a load_tests function bound to *test_packages*. + + Each noauto ``__init__.py`` calls this factory so the + unittest load-test protocol is implemented in one place. 
+ See: https://docs.python.org/3/library/unittest.html#id1 + """ + + def load_tests( + loader: TestLoader, standard_tests: TestSuite, pattern: str + ) -> TestSuite: + suite = TestSuite() + for name in test_packages: + suite.addTests(loader.loadTestsFromName(name)) + return suite + + return load_tests diff --git a/tests/core/__init__.py b/tests/core/__init__.py index b4d67630f..be67ac153 100644 --- a/tests/core/__init__.py +++ b/tests/core/__init__.py @@ -25,6 +25,7 @@ "tests.core.test_tools", "tests.core.test_transliterate", "tests.core.test_util", + "tests.core.test_util_cython", ] diff --git a/tests/core/test_util_cython.py b/tests/core/test_util_cython.py new file mode 100644 index 000000000..ca73e712d --- /dev/null +++ b/tests/core/test_util_cython.py @@ -0,0 +1,64 @@ +# SPDX-FileCopyrightText: 2016-2026 PyThaiNLP Project +# SPDX-FileType: SOURCE +# SPDX-License-Identifier: Apache-2.0 + +"""Coverage tests for the Cython fallback paths in pythainlp.util.thai. + +Kept separate from test_util.py to isolate sys.modules/reload side-effects. 
+"""
+
+import importlib
+import unittest
+from unittest.mock import patch
+
+
+class TestThaiUtilPurePython(unittest.TestCase):
+    """Call _py_* directly to keep the original function bodies covered."""
+
+    def test_pure_python_is_thai_char(self):
+        from pythainlp.util.thai import _py_is_thai_char
+
+        self.assertTrue(_py_is_thai_char("ก"))
+        self.assertTrue(_py_is_thai_char("๕"))
+        self.assertFalse(_py_is_thai_char("A"))
+        self.assertFalse(_py_is_thai_char(" "))
+        with self.assertRaises(TypeError):
+            _py_is_thai_char("")
+
+    def test_pure_python_is_thai(self):
+        from pythainlp.util.thai import _py_is_thai
+
+        self.assertTrue(_py_is_thai("กาลเวลา"))
+        self.assertFalse(_py_is_thai("กาล-เวลา"))
+        self.assertTrue(_py_is_thai("กาล-เวลา", ignore_chars="-"))
+        self.assertTrue(_py_is_thai(""))
+
+    def test_pure_python_count_thai(self):
+        from pythainlp.util.thai import _py_count_thai
+
+        self.assertEqual(_py_count_thai("ไทย"), 100.0)
+        self.assertEqual(_py_count_thai("Python"), 0.0)
+        # ignore_chars="" → "1" is non-Thai, so 1/2 chars = 50%
+        self.assertAlmostEqual(_py_count_thai("ก1", ignore_chars=""), 50.0)
+
+
+class TestThaiUtilImportFallback(unittest.TestCase):
+    """Cover the ``except ImportError: pass`` branch in thai.py.
+
+    Patches sys.modules to make _thai_fast unimportable, reloads thai.py to
+    execute the fallback path, then restores the module to its original state.
+    """
+
+    def test_cython_import_error_fallback(self):
+        import pythainlp.util.thai as thai_mod
+
+        try:
+            with patch.dict(
+                "sys.modules", {"pythainlp._ext._thai_fast": None}
+            ):
+                importlib.reload(thai_mod)
+                self.assertTrue(thai_mod.is_thai_char("ก"))
+                self.assertEqual(thai_mod.count_thai("ไทย"), 100.0)
+        finally:
+            # Guaranteed restore: runs whether assertions pass or fail
+            importlib.reload(thai_mod)
diff --git a/tests/noauto_cython/__init__.py b/tests/noauto_cython/__init__.py
index 92e348e7d..7e2068d8a 100644
--- a/tests/noauto_cython/__init__.py
+++ b/tests/noauto_cython/__init__.py
@@ -15,26 +15,14 @@
 workflows with appropriate build environments.
 """
 
-from unittest import TestLoader, TestSuite
+from tests._noauto_loader import make_load_tests
 
-# Names of module to be tested
 test_packages: list[str] = [
     "tests.noauto_cython.testn_spell_cython",
+    "tests.noauto_cython.testn_fast_functions",
 ]
 
-
-def load_tests(
-    loader: TestLoader, standard_tests: TestSuite, pattern: str
-) -> TestSuite:
-    """Load test protocol
-    See: https://docs.python.org/3/library/unittest.html#id1
-    """
-    suite = TestSuite()
-    for test_package in test_packages:
-        tests = loader.loadTestsFromName(test_package)
-        suite.addTests(tests)
-    return suite
-
+load_tests = make_load_tests(test_packages)
 
 if __name__ == "__main__":
     from unittest import main
diff --git a/tests/noauto_cython/testn_fast_functions.py b/tests/noauto_cython/testn_fast_functions.py
new file mode 100644
index 000000000..b94678b93
--- /dev/null
+++ b/tests/noauto_cython/testn_fast_functions.py
@@ -0,0 +1,214 @@
+# SPDX-FileCopyrightText: 2016-2026 PyThaiNLP Project
+# SPDX-FileType: SOURCE
+# SPDX-License-Identifier: Apache-2.0
+"""Correctness and performance tests for Cython-compiled fast functions.
+
+These tests verify that the Cython implementations in pythainlp._ext produce
+identical output to the pure Python implementations they replace.
+
+Tests are skipped automatically when the Cython extensions are not built
+(e.g., on PyPy or systems without a C compiler).
+"""
+
+import unittest
+
+try:
+    from pythainlp._ext._normalize_fast import (
+        remove_dup_spaces as fast_remove_dup_spaces,
+    )
+    from pythainlp._ext._normalize_fast import (
+        remove_tonemark as fast_remove_tonemark,
+    )
+    from pythainlp._ext._thai_fast import (
+        count_thai as fast_count_thai,
+    )
+    from pythainlp._ext._thai_fast import (
+        is_thai as fast_is_thai,
+    )
+    from pythainlp._ext._thai_fast import (
+        is_thai_char as fast_is_thai_char,
+    )
+
+    HAVE_EXT = True
+except ImportError:
+    HAVE_EXT = False
+
+
+class FastThaiCharCorrectnessTest(unittest.TestCase):
+    """Verify Cython _thai_fast functions match Python implementations."""
+
+    def setUp(self) -> None:
+        if not HAVE_EXT:
+            self.skipTest(
+                "pythainlp._ext Cython extensions not built; skipping"
+            )
+
+    def test_is_thai_char_thai(self) -> None:
+        for ch in ["ก", "ข", "ค", "๑", "฿", "ๆ", "ๅ"]:
+            with self.subTest(ch=ch):
+                self.assertTrue(fast_is_thai_char(ch))
+
+    def test_is_thai_char_non_thai(self) -> None:
+        for ch in ["a", "Z", "0", "9", " ", "あ", "中", "€"]:
+            with self.subTest(ch=ch):
+                self.assertFalse(fast_is_thai_char(ch))
+
+    def test_is_thai_char_boundary(self) -> None:
+        # First and last code points in the Thai Unicode block
+        self.assertTrue(fast_is_thai_char(chr(0x0E00)))
+        self.assertTrue(fast_is_thai_char(chr(0x0E7F)))
+        # Just outside the Thai block
+        self.assertFalse(fast_is_thai_char(chr(0x0DFF)))
+        self.assertFalse(fast_is_thai_char(chr(0x0E80)))
+
+    def test_is_thai_char_empty(self) -> None:
+        self.assertFalse(fast_is_thai_char(""))
+
+    def test_is_thai_char_matches_python(self) -> None:
+        # Use the pure-Python reference saved before the Cython override runs.
+        # Empty string is excluded: Python's ord("") raises TypeError while
+        # Cython returns False — this known difference is covered separately
+        # in test_is_thai_char_empty.
+        from pythainlp.util.thai import _py_is_thai_char as py_is_thai_char
+
+        test_chars = [
+            "ก",
+            "ข",
+            "ค",
+            "a",
+            "1",
+            " ",
+            chr(0x0E00),
+            chr(0x0E7F),
+            chr(0x0DFF),
+            chr(0x0E80),
+            "あ",
+        ]
+        for ch in test_chars:
+            with self.subTest(ch=repr(ch)):
+                self.assertEqual(
+                    fast_is_thai_char(ch),
+                    py_is_thai_char(ch),
+                    f"Mismatch for {repr(ch)}",
+                )
+
+    def test_is_thai_matches_python(self) -> None:
+        from pythainlp.util.thai import _py_is_thai as py_is_thai
+
+        test_cases = [
+            ("ทดสอบ", "."),
+            ("ทดสอบ1", "."),
+            ("hello", "."),
+            ("ทดสอบ123", "123"),
+            ("", "."),
+            ("ก.", "."),
+        ]
+        for text, ignore in test_cases:
+            with self.subTest(text=repr(text)):
+                self.assertEqual(
+                    fast_is_thai(text, ignore),
+                    py_is_thai(text, ignore),
+                    f"Mismatch for {text!r}, ignore={ignore!r}",
+                )
+
+    def test_count_thai_matches_python(self) -> None:
+        from pythainlp.util.thai import _py_count_thai as py_count_thai
+
+        test_cases = [
+            ("ไทยเอ็นแอลพี 3.0", ""),
+            ("PyThaiNLP 3.0", ""),
+            ("ใช้งาน PyThaiNLP 3.0", ""),
+            ("", ""),
+            ("กขค", ""),
+            ("กขค 123", " 0123456789"),
+        ]
+        for text, ignore in test_cases:
+            with self.subTest(text=repr(text)):
+                self.assertAlmostEqual(
+                    fast_count_thai(text, ignore),
+                    py_count_thai(text, ignore),
+                    places=6,
+                    msg=f"Mismatch for {text!r}",
+                )
+
+
+class FastNormalizeCorrectnessTest(unittest.TestCase):
+    """Verify Cython _normalize_fast functions match Python implementations."""
+
+    def setUp(self) -> None:
+        if not HAVE_EXT:
+            self.skipTest(
+                "pythainlp._ext Cython extensions not built; skipping"
+            )
+
+    def test_remove_tonemark_matches_python(self) -> None:
+        from pythainlp.util.normalize import (
+            _py_remove_tonemark as py_remove_tonemark,
+        )
+
+        test_cases = [
+            "จิ้น",
+            "เก๋า",
+            "สองพันหนึ่งร้อยสี่สิบเจ็ดล้านสี่แสนแปดหมื่นสามพันหกร้อยสี่สิบเจ็ด",
+            "",
+            "no tonemarks here ก ข ค",
+            "ก่ก้ก๊ก๋",
+            "mixed Thai and English text กับ tone marks ่้๊๋",
+        ]
+        for text in test_cases:
+            with self.subTest(text=repr(text)):
+                self.assertEqual(
+                    fast_remove_tonemark(text),
+                    py_remove_tonemark(text),
+                    f"Mismatch for {repr(text)}",
+                )
+
+    def test_remove_tonemark_removes_all_four(self) -> None:
+        # Each of the four Thai tone marks must be removed
+        from pythainlp import thai_tonemarks
+
+        for mark in thai_tonemarks:
+            text = f"ก{mark}า"
+            result = fast_remove_tonemark(text)
+            self.assertNotIn(
+                mark,
+                result,
+                f"Tone mark U+{ord(mark):04X} was not removed",
+            )
+
+    def test_remove_dup_spaces_matches_python(self) -> None:
+        # NOTE(review): this imports the public name rather than a _py_*
+        # reference like the tests above; if normalize.py rebinds that name
+        # to the Cython version, both sides are the same function - confirm.
+        from pythainlp.util.normalize import (
+            remove_dup_spaces as py_remove_dup_spaces,
+        )
+
+        test_cases = [
+            "ก ข ค",
+            " ab c d ",
+            "normal spaces",
+            "",
+            " leading",
+            "trailing ",
+            "a b c",
+        ]
+        for text in test_cases:
+            with self.subTest(text=repr(text)):
+                self.assertEqual(
+                    fast_remove_dup_spaces(text),
+                    py_remove_dup_spaces(text),
+                    f"Mismatch for {repr(text)}",
+                )
+
+    def test_remove_dup_spaces_preserves_tabs(self) -> None:
+        # Tabs are NOT collapsed (only ASCII 0x20 spaces are)
+        from pythainlp.util.normalize import (
+            remove_dup_spaces as py_remove_dup_spaces,
+        )
+
+        text = "a\t\tb"
+        self.assertEqual(
+            fast_remove_dup_spaces(text), py_remove_dup_spaces(text)
+        )
+
diff --git a/tests/noauto_network/__init__.py b/tests/noauto_network/__init__.py
index 57b6322ca..570aea480 100644
--- a/tests/noauto_network/__init__.py
+++ b/tests/noauto_network/__init__.py
@@ -18,26 +18,13 @@
 with appropriate network access and caching.
 """
 
-from unittest import TestLoader, TestSuite
+from tests._noauto_loader import make_load_tests
 
-# Names of module to be tested
 test_packages: list[str] = [
     "tests.noauto_network.testn_spell_network",
 ]
 
-
-def load_tests(
-    loader: TestLoader, standard_tests: TestSuite, pattern: str
-) -> TestSuite:
-    """Load test protocol
-    See: https://docs.python.org/3/library/unittest.html#id1
-    """
-    suite = TestSuite()
-    for test_package in test_packages:
-        tests = loader.loadTestsFromName(test_package)
-        suite.addTests(tests)
-    return suite
-
+load_tests = make_load_tests(test_packages)
 
 if __name__ == "__main__":
     from unittest import main
diff --git a/tests/noauto_onnx/__init__.py b/tests/noauto_onnx/__init__.py
index 0bc3325a5..5e17fb142 100644
--- a/tests/noauto_onnx/__init__.py
+++ b/tests/noauto_onnx/__init__.py
@@ -17,9 +17,8 @@
 workflows dedicated to ONNX Runtime-based features.
 """
 
-from unittest import TestLoader, TestSuite
+from tests._noauto_loader import make_load_tests
 
-# Names of module to be tested
 test_packages: list[str] = [
     "tests.noauto_onnx.testn_spell_onnx",
     "tests.noauto_onnx.testn_tag_onnx",
@@ -27,19 +26,7 @@
     "tests.noauto_onnx.testn_transliterate_onnx",
 ]
 
-
-def load_tests(
-    loader: TestLoader, standard_tests: TestSuite, pattern: str
-) -> TestSuite:
-    """Load test protocol
-    See: https://docs.python.org/3/library/unittest.html#id1
-    """
-    suite = TestSuite()
-    for test_package in test_packages:
-        tests = loader.loadTestsFromName(test_package)
-        suite.addTests(tests)
-    return suite
-
+load_tests = make_load_tests(test_packages)
 
 if __name__ == "__main__":
     from unittest import main
diff --git a/tests/noauto_tensorflow/__init__.py b/tests/noauto_tensorflow/__init__.py
index dd71f2b28..f05f6cc5c 100644
--- a/tests/noauto_tensorflow/__init__.py
+++ b/tests/noauto_tensorflow/__init__.py
@@ -17,26 +17,13 @@
 workflows dedicated to TensorFlow-based features.
 """
 
-from unittest import TestLoader, TestSuite
+from tests._noauto_loader import make_load_tests
 
-# Names of module to be tested
 test_packages: list[str] = [
     "tests.noauto_tensorflow.testn_tokenize_tensorflow",
 ]
 
-
-def load_tests(
-    loader: TestLoader, standard_tests: TestSuite, pattern: str
-) -> TestSuite:
-    """Load test protocol
-    See: https://docs.python.org/3/library/unittest.html#id1
-    """
-    suite = TestSuite()
-    for test_package in test_packages:
-        tests = loader.loadTestsFromName(test_package)
-        suite.addTests(tests)
-    return suite
-
+load_tests = make_load_tests(test_packages)
 
 if __name__ == "__main__":
     from unittest import main
diff --git a/tests/noauto_torch/__init__.py b/tests/noauto_torch/__init__.py
index 1b97e04f6..2a337528b 100644
--- a/tests/noauto_torch/__init__.py
+++ b/tests/noauto_torch/__init__.py
@@ -19,9 +19,8 @@
 workflows dedicated to PyTorch-based features.
 """
 
-from unittest import TestLoader, TestSuite
+from tests._noauto_loader import make_load_tests
 
-# Names of module to be tested
 test_packages: list[str] = [
     "tests.noauto_torch.testn_augment_torch",
     "tests.noauto_torch.testn_lm_torch",
@@ -33,19 +32,7 @@
     "tests.noauto_torch.testn_transliterate_torch",
 ]
 
-
-def load_tests(
-    loader: TestLoader, standard_tests: TestSuite, pattern: str
-) -> TestSuite:
-    """Load test protocol
-    See: https://docs.python.org/3/library/unittest.html#id1
-    """
-    suite = TestSuite()
-    for test_package in test_packages:
-        tests = loader.loadTestsFromName(test_package)
-        suite.addTests(tests)
-    return suite
-
+load_tests = make_load_tests(test_packages)
 
 if __name__ == "__main__":
     from unittest import main