From bbe6709834e0e35b30f9c918cc1b0ef3136f1193 Mon Sep 17 00:00:00 2001 From: AshenWELI Date: Wed, 20 May 2026 17:14:21 +0200 Subject: [PATCH] Added Sinhala test cases for spaCy, implemented si_tokenizer and si_vocab, and fixed version issues in the requirements file. --- git_info.py | 3 + requirements.txt | 2 +- run_test_si.py | 11 ++ spacy/lang/si/lex_attrs.py | 106 ++++++++++++------- spacy/tests/conftest.py | 8 ++ spacy/tests/lang/si/__init__.py | 0 spacy/tests/lang/si/test_text.py | 69 ++++++++++++ spacy/tests/lang/si/test_tokenizer.py | 23 ++++ spacy/tests/package/test.cfg | 147 ++++++++++++++++++++++++++ spacy/tests/package/test.toml | 74 +++++++++++++ spacy/tests/package/test.txt | 37 +++++++ 11 files changed, 438 insertions(+), 42 deletions(-) create mode 100644 git_info.py create mode 100644 run_test_si.py create mode 100644 spacy/tests/lang/si/__init__.py create mode 100644 spacy/tests/lang/si/test_text.py create mode 100644 spacy/tests/lang/si/test_tokenizer.py create mode 100644 spacy/tests/package/test.cfg create mode 100644 spacy/tests/package/test.toml create mode 100644 spacy/tests/package/test.txt diff --git a/git_info.py b/git_info.py new file mode 100644 index 00000000000..373b9f10599 --- /dev/null +++ b/git_info.py @@ -0,0 +1,3 @@ +# THIS FILE IS GENERATED FROM SPACY SETUP.PY +# +GIT_VERSION = "Unknown" diff --git a/requirements.txt b/requirements.txt index 50c6382bea3..a36209c4b25 100644 --- a/requirements.txt +++ b/requirements.txt @@ -34,4 +34,4 @@ types-requests types-setuptools>=57.0.0 ruff>=0.9.0 cython-lint>=0.15.0 -confection>=1.1.0,<2.0.0 +confection>=1.3.2,<2.0.0 diff --git a/run_test_si.py b/run_test_si.py new file mode 100644 index 00000000000..81c8969207b --- /dev/null +++ b/run_test_si.py @@ -0,0 +1,11 @@ +import spacy + +nlp = spacy.blank("si") + +text = "එහි අරමුණු කිහිපයකි. 
ඒවා අතර;" +doc = nlp(text) + +print(f"Total tokens: {len(doc)}") +print() +for i, token in enumerate(doc): + print(f"{i+1:>3} {token.text}") \ No newline at end of file diff --git a/spacy/lang/si/lex_attrs.py b/spacy/lang/si/lex_attrs.py index aa061852d07..092f83b4710 100644 --- a/spacy/lang/si/lex_attrs.py +++ b/spacy/lang/si/lex_attrs.py @@ -2,48 +2,55 @@ _num_words = [ "බින්දුව", - "බිංදුව", - "එක", - "දෙක", - "තුන", - "හතර", - "පහ", - "හය", - "හත", - "අට", - "නවය", - "නමය", - "දහය", - "එකොළහ", - "දොළහ", - "දහතුන", - "දහහතර", - "දාහතර", - "පහළව", - "පහළොව", - "දහසය", - "දහහත", - "දාහත", - "දහඅට", - "දහනවය", - "විස්ස", - "තිහ", - "හතළිහ", - "පනහ", - "හැට", - "හැත්තෑව", - "අසූව", - "අනූව", - "සියය", - "දහස", - "දාහ", - "ලක්ෂය", - "මිලියනය", - "කෝටිය", - "බිලියනය", - "ට්‍රිලියනය", -] + "බිංදුව","එක","දෙක","තුන","හතර","පහ","හය","හත","අට","නවය","නමය","දහය", + "එකොළහ","දොළහ","දහතුන","දහහතර","දාහතර","පහළව","පහළොව","දහසය","දහහත","දාහත","දහඅට","දහනවය", + "විස්ස","තිහ","හතළිහ","පනහ","හැට","හැත්තෑව","අසූව","අනූව","සියය","සියවෙනි", + "දහස","දාහ","ලක්ෂය","මිලියනය","කෝටිය","බිලියනය","ට්‍රිලියනය", + ] +_ordinal_words = [ + "පළමු", # first + "දෙවන", # second + "තෙවන","තුන්වන", # third + "සතරවන", # fourth + "පස්වන", # fifth + "හයවන", # sixth + "හත්වන", # seventh + "අටවන", # eighth + "නවවන", # ninth + "දහවන", # tenth + "එකොළොස්වන", # eleventh + "දොළොස්වන", # twelfth + "දහතුන්වන", # thirteenth + "දහහතරවන", # fourteenth + "පහලොස්වන", # fifteenth + "දහසයවන", # sixteenth + "දහහත්වන", # seventeenth + "දහඅටවන", # eighteenth + "දහනවවන", # nineteenth + "විසිවන", # twentieth + "තිස්වන", # thirtieth + "හතළිස්වන", # fortieth + "පනස්වන", # fiftieth + "හැටවන", # sixtieth + "හැත්තෑවන", # seventieth + "අසූවන", # eightieth + "අනූවන", # ninetieth + "සියවන", # hundredth + "දහස්වන", # thousandth + "මිලියනවන", # millionth + "බිලියනවන", # billionth + "ට්‍රිලියනවන", # trillionth + "ක්වාඩ්‍රිලියන්වන", # quadrillionth + "ක්වින්ටිලියන්වන", # quintillionth + "සෙක්ස්ටිලියන්වන", # sextillionth + 
"සෙප්ටිලියන්වන", # septillionth + "ඔක්ටිලියන්වන", # octillionth + "නොනිලියන්වන", # nonillionth + "ඩෙසිලියන්වන", # decillionth + "ගජිලියන්වන", # gajillionth (informal/made-up) + "බසිලියන්වන", # bazillionth (informal/made-up) +] def like_num(text): text = text.replace(",", "").replace(".", "") @@ -55,7 +62,24 @@ def like_num(text): return True if text.lower() in _num_words: return True + # Sinhala ordinal suffix check — no .lower() needed + if text in _ordinal_words: + return True + # "23 වෙනි" / "100 වන" — with space + parts = text.split() + if len(parts) == 2 and parts[0].isdigit() and parts[1] in ("වන", "වෙනි"): + return True + # "තුන්වන", "සියවන" — suffix attached + if text.endswith("වෙනි"): + stem = text[:-4] + if stem.isdigit() or stem in _num_words: + return True + if text.endswith("වන"): + stem = text[:-2] + if stem.isdigit() or stem in _num_words: + return True return False + LEX_ATTRS = {LIKE_NUM: like_num} diff --git a/spacy/tests/conftest.py b/spacy/tests/conftest.py index ae5255c287b..b6cf0a22462 100644 --- a/spacy/tests/conftest.py +++ b/spacy/tests/conftest.py @@ -401,6 +401,14 @@ def sl_tokenizer(): def sr_tokenizer(): return get_lang_class("sr")().tokenizer +@pytest.fixture(scope="session") +def si_tokenizer(): + return get_lang_class("si")().tokenizer + + +@pytest.fixture(scope="session") +def si_vocab(): + return get_lang_class("si")().vocab @pytest.fixture(scope="session") def sq_tokenizer(): diff --git a/spacy/tests/lang/si/__init__.py b/spacy/tests/lang/si/__init__.py new file mode 100644 index 00000000000..e69de29bb2d diff --git a/spacy/tests/lang/si/test_text.py b/spacy/tests/lang/si/test_text.py new file mode 100644 index 00000000000..d1850ece0d9 --- /dev/null +++ b/spacy/tests/lang/si/test_text.py @@ -0,0 +1,69 @@ +import pytest + +from spacy.lang.si.lex_attrs import like_num + +# note: this text taken from https://www.bbc.com/sinhala/articles/cgmpy0kpljno +def test_si_tokenizer_handles_long_text(si_tokenizer): + text = """දිනය 2025 
නොවැම්බර් 27 වන දා යි. + +එදින මහනුවර සිට නාවලපිටිය බලා ධාවනය වන මගී දුම්රිය පස්වරු 2.06ට පමණ මහනුවර දුම්රිය ස්ථානයෙන් තම දුම්රිය ගමන ආරම්භ කර තිබුණේ, +දැඩි වර්ෂාව මධ්‍යයේ ය.මැදිරි හතරකින් සමන්විත මෙම දුම්රිය කෙටිදුර ධාවනයේ නිරත වන මන්දගාමී දුම්රියකි. +එදින මෙම දුම්රිය ගමනාන්තය කරා ගෙන යාමේ කාර්යය භාර වී තිබුණේ, විශේෂ පන්ති දුම්රිය රියැදුරෙකු වු ජයම්පති මඩිගසේකරට ය. + +1982 වසරේ දුම්රිය සේවයට එක්වූ ජයම්පති මඩිගසේකර දැනට විශ්‍රාම ගොස් වසර 6කි.""" + tokens = si_tokenizer(text) + assert len(tokens) == 83 + + +@pytest.mark.parametrize( + "text,length", + [ + ("""හැන්දෑවේ 4 - 5ට විතර හොඳට ම වැස්සා මගේ ජීවිතේට ම දැකල නැති වැස්සක්.""", 15), + ("ඩොලරයේ අගය ඉහළ ගියේ ඇයි ?", 6), + ("මහනුවර සිට නාවලපිටිය", 3), + ("ශ්‍රී ලාංකිකයන් බොහෝ දෙනෙකු සිංහල අලුත් අවුරුද්ද සැමරීමට සූදානම්.", 10), + ("සිසුන් 110,000කට පමණ වෘත්තීය අධ්‍යාපනය ලබා දීම", 7), + ("එහෙමද? මම එහෙම කීවේ නැ!", 7), + ("එහි අරමුණු කිහිපයකි. ඒවා අතර;", 7), + ], +) +def test_si_tokenizer_handles_cnts(si_tokenizer, text, length): + tokens = si_tokenizer(text) + assert len(tokens) == length + + +@pytest.mark.parametrize( + "text,match", + [ + ("10", True), + ("1", True), + ("10,000", True), + ("1,000", True), + ("999.0", True), + ("එක", True), + ("දෙක", True), + ("බිලියනය", True), + ("බල්ලා", False), + (",", False), + ("1/2", True), + ("අශේන්", False), + ("වැලිගල්ල", False), + ], +) +def test_lex_attrs_like_number(si_tokenizer, text, match): + tokens = si_tokenizer(text) + assert len(tokens) == 1 + assert tokens[0].like_num == match + + +@pytest.mark.parametrize( + "word", ["තෙවන", "මිලියනවන", "100වන", "සියවන", "23වෙනි", "52වෙනි"] +) +def test_si_lex_attrs_like_number_for_ordinal(word): + assert like_num(word) + + +@pytest.mark.parametrize("word", ["එකොළොස්වන"]) +def test_si_lex_attrs_capitals(word): + assert like_num(word) + assert like_num(word.upper()) diff --git a/spacy/tests/lang/si/test_tokenizer.py b/spacy/tests/lang/si/test_tokenizer.py new file mode 100644 index 00000000000..2c10da7a982 --- /dev/null +++ 
b/spacy/tests/lang/si/test_tokenizer.py @@ -0,0 +1,23 @@ +import pytest + +SI_TOKEN_EXCEPTION_TESTS = [ + ( + "ශ්‍රී ලංකා හමුදාව නිර්භීතව ත්‍රස්තවාදීන් පරාජය කලහ.", + ["ශ්‍රී", "ලංකා", "හමුදාව", "නිර්භීතව", "ත්‍රස්තවාදීන්", "පරාජය", "කලහ","."], + ), + ( + "සමන්, කරුණාකරලා 10වෙනි පිටුව පෙරලලා කියවන්න.", + ["සමන්",",", "කරුණාකරලා", "10වෙනි", "පිටුව", "පෙරලලා", "කියවන්න", "."], + ), + ( + "දෙවන විමලධර්මසූරිය රජුගේ කාලයේ බුද්ධාගම ප්‍රචලිත කලේය.", + ["දෙවන", "විමලධර්මසූරිය", "රජුගේ", "කාලයේ", "බුද්ධාගම", "ප්‍රචලිත", "කලේය","."], + ), +] + + +@pytest.mark.parametrize("text,expected_tokens", SI_TOKEN_EXCEPTION_TESTS) +def test_si_tokenizer_handles_exception_cases(si_tokenizer, text, expected_tokens): + tokens = si_tokenizer(text) + token_list = [token.text for token in tokens if not token.is_space] + assert token_list == expected_tokens diff --git a/spacy/tests/package/test.cfg b/spacy/tests/package/test.cfg new file mode 100644 index 00000000000..7f9b200caed --- /dev/null +++ b/spacy/tests/package/test.cfg @@ -0,0 +1,147 @@ +[metadata] +description = Industrial-strength Natural Language Processing (NLP) in Python +url = https://spacy.io +author = Explosion +author_email = contact@explosion.ai +license = MIT +long_description = file: README.md +long_description_content_type = text/markdown +classifiers = + Development Status :: 5 - Production/Stable + Environment :: Console + Intended Audience :: Developers + Intended Audience :: Science/Research + License :: OSI Approved :: MIT License + Operating System :: POSIX :: Linux + Operating System :: MacOS :: MacOS X + Operating System :: Microsoft :: Windows + Programming Language :: Cython + Programming Language :: Python :: 3 + Programming Language :: Python :: 3.9 + Programming Language :: Python :: 3.10 + Programming Language :: Python :: 3.11 + Programming Language :: Python :: 3.12 + Programming Language :: Python :: 3.13 + Programming Language :: Python :: 3.14 + Topic :: Scientific/Engineering +project_urls = + Release 
notes = https://github.com/explosion/spaCy/releases + Source = https://github.com/explosion/spaCy + +[options] +zip_safe = false +include_package_data = true +python_requires = >=3.9,<3.15 +# NOTE: This section is superseded by pyproject.toml and will be removed in +# spaCy v4 +setup_requires = + cython>=3.0,<4.0 + numpy>=2.0.0,<3.0.0; python_version < "3.9" + numpy>=2.0.0,<3.0.0; python_version >= "3.9" + # We also need our Cython packages here to compile against + cymem>=2.0.2,<2.1.0 + preshed>=3.0.2,<3.1.0 + murmurhash>=0.28.0,<1.1.0 + thinc>=8.3.12,<8.4.0 +install_requires = + # Our libraries + spacy-legacy>=3.0.11,<3.1.0 + spacy-loggers>=1.0.0,<2.0.0 + murmurhash>=0.28.0,<1.1.0 + cymem>=2.0.2,<2.1.0 + preshed>=3.0.2,<3.1.0 + thinc>=8.3.12,<8.4.0 + wasabi>=0.9.1,<1.2.0 + srsly>=2.5.3,<3.0.0 + catalogue>=2.0.6,<2.1.0 + weasel>=1.0.0,<2.0.0 + confection>=1.3.2,<2.0.0 + # Third-party dependencies + typer>=0.3.0,<1.0.0 + tqdm>=4.38.0,<5.0.0 + numpy>=1.15.0; python_version < "3.9" + numpy>=1.19.0; python_version >= "3.9" + requests>=2.13.0,<3.0.0 + pydantic>=2.0.0,<3.0.0 + jinja2 + # Official Python utilities + setuptools + packaging>=20.0 + +[options.entry_points] +console_scripts = + spacy = spacy.cli:setup_cli + +[options.extras_require] +lookups = + spacy_lookups_data>=1.0.3,<1.1.0 +transformers = + spacy_transformers>=1.1.2,<1.4.0 +cuda = + cupy>=5.0.0b4,<13.0.0 +cuda80 = + cupy-cuda80>=5.0.0b4,<13.0.0 +cuda90 = + cupy-cuda90>=5.0.0b4,<13.0.0 +cuda91 = + cupy-cuda91>=5.0.0b4,<13.0.0 +cuda92 = + cupy-cuda92>=5.0.0b4,<13.0.0 +cuda100 = + cupy-cuda100>=5.0.0b4,<13.0.0 +cuda101 = + cupy-cuda101>=5.0.0b4,<13.0.0 +cuda102 = + cupy-cuda102>=5.0.0b4,<13.0.0 +cuda110 = + cupy-cuda110>=5.0.0b4,<13.0.0 +cuda111 = + cupy-cuda111>=5.0.0b4,<13.0.0 +cuda112 = + cupy-cuda112>=5.0.0b4,<13.0.0 +cuda113 = + cupy-cuda113>=5.0.0b4,<13.0.0 +cuda114 = + cupy-cuda114>=5.0.0b4,<13.0.0 +cuda115 = + cupy-cuda115>=5.0.0b4,<13.0.0 +cuda116 = + cupy-cuda116>=5.0.0b4,<13.0.0 +cuda117 = + 
cupy-cuda117>=5.0.0b4,<13.0.0 +cuda11x = + cupy-cuda11x>=11.0.0,<13.0.0 +cuda12x = + cupy-cuda12x>=11.5.0,<13.0.0 +cuda-autodetect = + cupy-wheel>=11.0.0,<13.0.0 +apple = + thinc-apple-ops>=1.0.0,<2.0.0 +# Language tokenizers with external dependencies +ja = + sudachipy>=0.5.2,!=0.6.1 + sudachidict_core>=20211220 +ko = + natto-py>=0.9.0 +th = + pythainlp>=2.0 + +[bdist_wheel] +universal = false + +[sdist] +formats = gztar + +[tool:pytest] +markers = + slow: mark a test as slow + issue: reference specific issue +filterwarnings = + error + ignore:Core Pydantic V1:UserWarning:pydantic + +[mypy] +ignore_missing_imports = True +no_implicit_optional = True +plugins = pydantic.mypy, thinc.mypy +allow_redefinition = True diff --git a/spacy/tests/package/test.toml b/spacy/tests/package/test.toml new file mode 100644 index 00000000000..395c2f7a108 --- /dev/null +++ b/spacy/tests/package/test.toml @@ -0,0 +1,74 @@ +[build-system] +requires = [ + "setuptools", + "cython>=3.0,<4.0", + "cymem>=2.0.2,<2.1.0", + "preshed>=3.0.2,<3.1.0", + "murmurhash>=0.28.0,<1.1.0", + "thinc>=8.3.12,<8.4.0", + "numpy>=2.0.0,<3.0.0" +] +build-backend = "setuptools.build_meta" + +[tool.cibuildwheel] +build = "*" +skip = "cp39* *-win32 *i686* cp3??t-* *cp310-win_arm64" +test-skip = "" + +archs = ["native"] + +build-frontend = "default" +config-settings = {} +dependency-versions = "pinned" +environment = { PIP_CONSTRAINT = "build-constraints.txt" } + +environment-pass = [] +build-verbosity = 0 + +before-all = "curl https://sh.rustup.rs -sSf | sh -s -- -y --profile minimal --default-toolchain stable" +before-build = "pip install -r requirements.txt && python setup.py clean" +repair-wheel-command = "" + +test-command = "" +before-test = "" +test-requires = [] +test-extras = [] + +container-engine = "docker" + +manylinux-x86_64-image = "manylinux2014" +manylinux-i686-image = "manylinux2014" +manylinux-aarch64-image = "manylinux2014" +manylinux-ppc64le-image = "manylinux2014" +manylinux-s390x-image = 
"manylinux2014" +manylinux-pypy_x86_64-image = "manylinux2014" +manylinux-pypy_i686-image = "manylinux2014" +manylinux-pypy_aarch64-image = "manylinux2014" + +musllinux-x86_64-image = "musllinux_1_2" +musllinux-i686-image = "musllinux_1_2" +musllinux-aarch64-image = "musllinux_1_2" +musllinux-ppc64le-image = "musllinux_1_2" +musllinux-s390x-image = "musllinux_1_2" + +[tool.cibuildwheel.linux] +repair-wheel-command = "auditwheel repair -w {dest_dir} {wheel}" + +[tool.cibuildwheel.macos] +repair-wheel-command = "delocate-wheel --require-archs {delocate_archs} -w {dest_dir} -v {wheel}" + +[tool.cibuildwheel.windows] + +[tool.cibuildwheel.pyodide] + + +[tool.ruff] +line-length = 88 + +[tool.ruff.lint] +select = ["E", "F", "W", "C", "B", "B9"] +ignore = ["E203", "E266", "E501", "E731", "E741", "F541"] + +[tool.ruff.lint.isort] +combine-as-imports = true +split-on-trailing-comma = true diff --git a/spacy/tests/package/test.txt b/spacy/tests/package/test.txt new file mode 100644 index 00000000000..a36209c4b25 --- /dev/null +++ b/spacy/tests/package/test.txt @@ -0,0 +1,37 @@ +# Our libraries +spacy-legacy>=3.0.11,<3.1.0 +spacy-loggers>=1.0.0,<2.0.0 +cymem>=2.0.2,<2.1.0 +preshed>=3.0.2,<3.1.0 +thinc>=8.3.12,<8.4.0 +ml_datasets>=0.2.1,<0.3.0 +murmurhash>=0.28.0,<1.1.0 +wasabi>=0.9.1,<1.2.0 +srsly>=2.5.3,<3.0.0 +catalogue>=2.0.6,<2.1.0 +typer>=0.3.0,<1.0.0 +weasel>=1.0.0,<2.0.0 +# Third party dependencies +numpy>=2.0.0,<3.0.0 +requests>=2.13.0,<3.0.0 +tqdm>=4.38.0,<5.0.0 +pydantic>=2.0.0,<3.0.0 +jinja2 +# Official Python utilities +setuptools +packaging>=20.0 +# Development dependencies +pre-commit>=2.13.0 +cython>=3.0,<4.0 +pytest>=5.2.0,!=7.1.0 +pytest-timeout>=1.3.0,<2.0.0 +mock>=2.0.0,<3.0.0 +hypothesis>=3.27.0,<7.0.0 +mypy>=1.5.0,<1.6.0; platform_machine != "aarch64" and python_version >= "3.8" +types-mock>=0.1.1 +types-setuptools>=57.0.0 +types-requests +types-setuptools>=57.0.0 +ruff>=0.9.0 +cython-lint>=0.15.0 +confection>=1.3.2,<2.0.0