marcotcr · noyuri2z · Nov 2, 2025 · Nov 2, 2025 · Nov 2, 2025 · Nov 2, 2025
diff --git a/.github/workflows/text-tests.yml b/.github/workflows/text-tests.yml
@@ -0,0 +1,28 @@
+name: Text tests
+
+on:  # run on pushes to, and pull requests against, main/master
+  push:
+    branches: [ main, master ]
+  pull_request:
+    branches: [ main, master ]
+
+jobs:
+  tests:
+    runs-on: ubuntu-latest
+    steps:
+      - name: Checkout
+        uses: actions/checkout@v4
+
+      - name: Set up Python
+        uses: actions/setup-python@v5
+        with:
+          python-version: '3.11'
+
+      - name: Install dependencies
+        run: |
+          python -m pip install --upgrade pip
+          pip install -e ".[dev]"  # quoted so the shell never expands [dev] as a glob
+
+      - name: Run narrative/text-only tests
+        run: |
+          pytest tests/test_lime_explanation_narrative.py tests/test_plain_text_summary*.py -q
diff --git a/README.md b/README.md
@@ -84,6 +84,22 @@ The raw (non-html) notebooks for these tutorials are available [here](https://gi
 
 The API reference is available [here](https://lime-ml.readthedocs.io/en/latest/).
 
+## Japanese text tokenization (Sudachi)
+
+For Japanese (`lang='jp'`) tokenization, this fork uses Sudachi to avoid external MeCab/UniDic setup.
+
+- Install SudachiPy:
+  - `pip install sudachipy`
+- Install a dictionary (if not already bundled via your Sudachi distribution):
+  - Core dictionary: `pip install sudachidict_core`
+- Usage:
+  - `explainer = LimeTextExplainer(lang='jp')` uses the Sudachi-backed splitter automatically.
+
+Notes:
+- The tokenizer is instantiated once (singleton) for performance.
+- If Sudachi is not installed, `LimeTextExplainer(lang='jp')` will raise an ImportError with instructions.
+- A minimal character-based fallback exists in `lime.japanese.mecab_unidic_split` (implemented as `lime.japanese.splitters.split`) to keep basic tests runnable without Sudachi, but production usage should install Sudachi.
+
 ## What are explanations?
 
 Intuitively, an explanation is a local linear approximation of the model's behaviour.

diff --git a/lime/japanese/__init__.py b/lime/japanese/__init__.py
@@ -0,0 +1,10 @@
+"""Japanese-specific text processing utilities for LIME.
+
+Export a small, stable API and keep implementation details in separate
+modules. Users can import `lime.japanese` when they need Japanese-specific
+functionality.
+"""
+
+from .splitters import split as mecab_unidic_split, active_japanese_tokenizer  # type: ignore
+
+__all__ = ["mecab_unidic_split", "active_japanese_tokenizer"]
diff --git a/lime/japanese/splitters.py b/lime/japanese/splitters.py
@@ -0,0 +1,33 @@
+"""High-level splitter functions for Japanese text.
+
+These functions use singleton tokenizer instances from tokenizers.py and
+provide simple splitter APIs. When Sudachi is unavailable they fall back to
+character-level splitting instead of raising ImportError.
+"""
+from .tokenizers import _SUDACHI_TOKENIZER, _SUDACHI_MODE, has_sudachi
+
+
+def active_japanese_tokenizer():
+    """Name the tokenizer backend in use: 'sudachi' or 'fallback'."""
+    # 'sudachi' requires the tokenizer in tokenizers.py to have been
+    # initialized; any initialization failure there means 'fallback'.
+    if has_sudachi():
+        return 'sudachi'
+    return 'fallback'
+
+
+def split(text):
+    """Tokenize Japanese text into a list of surface strings.
+
+    Sudachi is used when it was initialized successfully; otherwise every
+    non-whitespace character becomes its own token. This function is also
+    exported from `lime.japanese` under the name `mecab_unidic_split`.
+    """
+    if has_sudachi():
+        # Each Sudachi morpheme exposes its token text through surface().
+        morphemes = _SUDACHI_TOKENIZER.tokenize(text, _SUDACHI_MODE)
+        return [morpheme.surface() for morpheme in morphemes]
+    return [char for char in text if not char.isspace()]
+
+
+__all__ = ["split", "active_japanese_tokenizer"]
diff --git a/lime/japanese/tests/test_sudachi_backend.py b/lime/japanese/tests/test_sudachi_backend.py
@@ -0,0 +1,30 @@
+"""Tests for Sudachi-backed Japanese tokenization.
+
+These tests are skipped if `sudachipy` is not installed so CI can run
+without the optional dependency.
+"""
+import pytest
+
+
+def _is_installed(pkg):  # True when importing `pkg` succeeds
+    try:
+        __import__(pkg)
+    except Exception:  # any import-time failure counts as "not installed"
+        return False
+    return True
+
+
+@pytest.mark.skipif(not _is_installed('sudachipy'), reason='sudachipy not installed')
+def test_active_backend_is_sudachi():
+    from lime import japanese
+    assert japanese.active_japanese_tokenizer() == 'sudachi'  # i.e. not 'fallback'
+
+
+@pytest.mark.skipif(not _is_installed('sudachipy'), reason='sudachipy not installed')
+def test_sudachi_splitter_returns_tokens():
+    """The exported splitter yields a list of at least two string tokens."""
+    from lime.japanese import mecab_unidic_split
+    tokens = mecab_unidic_split("今日はいい天気です。")
+    assert isinstance(tokens, list)
+    assert all(isinstance(token, str) for token in tokens)
+    assert len(tokens) >= 2
diff --git a/lime/japanese/tests/test_tokenizers.py b/lime/japanese/tests/test_tokenizers.py
@@ -0,0 +1,40 @@
+"""Tests for the `lime.japanese` package tokenizers and splitters.
+
+Each test is skipped when the optional package named in its `skipif` marker
+is not installed, so the repository can be tested without that dependency.
+"""
+
+import pytest
+
+
+def _is_installed(pkg):  # True when importing `pkg` succeeds
+    try:
+        __import__(pkg)
+    except Exception:  # any import-time failure counts as "not installed"
+        return False
+    return True
+
+
+@pytest.mark.skipif(not _is_installed('sudachipy'), reason='sudachipy not installed')
+def test_mecab_unidic_singleton_and_splitter():
+    """Sudachi backs the tokenizer singleton and `split` returns string tokens."""
+    from lime.japanese import tokenizers, splitters
+    assert tokenizers.has_sudachi()  # tokenizers.py defines has_sudachi, not has_fugashi
+
+    text = "私は学生です。"
+    toks = splitters.split(text)  # splitters defines `split`, not `mecab_unidic_split`
+    assert isinstance(toks, list)
+    assert len(toks) >= 2
+    assert all(isinstance(t, str) for t in toks)
+
+
+@pytest.mark.skipif(not _is_installed('fugashi'), reason='fugashi not installed')
+def test_utils_shim_works():
+    # Ensure the old shim still works. NOTE(review): confirm it needs fugashi.
+    from lime import utils
+    from lime.japanese import splitters
+
+    assert getattr(utils, 'mecab_unidic_split', None) is not None
+
+    text = "今日はいい天気です。"
+    assert splitters.split(text) == utils.mecab_unidic_split(text)
diff --git a/lime/japanese/tokenizers.py b/lime/japanese/tokenizers.py
@@ -0,0 +1,24 @@
+"""Singleton tokenizer instances for Japanese processing.
+
+Use Sudachi (Python bindings) to avoid external MeCab/UniDic setup.
+The tokenizer is instantiated once at import for performance.
+"""
+
+try:
+    # SudachiPy API (compatible with sudachi.rs python bindings); runs once at import.
+    from sudachipy import tokenizer as _sudachi_tokenizer  # type: ignore
+    from sudachipy import dictionary as _sudachi_dictionary  # type: ignore
+
+    _SUDACHI_TOKENIZER = _sudachi_dictionary.Dictionary().create()
+    _SUDACHI_MODE = _sudachi_tokenizer.Tokenizer.SplitMode.C  # mode handed to tokenize()
+except Exception:  # import or initialization failed: mark Sudachi unavailable
+    _SUDACHI_TOKENIZER = None
+    _SUDACHI_MODE = None
+
+
+def has_sudachi():
+    """Tell whether the module-level Sudachi tokenizer is available."""
+    return not (_SUDACHI_TOKENIZER is None)
+
+
+__all__ = ["_SUDACHI_TOKENIZER", "_SUDACHI_MODE", "has_sudachi"]