Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
The table of contents is too big for display.
Diff view
Diff view
  •  
  •  
  •  
The diff you're trying to view is too large. We only load the first 3000 changed files.
28 changes: 28 additions & 0 deletions .github/workflows/text-tests.yml
Original file line number Diff line number Diff line change
@@ -0,0 +1,28 @@
# CI workflow that runs the narrative/text-only test subset.
name: Text tests

# Run on pushes to, and pull requests against, the main/master branches.
on:
  push:
    branches:
      - main
      - master
  pull_request:
    branches:
      - main
      - master

jobs:
  tests:
    runs-on: ubuntu-latest
    steps:
      - name: Checkout
        uses: actions/checkout@v4

      - name: Set up Python
        uses: actions/setup-python@v5
        with:
          python-version: '3.11'

      # Editable install of the package together with its "dev" extra.
      - name: Install dependencies
        run: |
          python -m pip install --upgrade pip
          pip install -e .[dev]

      # Only the narrative and plain-text-summary test modules are run here.
      - name: Run narrative/text-only tests
        run: |
          pytest tests/test_lime_explanation_narrative.py tests/test_plain_text_summary*.py -q
16 changes: 16 additions & 0 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -84,6 +84,22 @@ The raw (non-html) notebooks for these tutorials are available [here](https://gi

The API reference is available [here](https://lime-ml.readthedocs.io/en/latest/).

## Japanese text tokenization (Sudachi)

For Japanese (`lang='jp'`) tokenization, this fork uses Sudachi to avoid external MeCab/UniDic setup.

- Install SudachiPy:
- `pip install sudachipy`
- Install a dictionary (if not already bundled via your Sudachi distribution):
- Core dictionary: `pip install sudachidict_core`
- Usage:
- `explainer = LimeTextExplainer(lang='jp')` will use Sudachi-backed splitter automatically.

Notes:
- The tokenizer is instantiated once (singleton) for performance.
- If Sudachi is not installed, `LimeTextExplainer(lang='jp')` will raise an ImportError with instructions.
- A minimal character-based fallback exists in `lime.japanese.mecab_unidic_split` (implemented by `lime.japanese.splitters.split`) to keep basic tests runnable without Sudachi, but production usage should install Sudachi.

## What are explanations?

Intuitively, an explanation is a local linear approximation of the model's behaviour.
Expand Down
10 changes: 10 additions & 0 deletions lime/japanese/__init__.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,10 @@
"""Japanese-specific text processing utilities for LIME.

Export a small, stable API and keep implementation details in separate
modules. Users can import `lime.japanese` when they need Japanese-specific
functionality.
"""

from .splitters import split as mecab_unidic_split, active_japanese_tokenizer # type: ignore

__all__ = ["mecab_unidic_split", "active_japanese_tokenizer"]
33 changes: 33 additions & 0 deletions lime/japanese/splitters.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,33 @@
"""High-level splitter functions for Japanese text.

These functions use the singleton tokenizer instance from tokenizers.py and
provide simple splitter APIs. When Sudachi could not be initialized, `split`
falls back to character-level tokenization instead of raising.
"""
from .tokenizers import _SUDACHI_TOKENIZER, _SUDACHI_MODE, has_sudachi


def active_japanese_tokenizer():
    """Name the Japanese tokenizer backend currently in use.

    Returns:
        ``'sudachi'`` when ``has_sudachi()`` reports that the Sudachi
        tokenizer was initialized, otherwise ``'fallback'``.
    """
    if has_sudachi():
        return 'sudachi'
    return 'fallback'


def split(text):
    """Tokenize Japanese ``text`` into a list of token strings.

    When the Sudachi tokenizer is available, the text is tokenized with it
    and each morpheme's surface form is returned. Otherwise a simple
    fallback is used: every non-whitespace character becomes its own token.

    Note: this function is exported from ``lime.japanese`` under the legacy
    name ``mecab_unidic_split``; it uses Sudachi, not MeCab.
    """
    if has_sudachi():
        # Sudachi yields morphemes; surface() gives the token string.
        morphemes = _SUDACHI_TOKENIZER.tokenize(text, _SUDACHI_MODE)
        return [morpheme.surface() for morpheme in morphemes]

    # Fallback path: one token per non-space character.
    return [char for char in text if not char.isspace()]


__all__ = ["split", "active_japanese_tokenizer"]
30 changes: 30 additions & 0 deletions lime/japanese/tests/test_sudachi_backend.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,30 @@
"""Tests for Sudachi-backed Japanese tokenization.

These tests are skipped if `sudachipy` is not installed so CI can run
without the optional dependency.
"""
import pytest


def _is_installed(pkg):
    """Report whether the module named ``pkg`` can be imported.

    Any exception raised while importing (not only ImportError) is treated
    as "not installed".
    """
    try:
        __import__(pkg)
    except Exception:
        return False
    return True


@pytest.mark.skipif(not _is_installed('sudachipy'), reason='sudachipy not installed')
def test_active_backend_is_sudachi():
    """The reported backend is 'sudachi' when sudachipy is importable.

    The skip condition only checks that the package imports; the backend
    name itself depends on the tokenizer having been initialized.
    """
    from lime.japanese import active_japanese_tokenizer

    backend = active_japanese_tokenizer()
    assert backend == 'sudachi'


@pytest.mark.skipif(not _is_installed('sudachipy'), reason='sudachipy not installed')
def test_sudachi_splitter_returns_tokens():
    """The exported splitter returns a list of at least two string tokens."""
    from lime.japanese import mecab_unidic_split

    sentence = "今日はいい天気です。"
    result = mecab_unidic_split(sentence)

    assert isinstance(result, list)
    for token in result:
        assert isinstance(token, str)
    assert len(result) >= 2
40 changes: 40 additions & 0 deletions lime/japanese/tests/test_tokenizers.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,40 @@
"""Tests for the `lime.japanese` package tokenizers and splitters.

Each test is skipped when the optional package named in its `skipif` marker
is not installed, so the repository can be tested on systems without it.
"""

import pytest


def _is_installed(pkg):
    """Return True if importing ``pkg`` succeeds, False otherwise.

    Every exception raised by the import attempt is swallowed and reported
    as False, so a broken installation counts as missing.
    """
    try:
        __import__(pkg)
    except Exception:
        importable = False
    else:
        importable = True
    return importable


@pytest.mark.skipif(not _is_installed('sudachipy'), reason='sudachipy not installed')
def test_mecab_unidic_singleton_and_splitter():
    """Check the Sudachi singleton is initialized and the splitter tokenizes.

    The test keeps its legacy name, but the backend is Sudachi: the
    tokenizers module exposes ``has_sudachi`` and the splitters module
    exposes ``split`` (re-exported by the package as ``mecab_unidic_split``).
    """
    from lime.japanese import tokenizers, splitters

    assert tokenizers.has_sudachi()
    # splitters imports the tokenizer object from tokenizers, so both
    # modules must refer to the very same instance.
    assert splitters._SUDACHI_TOKENIZER is tokenizers._SUDACHI_TOKENIZER

    text = "私は学生です。"
    toks = splitters.split(text)
    assert isinstance(toks, list)
    assert len(toks) >= 2
    assert all(isinstance(t, str) for t in toks)


@pytest.mark.skipif(not _is_installed('sudachipy'), reason='sudachipy not installed')
def test_utils_shim_works():
    """Ensure the legacy ``lime.utils.mecab_unidic_split`` shim still works.

    The shim's output is compared with ``lime.japanese.splitters.split``,
    which is the function the splitters module actually defines.
    """
    # NOTE(review): lime.utils is not visible from this file; this assumes
    # the shim delegates to the same splitter - confirm against lime/utils.py.
    from lime import utils
    from lime.japanese import splitters

    assert getattr(utils, 'mecab_unidic_split', None) is not None

    text = "今日はいい天気です。"
    assert splitters.split(text) == utils.mecab_unidic_split(text)
24 changes: 24 additions & 0 deletions lime/japanese/tokenizers.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,24 @@
"""Singleton tokenizer instances for Japanese processing.

Use Sudachi (Python bindings) to avoid external MeCab/UniDic setup.
The tokenizer is instantiated once at import for performance.
"""

# Build the shared Sudachi tokenizer once at import time. Any failure
# (package missing, dictionary cannot be created, ...) leaves both
# module-level names set to None so callers can detect it.
try:
    # SudachiPy API (compatible with sudachi.rs python bindings)
    from sudachipy import tokenizer as _sudachi_tokenizer  # type: ignore
    from sudachipy import dictionary as _sudachi_dictionary  # type: ignore

    # Both values are computed before either name is bound, so a failure
    # in either expression leaves nothing half-initialized.
    _SUDACHI_TOKENIZER, _SUDACHI_MODE = (
        _sudachi_dictionary.Dictionary().create(),
        _sudachi_tokenizer.Tokenizer.SplitMode.C,
    )
except Exception:
    _SUDACHI_TOKENIZER = _SUDACHI_MODE = None


def has_sudachi():
    """Tell whether the module-level Sudachi tokenizer is available.

    Returns:
        True when ``_SUDACHI_TOKENIZER`` is not None, i.e. the import-time
        initialization succeeded; False otherwise.
    """
    tokenizer_ready = _SUDACHI_TOKENIZER is not None
    return tokenizer_ready


__all__ = ["_SUDACHI_TOKENIZER", "_SUDACHI_MODE", "has_sudachi"]
Loading