ml-rust · farhan-syah · Mar 12, 2026 · Feb 27, 2026 · Feb 27, 2026 · Feb 27, 2026
diff --git a/.version b/.version
@@ -1 +1 @@
-0.8.0
+0.9.0
diff --git a/Cargo.toml b/Cargo.toml
@@ -1,29 +1,32 @@
 [package]
 name = "splintr"
-version = "0.8.0"
+version = "0.9.0"
 edition = "2021"
-description = "Fast Rust BPE tokenizer with Python bindings"
+description = "Fast Rust tokenizer (BPE + SentencePiece) with Python bindings"
 license = "MIT"
 repository = "https://github.com/ml-rust/splintr"
 homepage = "https://github.com/ml-rust/splintr"
 readme = "README.md"
-keywords = ["tokenizer", "bpe", "tiktoken", "gpt", "llm"]
+keywords = ["tokenizer", "bpe", "sentencepiece", "tiktoken", "llm"]
 categories = ["text-processing", "encoding"]
 
 [lib]
 name = "splintr"
 crate-type = ["cdylib", "rlib"]
 
 [features]
-default = ["pcre2"]
+default = ["rayon", "regexr-jit"]
 python = ["dep:pyo3"]
 pcre2 = ["dep:pcre2"]
+rayon = ["dep:rayon"]
+regexr-jit = ["regexr/jit", "regexr/simd"]
+wasm = []  # marker only (enables nothing itself) — build with --no-default-features --features wasm to drop rayon and regexr JIT/SIMD
 
 [dependencies]
 # PCRE2 regex with JIT support (optional, for benchmarking)
 pcre2 = { version = "0.2", optional = true }
 # Rayon for internal parallelism
-rayon = "1.10"
+rayon = { version = "1.10", optional = true }
 # Fast hashing (FxHashMap)
 rustc-hash = "2.0"
 # Error handling
@@ -37,7 +40,7 @@ aho-corasick = "1.1"
 # LRU cache for frequent token sequences
 lru = "0.16"
 # regexr regex engine (default backend)
-regexr = { version = "0.1.0-beta.5", features = ["jit", "simd"] }
+regexr = { version = "0.1.0-beta.5", default-features = false }
 
 [dev-dependencies]
 # PCRE2 for benchmarking comparisons

diff --git a/README.md b/README.md
@@ -2,7 +2,7 @@
 
 [![Crates.io](https://img.shields.io/crates/v/splintr.svg)](https://crates.io/crates/splintr) [![PyPI](https://img.shields.io/pypi/v/splintr-rs.svg)](https://pypi.org/project/splintr-rs/) [![License: MIT](https://img.shields.io/badge/License-MIT-blue.svg)](https://opensource.org/licenses/MIT)
 
-**A high-performance BPE tokenizer built with Rust with Python bindings, focused on speed, safety, and resource optimization.**
+**A high-performance tokenizer (BPE + SentencePiece) written in Rust, with Python bindings, focused on speed, safety, and resource optimization.**
 
 ## The Problem
 
@@ -85,7 +85,7 @@ See the [API Guide](docs/api_guide.md) and [docs.rs](https://docs.rs/splintr) fo
 - **Compatible vocabularies** - Supports cl100k_base, o200k_base (OpenAI), Llama 3 family (Meta), DeepSeek V3 (DeepSeek), and Mistral V1/V2/V3 (Mistral AI)
 - **Streaming decoders** - Real-time LLM output display with proper UTF-8 handling ([guide](docs/api_guide.md#streaming-decoder))
 - **54 agent tokens** - Built-in support for chat, CoT reasoning, ReAct agents, tool calling, RAG citations ([docs](docs/special_tokens.md))
-- **Battle-tested algorithms** - Regexr with JIT (pure Rust), Aho-Corasick for special tokens, linked-list BPE
+- **Battle-tested algorithms** - Regexr with JIT (pure Rust), Aho-Corasick for special tokens, linked-list BPE, SentencePiece unigram
 
 **Cross-platform:**
 
@@ -219,15 +219,15 @@ See the [API Guide](docs/api_guide.md#streaming-decoder) for detailed usage, exa
 
 ## Supported Vocabularies
 
-| Vocabulary    | Used By                             | Vocabulary Size | Special Tokens  | Import Constant            |
-| ------------- | ----------------------------------- | --------------- | --------------- | -------------------------- |
-| `cl100k_base` | GPT-4, GPT-3.5-turbo                | ~100,000        | 5 + 54 agent    | `CL100K_BASE_PATTERN`      |
-| `o200k_base`  | GPT-4o                              | ~200,000        | 2 + 54 agent    | `O200K_BASE_PATTERN`       |
-| `llama3`      | Llama 3, 3.1, 3.2, 3.3 (Meta)       | ~128,000        | 11 + 54 agent   | `LLAMA3_PATTERN`           |
-| `deepseek_v3` | DeepSeek V3, DeepSeek R1            | ~128,000        | 17 + 54 agent   | `LLAMA3_PATTERN`           |
-| `mistral_v1`  | Mistral 7B v0.1/v0.2, Mixtral 8x7B  | ~32,000         | 3 + 54 agent    | `SENTENCEPIECE_PATTERN`    |
-| `mistral_v2`  | Mistral 7B v0.3, Codestral, 8x22B   | ~32,768         | 10 + 54 agent   | `SENTENCEPIECE_PATTERN`    |
-| `mistral_v3`  | Mistral NeMo, Large 2, Pixtral      | ~131,000        | 10 + 54 agent   | `MISTRAL_V3_PATTERN`       |
+| Vocabulary    | Used By                            | Vocabulary Size | Special Tokens | Import Constant         |
+| ------------- | ---------------------------------- | --------------- | -------------- | ----------------------- |
+| `cl100k_base` | GPT-4, GPT-3.5-turbo               | ~100,000        | 5 + 54 agent   | `CL100K_BASE_PATTERN`   |
+| `o200k_base`  | GPT-4o                             | ~200,000        | 2 + 54 agent   | `O200K_BASE_PATTERN`    |
+| `llama3`      | Llama 3, 3.1, 3.2, 3.3 (Meta)      | ~128,000        | 11 + 54 agent  | `LLAMA3_PATTERN`        |
+| `deepseek_v3` | DeepSeek V3, DeepSeek R1           | ~128,000        | 17 + 54 agent  | `LLAMA3_PATTERN`        |
+| `mistral_v1`  | Mistral 7B v0.1/v0.2, Mixtral 8x7B | ~32,000         | 3 + 54 agent   | `SENTENCEPIECE_PATTERN` |
+| `mistral_v2`  | Mistral 7B v0.3, Codestral, 8x22B  | ~32,768         | 10 + 54 agent  | `SENTENCEPIECE_PATTERN` |
+| `mistral_v3`  | Mistral NeMo, Large 2, Pixtral     | ~131,000        | 10 + 54 agent  | `MISTRAL_V3_PATTERN`    |
 
 **OpenAI standard tokens:**
 
@@ -279,6 +279,7 @@ Splintr implements several optimizations that make tokenization faster:
 - **Regexr with JIT compilation**: Pure Rust regex engine with SIMD acceleration
 - **Rayon parallelism**: Leverages multiple CPU cores for batch encoding
 - **Linked-list BPE algorithm**: Avoids O(N²) complexity on pathological inputs
+- **SentencePiece unigram**: Greedy longest-match with score-based tie-breaking for Mistral/Llama-style models
 - **FxHashMap**: Faster lookups than default SipHash for non-adversarial contexts
 - **Aho-Corasick for special tokens**: Fast multi-pattern matching without regex alternation
 - **LRU cache**: Avoids redundant BPE encoding of frequently seen chunks
@@ -357,6 +358,7 @@ The pre-commit hook automatically runs formatting, clippy, and tests before each
 Splintr builds upon concepts from:
 
 - [tiktoken](https://github.com/openai/tiktoken) - OpenAI's reference BPE tokenizer
+- [SentencePiece](https://github.com/google/sentencepiece) - Google's unsupervised text tokenizer
 - [tokenizers](https://github.com/huggingface/tokenizers) - Hugging Face's tokenization library
 
 The performance optimizations are informed by profiling real-world usage patterns in LLM applications.
@@ -368,7 +370,7 @@ If you use Splintr in your research, please cite:
 ```bibtex
 @software{splintr,
   author = {Farhan Syah},
-  title = {Splintr: High-Performance BPE Tokenizer},
+  title = {Splintr: High-Performance Tokenizer (BPE + SentencePiece)},
   year = {2025},
   url = {https://github.com/ml-rust/splintr}
 }

diff --git a/docs/api_guide.md b/docs/api_guide.md
@@ -5,14 +5,17 @@ This guide provides comprehensive documentation for using Splintr's Python and R
 ## Table of Contents
 
 - [Python API Reference](#python-api-reference)
-  - [Tokenizer Class](#tokenizer-class)
+  - [Tokenizer Class](#tokenizer-class) (BPE)
   - [Encoding Methods](#encoding-methods)
   - [Decoding Methods](#decoding-methods)
   - [Cache Management](#cache-management)
+  - [SentencePiece Tokenizer Class](#sentencepiece-tokenizer-class) (Unigram)
 - [Streaming Decoder](#streaming-decoder)
   - [Regular Streaming Decoder](#regular-streaming-decoder)
   - [ByteLevel Streaming Decoder](#bytelevel-streaming-decoder)
 - [Rust API Reference](#rust-api-reference)
+  - [BPE Tokenizer](#bpe-tokenizer)
+  - [SentencePiece Tokenizer](#sentencepiece-tokenizer)
 - [Detailed Usage Examples](#detailed-usage-examples)
   - [Basic Encoding and Decoding](#basic-encoding-and-decoding)
   - [Batch Processing](#batch-processing)
@@ -156,6 +159,61 @@ Clear the LRU encoding cache. Useful if memory pressure is a concern.
 tokenizer.clear_cache()
 ```
 
+### SentencePiece Tokenizer Class
+
+The `SentencePieceTokenizer` class provides unigram tokenization for models that use SentencePiece vocabularies (e.g., vocabularies loaded from GGUF files).
+
+#### Creating
+
+```python
+from splintr import SentencePieceTokenizer
+
+# Create from raw vocabulary data
+tokenizer = SentencePieceTokenizer(
+    tokens=["<unk>", "<s>", "</s>", "▁Hello", "▁world"],
+    scores=[0.0, 0.0, 0.0, -1.2, -1.5],
+    eos_token_id=2,
+    bos_token_id=1,  # optional
+)
+```
+
+#### `encode(text: str) -> list[int]`
+
+Encode text using greedy longest-match with score-based tie-breaking. Prepends BOS if configured.
+
+```python
+ids = tokenizer.encode("Hello world")
+# [1, 3, 4]  (BOS + ▁Hello + ▁world)
+```
+
+#### `decode(ids: list[int]) -> str`
+
+Decode token IDs to text. Skips BOS/EOS tokens, converts ▁ back to spaces.
+
+```python
+text = tokenizer.decode([1, 3, 4])
+# "Hello world"
+```
+
+#### `decode_lossy(ids: list[int]) -> str`
+
+Decode token IDs, silently skipping any invalid (out-of-range) IDs.
+
+```python
+text = tokenizer.decode_lossy([1, 3, 999, 4])
+# "Hello world"  (999 is skipped)
+```
+
+#### Properties
+
+- `vocab_size: int` — Total vocabulary size
+- `eos_token_id: int` — End-of-sequence token ID
+- `bos_token_id: int | None` — Beginning-of-sequence token ID (if configured)
+
+#### Methods
+
+- `is_eos(token_id: int) -> bool` — Check if a token is the EOS token
+
 ## Streaming Decoder
 
 Streaming decoders are essential for real-time LLM applications where tokens arrive one at a time. They handle the critical problem of BPE tokens not aligning with UTF-8 character boundaries.
@@ -286,7 +344,7 @@ Add Splintr to your `Cargo.toml`:
 splintr = "*"  # or pin to a specific version
 ```
 
-### Basic Usage
+### BPE Tokenizer
 
 ```rust
 use splintr::{Tokenizer, CL100K_BASE_PATTERN};
@@ -306,19 +364,54 @@ let texts = vec!["Hello".to_string(), "World".to_string()];
 let batch_tokens = tokenizer.encode_batch(&texts);
 ```
 
-### Encoding Methods
+#### Encoding Methods
 
 - `encode(&self, text: &str) -> Vec<u32>`: Sequential encoding (optimal for texts <1MB)
 - `encode_with_special(&self, text: &str) -> Vec<u32>`: Encode with special token recognition
 - `encode_batch(&self, texts: &[String]) -> Vec<Vec<u32>>`: Parallel encoding across texts
 - `encode_rayon(&self, text: &str) -> Vec<u32>`: Parallel encoding within text (for texts >1MB)
 
-### Decoding Methods
+#### Decoding Methods
 
 - `decode(&self, tokens: &[u32]) -> Result<String, TokenizerError>`: Decode to UTF-8 string
 - `decode_bytes(&self, tokens: &[u32]) -> Vec<u8>`: Decode to raw bytes
 - `decode_lossy(&self, tokens: &[u32]) -> String`: Decode with replacement for invalid UTF-8
 
+### SentencePiece Tokenizer
+
+For models using SentencePiece unigram tokenization (e.g., Mistral V1/V2):
+
+```rust
+use splintr::SentencePieceTokenizer;
+
+// Create from raw vocabulary data
+let tokenizer = SentencePieceTokenizer::new(
+    tokens,       // Vec<String> — token strings indexed by ID
+    scores,       // Vec<f32> — scores for tie-breaking (empty for uniform)
+    Some(1),      // Optional BOS token ID
+    2,            // EOS token ID
+)?;
+
+// Encode (prepends BOS if configured, uses ▁ word boundaries)
+let ids = tokenizer.encode("Hello world");
+
+// Decode (skips BOS/EOS, converts ▁ back to spaces)
+let text = tokenizer.decode(&ids)?;
+
+// Lossy decode (skips invalid token IDs instead of erroring)
+let text = tokenizer.decode_lossy(&ids);
+```
+
+#### Methods
+
+- `encode(&self, text: &str) -> Vec<u32>`: Greedy longest-match encoding with score-based tie-breaking
+- `decode(&self, ids: &[u32]) -> Result<String, SentencePieceError>`: Decode to UTF-8 string
+- `decode_lossy(&self, ids: &[u32]) -> String`: Decode, skipping invalid token IDs
+- `vocab_size(&self) -> usize`: Vocabulary size
+- `is_eos(&self, token_id: u32) -> bool`: Check if token is EOS
+- `eos_token_id(&self) -> u32`: Get EOS token ID
+- `bos_token_id(&self) -> Option<u32>`: Get BOS token ID
+
 ### Error Handling
 
 The Rust API uses `Result` types for operations that can fail:

diff --git a/pyproject.toml b/pyproject.toml
@@ -4,12 +4,12 @@ build-backend = "maturin"
 
 [project]
 name = "splintr-rs"
-version = "0.8.0"
-description = "Fast Rust BPE tokenizer with Python bindings"
+version = "0.9.0"
+description = "Fast Rust tokenizer (BPE + SentencePiece) with Python bindings"
 readme = "README.md"
 license = { text = "MIT" }
 requires-python = ">=3.8"
-keywords = ["tokenizer", "bpe", "tiktoken", "gpt", "llm"]
+keywords = ["tokenizer", "bpe", "sentencepiece", "tiktoken", "llm"]
 authors = [{ name = "Farhan" }]
 classifiers = [
     "Development Status :: 4 - Beta",

diff --git a/python/splintr/__init__.py b/python/splintr/__init__.py
@@ -1,11 +1,12 @@
 """
-Splintr - Fast Rust BPE tokenizer with Python bindings
+Splintr - Fast Rust tokenizer (BPE + SentencePiece) with Python bindings
 
 A high-performance tokenizer featuring:
 - Regexr with JIT and SIMD (default, pure Rust)
 - Optional PCRE2 with JIT (requires pcre2 feature)
 - Rayon parallelism for multi-core encoding
 - Linked-list BPE algorithm (avoids O(N^2) on pathological inputs)
+- SentencePiece unigram with greedy longest-match and score-based tie-breaking
 - FxHashMap for fast lookups
 - Aho-Corasick for fast special token matching
 - LRU cache for frequently encoded chunks
@@ -61,6 +62,18 @@
             print(text, end="", flush=True)
     print(decoder.flush())
 
+SentencePiece Unigram (for GGUF models):
+    from splintr import SentencePieceTokenizer
+
+    tokenizer = SentencePieceTokenizer(
+        tokens=["<unk>", "<s>", "</s>", "▁Hello", "▁world"],
+        scores=[0.0, 0.0, 0.0, -1.2, -1.5],
+        eos_token_id=2,
+        bos_token_id=1,
+    )
+    ids = tokenizer.encode("Hello world")
+    text = tokenizer.decode(ids)
+
 Agent Tokens:
     from splintr import (
         Tokenizer,
@@ -109,6 +122,7 @@
 
 from ._core import (
     Tokenizer,
+    SentencePieceTokenizer,
     StreamingDecoder,
     ByteLevelStreamingDecoder,
     CL100K_BASE_PATTERN,
@@ -125,6 +139,7 @@
 
 __all__ = [
     "Tokenizer",
+    "SentencePieceTokenizer",
     "StreamingDecoder",
     "ByteLevelStreamingDecoder",
     "CL100K_BASE_PATTERN",
@@ -138,4 +153,4 @@
     "MISTRAL_V2_AGENT_TOKENS",
     "MISTRAL_V3_AGENT_TOKENS",
 ]
-__version__ = "0.8.0"
+__version__ = "0.9.0"
diff --git a/src/core/mod.rs b/src/core/mod.rs
@@ -30,18 +30,20 @@
 mod bpe;
 pub mod byte_level;
 pub mod pretrained;
+pub mod sentencepiece;
 mod streaming;
 mod tokenizer;
 mod vocab;
 
 pub use bpe::byte_pair_encode;
 pub use byte_level::{byte_level_decode, byte_level_decode_bytes, byte_level_encode};
 pub use pretrained::{
-    bos_token_id, cl100k_base_special_tokens, deepseek_v3_special_tokens, eos_token_id,
-    eos_token_id_by_name, from_pretrained, from_vocab, llama3_special_tokens,
+    bos_token_id, bos_token_id_by_name, cl100k_base_special_tokens, deepseek_v3_special_tokens,
+    eos_token_id, eos_token_id_by_name, from_pretrained, from_vocab, llama3_special_tokens,
     o200k_base_special_tokens, pad_token_id, pattern, special_tokens, uses_byte_level,
     PretrainedVocab,
 };
+pub use sentencepiece::{SentencePieceError, SentencePieceTokenizer};
 pub use streaming::{ByteLevelStreamingDecoder, StreamingDecoder};
 pub use tokenizer::{
     cl100k_agent_tokens, o200k_agent_tokens, Tokenizer, TokenizerError, CL100K_BASE_PATTERN,

diff --git a/src/core/pretrained.rs b/src/core/pretrained.rs
@@ -204,6 +204,11 @@ pub fn bos_token_id(vocab: PretrainedVocab) -> Option<u32> {
     }
 }
 
+/// Get the BOS token ID by vocabulary name; `None` when `PretrainedVocab::from_name` does not resolve `name` or `bos_token_id` returns `None` for that vocabulary.
+pub fn bos_token_id_by_name(name: &str) -> Option<u32> {
+    PretrainedVocab::from_name(name).and_then(bos_token_id)
+}
+
 /// Get the PAD token ID for a vocabulary.
 pub fn pad_token_id(vocab: PretrainedVocab) -> Option<u32> {
     match vocab {