Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 1 addition & 1 deletion .version
Original file line number Diff line number Diff line change
@@ -1 +1 @@
0.8.0
0.9.0
15 changes: 9 additions & 6 deletions Cargo.toml
Original file line number Diff line number Diff line change
@@ -1,29 +1,32 @@
[package]
name = "splintr"
version = "0.8.0"
version = "0.9.0"
edition = "2021"
description = "Fast Rust BPE tokenizer with Python bindings"
description = "Fast Rust tokenizer (BPE + SentencePiece) with Python bindings"
license = "MIT"
repository = "https://github.com/ml-rust/splintr"
homepage = "https://github.com/ml-rust/splintr"
readme = "README.md"
keywords = ["tokenizer", "bpe", "tiktoken", "gpt", "llm"]
keywords = ["tokenizer", "bpe", "sentencepiece", "tiktoken", "llm"]
categories = ["text-processing", "encoding"]

[lib]
name = "splintr"
crate-type = ["cdylib", "rlib"]

[features]
default = ["pcre2"]
default = ["rayon", "regexr-jit"]
python = ["dep:pyo3"]
pcre2 = ["dep:pcre2"]
rayon = ["dep:rayon"]
regexr-jit = ["regexr/jit", "regexr/simd"]
wasm = [] # enables nothing by itself; build with --no-default-features to drop rayon and regexr JIT/SIMD (scalar regex)

[dependencies]
# PCRE2 regex with JIT support (optional, for benchmarking)
pcre2 = { version = "0.2", optional = true }
# Rayon for internal parallelism
rayon = "1.10"
rayon = { version = "1.10", optional = true }
# Fast hashing (FxHashMap)
rustc-hash = "2.0"
# Error handling
Expand All @@ -37,7 +40,7 @@ aho-corasick = "1.1"
# LRU cache for frequent token sequences
lru = "0.16"
# regexr regex engine (default backend)
regexr = { version = "0.1.0-beta.5", features = ["jit", "simd"] }
regexr = { version = "0.1.0-beta.5", default-features = false }

[dev-dependencies]
# PCRE2 for benchmarking comparisons
Expand Down
26 changes: 14 additions & 12 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -2,7 +2,7 @@

[![Crates.io](https://img.shields.io/crates/v/splintr.svg)](https://crates.io/crates/splintr) [![PyPI](https://img.shields.io/pypi/v/splintr-rs.svg)](https://pypi.org/project/splintr-rs/) [![License: MIT](https://img.shields.io/badge/License-MIT-blue.svg)](https://opensource.org/licenses/MIT)

**A high-performance BPE tokenizer built with Rust with Python bindings, focused on speed, safety, and resource optimization.**
**A high-performance tokenizer (BPE + SentencePiece) written in Rust, with Python bindings, focused on speed, safety, and resource optimization.**

## The Problem

Expand Down Expand Up @@ -85,7 +85,7 @@ See the [API Guide](docs/api_guide.md) and [docs.rs](https://docs.rs/splintr) fo
- **Compatible vocabularies** - Supports cl100k_base, o200k_base (OpenAI), Llama 3 family (Meta), DeepSeek V3 (DeepSeek), and Mistral V1/V2/V3 (Mistral AI)
- **Streaming decoders** - Real-time LLM output display with proper UTF-8 handling ([guide](docs/api_guide.md#streaming-decoder))
- **54 agent tokens** - Built-in support for chat, CoT reasoning, ReAct agents, tool calling, RAG citations ([docs](docs/special_tokens.md))
- **Battle-tested algorithms** - Regexr with JIT (pure Rust), Aho-Corasick for special tokens, linked-list BPE
- **Battle-tested algorithms** - Regexr with JIT (pure Rust), Aho-Corasick for special tokens, linked-list BPE, SentencePiece unigram

**Cross-platform:**

Expand Down Expand Up @@ -219,15 +219,15 @@ See the [API Guide](docs/api_guide.md#streaming-decoder) for detailed usage, exa

## Supported Vocabularies

| Vocabulary | Used By | Vocabulary Size | Special Tokens | Import Constant |
| ------------- | ----------------------------------- | --------------- | --------------- | -------------------------- |
| `cl100k_base` | GPT-4, GPT-3.5-turbo | ~100,000 | 5 + 54 agent | `CL100K_BASE_PATTERN` |
| `o200k_base` | GPT-4o | ~200,000 | 2 + 54 agent | `O200K_BASE_PATTERN` |
| `llama3` | Llama 3, 3.1, 3.2, 3.3 (Meta) | ~128,000 | 11 + 54 agent | `LLAMA3_PATTERN` |
| `deepseek_v3` | DeepSeek V3, DeepSeek R1 | ~128,000 | 17 + 54 agent | `LLAMA3_PATTERN` |
| `mistral_v1` | Mistral 7B v0.1/v0.2, Mixtral 8x7B | ~32,000 | 3 + 54 agent | `SENTENCEPIECE_PATTERN` |
| `mistral_v2` | Mistral 7B v0.3, Codestral, 8x22B | ~32,768 | 10 + 54 agent | `SENTENCEPIECE_PATTERN` |
| `mistral_v3` | Mistral NeMo, Large 2, Pixtral | ~131,000 | 10 + 54 agent | `MISTRAL_V3_PATTERN` |
| Vocabulary | Used By | Vocabulary Size | Special Tokens | Import Constant |
| ------------- | ---------------------------------- | --------------- | -------------- | ----------------------- |
| `cl100k_base` | GPT-4, GPT-3.5-turbo | ~100,000 | 5 + 54 agent | `CL100K_BASE_PATTERN` |
| `o200k_base` | GPT-4o | ~200,000 | 2 + 54 agent | `O200K_BASE_PATTERN` |
| `llama3` | Llama 3, 3.1, 3.2, 3.3 (Meta) | ~128,000 | 11 + 54 agent | `LLAMA3_PATTERN` |
| `deepseek_v3` | DeepSeek V3, DeepSeek R1 | ~128,000 | 17 + 54 agent | `LLAMA3_PATTERN` |
| `mistral_v1` | Mistral 7B v0.1/v0.2, Mixtral 8x7B | ~32,000 | 3 + 54 agent | `SENTENCEPIECE_PATTERN` |
| `mistral_v2` | Mistral 7B v0.3, Codestral, 8x22B | ~32,768 | 10 + 54 agent | `SENTENCEPIECE_PATTERN` |
| `mistral_v3` | Mistral NeMo, Large 2, Pixtral | ~131,000 | 10 + 54 agent | `MISTRAL_V3_PATTERN` |

**OpenAI standard tokens:**

Expand Down Expand Up @@ -279,6 +279,7 @@ Splintr implements several optimizations that make tokenization faster:
- **Regexr with JIT compilation**: Pure Rust regex engine with SIMD acceleration
- **Rayon parallelism**: Leverages multiple CPU cores for batch encoding
- **Linked-list BPE algorithm**: Avoids O(N²) complexity on pathological inputs
- **SentencePiece unigram**: Greedy longest-match with score-based tie-breaking for Mistral/Llama-style models
- **FxHashMap**: Faster lookups than default SipHash for non-adversarial contexts
- **Aho-Corasick for special tokens**: Fast multi-pattern matching without regex alternation
- **LRU cache**: Avoids redundant BPE encoding of frequently seen chunks
Expand Down Expand Up @@ -357,6 +358,7 @@ The pre-commit hook automatically runs formatting, clippy, and tests before each
Splintr builds upon concepts from:

- [tiktoken](https://github.com/openai/tiktoken) - OpenAI's reference BPE tokenizer
- [SentencePiece](https://github.com/google/sentencepiece) - Google's unsupervised text tokenizer
- [tokenizers](https://github.com/huggingface/tokenizers) - Hugging Face's tokenization library

The performance optimizations are informed by profiling real-world usage patterns in LLM applications.
Expand All @@ -368,7 +370,7 @@ If you use Splintr in your research, please cite:
```bibtex
@software{splintr,
author = {Farhan Syah},
title = {Splintr: High-Performance BPE Tokenizer},
title = {Splintr: High-Performance Tokenizer (BPE + SentencePiece)},
year = {2025},
url = {https://github.com/ml-rust/splintr}
}
Expand Down
101 changes: 97 additions & 4 deletions docs/api_guide.md
Original file line number Diff line number Diff line change
Expand Up @@ -5,14 +5,17 @@ This guide provides comprehensive documentation for using Splintr's Python and R
## Table of Contents

- [Python API Reference](#python-api-reference)
- [Tokenizer Class](#tokenizer-class)
- [Tokenizer Class](#tokenizer-class) (BPE)
- [Encoding Methods](#encoding-methods)
- [Decoding Methods](#decoding-methods)
- [Cache Management](#cache-management)
- [SentencePiece Tokenizer Class](#sentencepiece-tokenizer-class) (Unigram)
- [Streaming Decoder](#streaming-decoder)
- [Regular Streaming Decoder](#regular-streaming-decoder)
- [ByteLevel Streaming Decoder](#bytelevel-streaming-decoder)
- [Rust API Reference](#rust-api-reference)
- [BPE Tokenizer](#bpe-tokenizer)
- [SentencePiece Tokenizer](#sentencepiece-tokenizer)
- [Detailed Usage Examples](#detailed-usage-examples)
- [Basic Encoding and Decoding](#basic-encoding-and-decoding)
- [Batch Processing](#batch-processing)
Expand Down Expand Up @@ -156,6 +159,61 @@ Clear the LRU encoding cache. Useful if memory pressure is a concern.
tokenizer.clear_cache()
```

### SentencePiece Tokenizer Class

The `SentencePieceTokenizer` class provides unigram tokenization for models using SentencePiece (e.g., loaded from GGUF files).

#### Creating

```python
from splintr import SentencePieceTokenizer

# Create from raw vocabulary data
tokenizer = SentencePieceTokenizer(
tokens=["<unk>", "<s>", "</s>", "▁Hello", "▁world"],
scores=[0.0, 0.0, 0.0, -1.2, -1.5],
eos_token_id=2,
bos_token_id=1, # optional
)
```

#### `encode(text: str) -> list[int]`

Encode text using greedy longest-match with score-based tie-breaking. Prepends BOS if configured.

```python
ids = tokenizer.encode("Hello world")
# [1, 3, 4] (BOS + ▁Hello + ▁world)
```

#### `decode(ids: list[int]) -> str`

Decode token IDs to text. Skips BOS/EOS tokens, converts ▁ back to spaces.

```python
text = tokenizer.decode([1, 3, 4])
# "Hello world"
```

#### `decode_lossy(ids: list[int]) -> str`

Decode token IDs, silently skipping any invalid (out-of-range) IDs.

```python
text = tokenizer.decode_lossy([1, 3, 999, 4])
# "Hello world" (999 is skipped)
```

#### Properties

- `vocab_size: int` — Total vocabulary size
- `eos_token_id: int` — End-of-sequence token ID
- `bos_token_id: int | None` — Beginning-of-sequence token ID (if configured)

#### Methods

- `is_eos(token_id: int) -> bool` — Check if a token is the EOS token

## Streaming Decoder

Streaming decoders are essential for real-time LLM applications where tokens arrive one at a time. They handle the critical problem of BPE tokens not aligning with UTF-8 character boundaries.
Expand Down Expand Up @@ -286,7 +344,7 @@ Add Splintr to your `Cargo.toml`:
splintr = "*" # or pin to a specific version
```

### Basic Usage
### BPE Tokenizer

```rust
use splintr::{Tokenizer, CL100K_BASE_PATTERN};
Expand All @@ -306,19 +364,54 @@ let texts = vec!["Hello".to_string(), "World".to_string()];
let batch_tokens = tokenizer.encode_batch(&texts);
```

### Encoding Methods
#### Encoding Methods

- `encode(&self, text: &str) -> Vec<u32>`: Sequential encoding (optimal for texts <1MB)
- `encode_with_special(&self, text: &str) -> Vec<u32>`: Encode with special token recognition
- `encode_batch(&self, texts: &[String]) -> Vec<Vec<u32>>`: Parallel encoding across texts
- `encode_rayon(&self, text: &str) -> Vec<u32>`: Parallel encoding within text (for texts >1MB)

### Decoding Methods
#### Decoding Methods

- `decode(&self, tokens: &[u32]) -> Result<String, TokenizerError>`: Decode to UTF-8 string
- `decode_bytes(&self, tokens: &[u32]) -> Vec<u8>`: Decode to raw bytes
- `decode_lossy(&self, tokens: &[u32]) -> String`: Decode with replacement for invalid UTF-8

### SentencePiece Tokenizer

For models using SentencePiece unigram tokenization (e.g., Mistral V1/V2):

```rust
use splintr::SentencePieceTokenizer;

// Create from raw vocabulary data
let tokenizer = SentencePieceTokenizer::new(
tokens, // Vec<String> — token strings indexed by ID
scores, // Vec<f32> — scores for tie-breaking (empty for uniform)
Some(1), // Optional BOS token ID
2, // EOS token ID
)?;

// Encode (prepends BOS if configured, uses ▁ word boundaries)
let ids = tokenizer.encode("Hello world");

// Decode (skips BOS/EOS, converts ▁ back to spaces)
let text = tokenizer.decode(&ids)?;

// Lossy decode (skips invalid token IDs instead of erroring)
let text = tokenizer.decode_lossy(&ids);
```

#### Methods

- `encode(&self, text: &str) -> Vec<u32>`: Greedy longest-match encoding with score-based tie-breaking
- `decode(&self, ids: &[u32]) -> Result<String, SentencePieceError>`: Decode to UTF-8 string
- `decode_lossy(&self, ids: &[u32]) -> String`: Decode, skipping invalid token IDs
- `vocab_size(&self) -> usize`: Vocabulary size
- `is_eos(&self, token_id: u32) -> bool`: Check if token is EOS
- `eos_token_id(&self) -> u32`: Get EOS token ID
- `bos_token_id(&self) -> Option<u32>`: Get BOS token ID

### Error Handling

The Rust API uses `Result` types for operations that can fail:
Expand Down
6 changes: 3 additions & 3 deletions pyproject.toml
Original file line number Diff line number Diff line change
Expand Up @@ -4,12 +4,12 @@ build-backend = "maturin"

[project]
name = "splintr-rs"
version = "0.8.0"
description = "Fast Rust BPE tokenizer with Python bindings"
version = "0.9.0"
description = "Fast Rust tokenizer (BPE + SentencePiece) with Python bindings"
readme = "README.md"
license = { text = "MIT" }
requires-python = ">=3.8"
keywords = ["tokenizer", "bpe", "tiktoken", "gpt", "llm"]
keywords = ["tokenizer", "bpe", "sentencepiece", "tiktoken", "llm"]
authors = [{ name = "Farhan" }]
classifiers = [
"Development Status :: 4 - Beta",
Expand Down
19 changes: 17 additions & 2 deletions python/splintr/__init__.py
Original file line number Diff line number Diff line change
@@ -1,11 +1,12 @@
"""
Splintr - Fast Rust BPE tokenizer with Python bindings
Splintr - Fast Rust tokenizer (BPE + SentencePiece) with Python bindings

A high-performance tokenizer featuring:
- Regexr with JIT and SIMD (default, pure Rust)
- Optional PCRE2 with JIT (requires pcre2 feature)
- Rayon parallelism for multi-core encoding
- Linked-list BPE algorithm (avoids O(N^2) on pathological inputs)
- SentencePiece unigram with greedy longest-match and score-based tie-breaking
- FxHashMap for fast lookups
- Aho-Corasick for fast special token matching
- LRU cache for frequently encoded chunks
Expand Down Expand Up @@ -61,6 +62,18 @@
print(text, end="", flush=True)
print(decoder.flush())

SentencePiece Unigram (for GGUF models):
from splintr import SentencePieceTokenizer

tokenizer = SentencePieceTokenizer(
tokens=["<unk>", "<s>", "</s>", "▁Hello", "▁world"],
scores=[0.0, 0.0, 0.0, -1.2, -1.5],
eos_token_id=2,
bos_token_id=1,
)
ids = tokenizer.encode("Hello world")
text = tokenizer.decode(ids)

Agent Tokens:
from splintr import (
Tokenizer,
Expand Down Expand Up @@ -109,6 +122,7 @@

from ._core import (
Tokenizer,
SentencePieceTokenizer,
StreamingDecoder,
ByteLevelStreamingDecoder,
CL100K_BASE_PATTERN,
Expand All @@ -125,6 +139,7 @@

__all__ = [
"Tokenizer",
"SentencePieceTokenizer",
"StreamingDecoder",
"ByteLevelStreamingDecoder",
"CL100K_BASE_PATTERN",
Expand All @@ -138,4 +153,4 @@
"MISTRAL_V2_AGENT_TOKENS",
"MISTRAL_V3_AGENT_TOKENS",
]
__version__ = "0.8.0"
__version__ = "0.9.0"
6 changes: 4 additions & 2 deletions src/core/mod.rs
Original file line number Diff line number Diff line change
Expand Up @@ -30,18 +30,20 @@
mod bpe;
pub mod byte_level;
pub mod pretrained;
pub mod sentencepiece;
mod streaming;
mod tokenizer;
mod vocab;

pub use bpe::byte_pair_encode;
pub use byte_level::{byte_level_decode, byte_level_decode_bytes, byte_level_encode};
pub use pretrained::{
bos_token_id, cl100k_base_special_tokens, deepseek_v3_special_tokens, eos_token_id,
eos_token_id_by_name, from_pretrained, from_vocab, llama3_special_tokens,
bos_token_id, bos_token_id_by_name, cl100k_base_special_tokens, deepseek_v3_special_tokens,
eos_token_id, eos_token_id_by_name, from_pretrained, from_vocab, llama3_special_tokens,
o200k_base_special_tokens, pad_token_id, pattern, special_tokens, uses_byte_level,
PretrainedVocab,
};
pub use sentencepiece::{SentencePieceError, SentencePieceTokenizer};
pub use streaming::{ByteLevelStreamingDecoder, StreamingDecoder};
pub use tokenizer::{
cl100k_agent_tokens, o200k_agent_tokens, Tokenizer, TokenizerError, CL100K_BASE_PATTERN,
Expand Down
5 changes: 5 additions & 0 deletions src/core/pretrained.rs
Original file line number Diff line number Diff line change
Expand Up @@ -204,6 +204,11 @@ pub fn bos_token_id(vocab: PretrainedVocab) -> Option<u32> {
}
}

/// Get the BOS token ID by vocabulary name string.
pub fn bos_token_id_by_name(name: &str) -> Option<u32> {
PretrainedVocab::from_name(name).and_then(bos_token_id)
}

/// Get the PAD token ID for a vocabulary.
pub fn pad_token_id(vocab: PretrainedVocab) -> Option<u32> {
match vocab {
Expand Down
Loading
Loading