8 changes: 0 additions & 8 deletions Cargo.toml
@@ -235,11 +235,3 @@ incremental = false
 inherits = "dev"
 debug = true
 
-# core2 0.4.0 is yanked on crates.io but no 0.4.1+ exists, and libflate /
-# libflate_lz77 (pulled in via the `jieba` feature of nodedb-fts →
-# include-flate → include-flate-compress) still require `core2 ^0.4`.
-# Point cargo at the upstream git source so fresh resolution (CI without
-# a committed lockfile) succeeds — git sources bypass the registry's yank
-# check. Safe to remove once libflate releases a bump off core2.
-[patch.crates-io]
-core2 = { git = "https://github.com/technocreatives/core2", rev = "545e84bcb0f235b12e21351e0c69767958efe2a7" }
6 changes: 4 additions & 2 deletions nodedb-fts/Cargo.toml
@@ -12,7 +12,10 @@ homepage.workspace = true
 [features]
 default = []
 lang-ja = ["dep:lindera"]
-lang-zh = ["dep:jieba-rs"]
+# lang-zh currently falls back to CJK bigrams. The previous dictionary-based
+# implementation pulled jieba-rs → include-flate → libflate → yanked core2,
+# which breaks `cargo publish`. Re-enable once the upstream chain is fixed.
+lang-zh = []
 lang-ko = ["dep:lindera"]
 lang-th = ["dep:icu_segmenter"]
 lang-detect = ["dep:whatlang"]
@@ -29,7 +32,6 @@ thiserror = { workspace = true }
 
 # Optional: dictionary-based CJK segmentation
 lindera = { version = "2.3", optional = true }
-jieba-rs = { version = "0.9", optional = true }
 icu_segmenter = { version = "1", optional = true }
 whatlang = { version = "0.18", optional = true }
 
37 changes: 4 additions & 33 deletions nodedb-fts/src/analyzer/language/cjk/segmenter.rs
@@ -6,7 +6,7 @@
 //!
 //! Feature gates:
 //! - `lang-ja`: lindera with IPADIC for Japanese
-//! - `lang-zh`: jieba-rs for Chinese
+//! - `lang-zh`: currently falls back to CJK bigrams (see Cargo.toml)
 //! - `lang-ko`: lindera with ko-dic for Korean
 //! - `lang-th`: icu_segmenter for Thai
 
@@ -37,16 +37,9 @@ fn segment_japanese(text: &str) -> Vec<String> {
     }
 }
 
-/// Chinese segmentation: jieba when `lang-zh` is enabled, bigrams otherwise.
+/// Chinese segmentation: CJK bigrams (dictionary segmentation temporarily disabled).
 fn segment_chinese(text: &str) -> Vec<String> {
-    #[cfg(feature = "lang-zh")]
-    {
-        jieba_segment(text)
-    }
-    #[cfg(not(feature = "lang-zh"))]
-    {
-        tokenize_cjk(text)
-    }
+    tokenize_cjk(text)
 }
 
 /// Korean segmentation: lindera/ko-dic when `lang-ko` is enabled, bigrams otherwise.
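Note for readers of this patch: `tokenize_cjk`, the bigram fallback that `segment_chinese` now delegates to, is not shown in the diff. The sketch below is a minimal character-bigram tokenizer consistent with the test expectations further down; the function name and the single-character handling are assumptions, not the crate's actual implementation.

fn bigrams(text: &str) -> Vec<String> {
    // Collect scalar values so a two-character window can slide over them.
    let chars: Vec<char> = text.chars().collect();
    if chars.len() < 2 {
        // Assumption: a lone character becomes a single token.
        return chars.iter().map(|c| c.to_string()).collect();
    }
    chars
        .windows(2)
        .map(|pair| pair.iter().collect::<String>())
        .collect()
}

On "全文検索" this yields ["全文", "文検", "検索"], matching the bigrams_chinese test below.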
@@ -92,18 +85,6 @@ fn lindera_segment(text: &str, _dict: &str) -> Vec<String> {
         .collect()
 }
 
-#[cfg(feature = "lang-zh")]
-fn jieba_segment(text: &str) -> Vec<String> {
-    use jieba_rs::Jieba;
-    let jieba = Jieba::new();
-    jieba
-        .cut(text, false)
-        .into_iter()
-        .map(|s| s.to_string())
-        .filter(|s| !s.trim().is_empty())
-        .collect()
-}
-
 #[cfg(feature = "lang-th")]
 fn icu_segment_thai(text: &str) -> Vec<String> {
     use icu_segmenter::WordSegmenter;
@@ -124,21 +105,11 @@ mod tests {
     use super::*;
 
     #[test]
-    #[cfg(not(feature = "lang-zh"))]
-    fn fallback_to_bigrams_chinese() {
-        // Without lang-zh feature, should use CJK bigrams.
+    fn bigrams_chinese() {
         let tokens = segment("全文検索", "zh");
         assert_eq!(tokens, vec!["全文", "文検", "検索"]);
     }
 
-    #[test]
-    #[cfg(feature = "lang-zh")]
-    fn dictionary_segmentation_chinese() {
-        // With lang-zh feature, jieba produces dictionary-based tokens.
-        let tokens = segment("全文検索", "zh");
-        assert!(!tokens.is_empty());
-    }
-
     #[test]
     #[cfg(not(feature = "lang-ja"))]
     fn fallback_to_bigrams_japanese() {