8 changes: 0 additions & 8 deletions Cargo.toml
@@ -235,11 +235,3 @@ incremental = false
 inherits = "dev"
 debug = true
 
-# core2 0.4.0 is yanked on crates.io but no 0.4.1+ exists, and libflate /
-# libflate_lz77 (pulled in via the `jieba` feature of nodedb-fts →
-# include-flate → include-flate-compress) still require `core2 ^0.4`.
-# Point cargo at the upstream git source so fresh resolution (CI without
-# a committed lockfile) succeeds — git sources bypass the registry's yank
-# check. Safe to remove once libflate releases a bump off core2.
-[patch.crates-io]
-core2 = { git = "https://github.com/technocreatives/core2", rev = "545e84bcb0f235b12e21351e0c69767958efe2a7" }
6 changes: 4 additions & 2 deletions nodedb-fts/Cargo.toml
@@ -12,7 +12,10 @@ homepage.workspace = true
 [features]
 default = []
 lang-ja = ["dep:lindera"]
-lang-zh = ["dep:jieba-rs"]
+# lang-zh currently falls back to CJK bigrams. The previous dictionary-based
+# implementation pulled jieba-rs → include-flate → libflate → yanked core2,
+# which breaks `cargo publish`. Re-enable once the upstream chain is fixed.
+lang-zh = []
 lang-ko = ["dep:lindera"]
 lang-th = ["dep:icu_segmenter"]
 lang-detect = ["dep:whatlang"]
@@ -29,7 +32,6 @@ thiserror = { workspace = true }
 
 # Optional: dictionary-based CJK segmentation
 lindera = { version = "2.3", optional = true }
-jieba-rs = { version = "0.9", optional = true }
 icu_segmenter = { version = "1", optional = true }
 whatlang = { version = "0.18", optional = true }
 
37 changes: 4 additions & 33 deletions nodedb-fts/src/analyzer/language/cjk/segmenter.rs
@@ -6,7 +6,7 @@
 //!
 //! Feature gates:
 //! - `lang-ja`: lindera with IPADIC for Japanese
-//! - `lang-zh`: jieba-rs for Chinese
+//! - `lang-zh`: currently falls back to CJK bigrams (see Cargo.toml)
 //! - `lang-ko`: lindera with ko-dic for Korean
 //! - `lang-th`: icu_segmenter for Thai
 
@@ -37,16 +37,9 @@ fn segment_japanese(text: &str) -> Vec<String> {
     }
 }
 
-/// Chinese segmentation: jieba when `lang-zh` is enabled, bigrams otherwise.
+/// Chinese segmentation: CJK bigrams (dictionary segmentation temporarily disabled).
 fn segment_chinese(text: &str) -> Vec<String> {
-    #[cfg(feature = "lang-zh")]
-    {
-        jieba_segment(text)
-    }
-    #[cfg(not(feature = "lang-zh"))]
-    {
-        tokenize_cjk(text)
-    }
+    tokenize_cjk(text)
 }
 
 /// Korean segmentation: lindera/ko-dic when `lang-ko` is enabled, bigrams otherwise.
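Note for readers of this patch: `tokenize_cjk`, the bigram fallback that `segment_chinese` now delegates to, is not shown in the diff. The sketch below is a minimal character-bigram tokenizer consistent with the test expectations further down; the function name and the single-character handling are assumptions, not the crate's actual implementation.

fn bigrams(text: &str) -> Vec<String> {
    // Collect scalar values so a two-character window can slide over them.
    let chars: Vec<char> = text.chars().collect();
    if chars.len() < 2 {
        // Assumption: a lone character becomes a single token.
        return chars.iter().map(|c| c.to_string()).collect();
    }
    chars
        .windows(2)
        .map(|pair| pair.iter().collect::<String>())
        .collect()
}

On "全文検索" this yields ["全文", "文検", "検索"], matching the bigrams_chinese test below.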
@@ -92,18 +85,6 @@ fn lindera_segment(text: &str, _dict: &str) -> Vec<String> {
         .collect()
 }
 
-#[cfg(feature = "lang-zh")]
-fn jieba_segment(text: &str) -> Vec<String> {
-    use jieba_rs::Jieba;
-    let jieba = Jieba::new();
-    jieba
-        .cut(text, false)
-        .into_iter()
-        .map(|s| s.to_string())
-        .filter(|s| !s.trim().is_empty())
-        .collect()
-}
-
 #[cfg(feature = "lang-th")]
 fn icu_segment_thai(text: &str) -> Vec<String> {
     use icu_segmenter::WordSegmenter;
@@ -124,21 +105,11 @@ mod tests {
     use super::*;
 
     #[test]
-    #[cfg(not(feature = "lang-zh"))]
-    fn fallback_to_bigrams_chinese() {
-        // Without lang-zh feature, should use CJK bigrams.
+    fn bigrams_chinese() {
         let tokens = segment("全文検索", "zh");
         assert_eq!(tokens, vec!["全文", "文検", "検索"]);
     }
 
-    #[test]
-    #[cfg(feature = "lang-zh")]
-    fn dictionary_segmentation_chinese() {
-        // With lang-zh feature, jieba produces dictionary-based tokens.
-        let tokens = segment("全文検索", "zh");
-        assert!(!tokens.is_empty());
-    }
-
     #[test]
     #[cfg(not(feature = "lang-ja"))]
     fn fallback_to_bigrams_japanese() {