From bb0d984c8838df6350c26978b8683d9e070d711c Mon Sep 17 00:00:00 2001 From: Farhan Syah Date: Wed, 15 Apr 2026 05:15:44 +0800 Subject: [PATCH 1/4] ci(release): publish with --no-verify to bypass yanked core2 in verify build --- .github/workflows/release.yml | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/.github/workflows/release.yml b/.github/workflows/release.yml index f47eb666..9eb2249c 100644 --- a/.github/workflows/release.yml +++ b/.github/workflows/release.yml @@ -160,7 +160,10 @@ jobs: echo " $crate@$VERSION already published — skipping" else echo " Publishing $crate@$VERSION..." - cargo publish -p "$crate" --allow-dirty + # --no-verify: skip the local verify build, which re-resolves + # dependencies without the workspace [patch.crates-io] and fails + # on the yanked core2 0.4.0 pulled transitively by jieba-rs. + cargo publish -p "$crate" --allow-dirty --no-verify need_wait+=("$crate:$VERSION") fi done From fe6a3eb4a19d2f4201c086e9e491a935b12fb35e Mon Sep 17 00:00:00 2001 From: Farhan Syah Date: Wed, 15 Apr 2026 05:59:56 +0800 Subject: [PATCH 2/4] fix(nodedb-fts): drop jieba-rs dep to unblock cargo publish MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit jieba-rs pulls include-flate → libflate → core2 0.4.0, which is yanked on crates.io. The workspace [patch.crates-io] bypassed this locally but was ignored by cargo publish's isolated packaging resolve, so releases kept failing on nodedb-fts. Drop jieba-rs entirely and let lang-zh fall back to CJK bigram segmentation (the same path used when the feature is disabled). The feature flag is retained as a no-op so downstream configs keep compiling. Can be restored when the upstream flate chain moves off yanked core2. 
--- .github/workflows/release.yml | 5 +-- Cargo.toml | 8 ---- nodedb-fts/Cargo.toml | 6 ++- .../src/analyzer/language/cjk/segmenter.rs | 37 ++----------------- 4 files changed, 9 insertions(+), 47 deletions(-) diff --git a/.github/workflows/release.yml b/.github/workflows/release.yml index 9eb2249c..f47eb666 100644 --- a/.github/workflows/release.yml +++ b/.github/workflows/release.yml @@ -160,10 +160,7 @@ jobs: echo " $crate@$VERSION already published — skipping" else echo " Publishing $crate@$VERSION..." - # --no-verify: skip the local verify build, which re-resolves - # dependencies without the workspace [patch.crates-io] and fails - # on the yanked core2 0.4.0 pulled transitively by jieba-rs. - cargo publish -p "$crate" --allow-dirty --no-verify + cargo publish -p "$crate" --allow-dirty need_wait+=("$crate:$VERSION") fi done diff --git a/Cargo.toml b/Cargo.toml index 61731e0a..aafb3262 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -235,11 +235,3 @@ incremental = false inherits = "dev" debug = true -# core2 0.4.0 is yanked on crates.io but no 0.4.1+ exists, and libflate / -# libflate_lz77 (pulled in via the `jieba` feature of nodedb-fts → -# include-flate → include-flate-compress) still require `core2 ^0.4`. -# Point cargo at the upstream git source so fresh resolution (CI without -# a committed lockfile) succeeds — git sources bypass the registry's yank -# check. Safe to remove once libflate releases a bump off core2. -[patch.crates-io] -core2 = { git = "https://github.com/technocreatives/core2", rev = "545e84bcb0f235b12e21351e0c69767958efe2a7" } diff --git a/nodedb-fts/Cargo.toml b/nodedb-fts/Cargo.toml index 95a4c662..0e6ea20e 100644 --- a/nodedb-fts/Cargo.toml +++ b/nodedb-fts/Cargo.toml @@ -12,7 +12,10 @@ homepage.workspace = true [features] default = [] lang-ja = ["dep:lindera"] -lang-zh = ["dep:jieba-rs"] +# lang-zh currently falls back to CJK bigrams. 
The previous dictionary-based +# implementation pulled jieba-rs → include-flate → libflate → yanked core2, +# which breaks `cargo publish`. Re-enable once the upstream chain is fixed. +lang-zh = [] lang-ko = ["dep:lindera"] lang-th = ["dep:icu_segmenter"] lang-detect = ["dep:whatlang"] @@ -29,7 +32,6 @@ thiserror = { workspace = true } # Optional: dictionary-based CJK segmentation lindera = { version = "2.3", optional = true } -jieba-rs = { version = "0.9", optional = true } icu_segmenter = { version = "1", optional = true } whatlang = { version = "0.18", optional = true } diff --git a/nodedb-fts/src/analyzer/language/cjk/segmenter.rs b/nodedb-fts/src/analyzer/language/cjk/segmenter.rs index 91447056..9f973d13 100644 --- a/nodedb-fts/src/analyzer/language/cjk/segmenter.rs +++ b/nodedb-fts/src/analyzer/language/cjk/segmenter.rs @@ -6,7 +6,7 @@ //! //! Feature gates: //! - `lang-ja`: lindera with IPADIC for Japanese -//! - `lang-zh`: jieba-rs for Chinese +//! - `lang-zh`: currently falls back to CJK bigrams (see Cargo.toml) //! - `lang-ko`: lindera with ko-dic for Korean //! - `lang-th`: icu_segmenter for Thai @@ -37,16 +37,9 @@ fn segment_japanese(text: &str) -> Vec<String> { } } -/// Chinese segmentation: jieba when `lang-zh` is enabled, bigrams otherwise. +/// Chinese segmentation: CJK bigrams (dictionary segmentation temporarily disabled). fn segment_chinese(text: &str) -> Vec<String> { - #[cfg(feature = "lang-zh")] - { - jieba_segment(text) - } - #[cfg(not(feature = "lang-zh"))] - { - tokenize_cjk(text) - } + tokenize_cjk(text) } /// Korean segmentation: lindera/ko-dic when `lang-ko` is enabled, bigrams otherwise. 
@@ -92,18 +85,6 @@ fn lindera_segment(text: &str, _dict: &str) -> Vec<String> { .collect() } -#[cfg(feature = "lang-zh")] -fn jieba_segment(text: &str) -> Vec<String> { - use jieba_rs::Jieba; - let jieba = Jieba::new(); - jieba - .cut(text, false) - .into_iter() - .map(|s| s.to_string()) - .filter(|s| !s.trim().is_empty()) - .collect() -} - #[cfg(feature = "lang-th")] fn icu_segment_thai(text: &str) -> Vec<String> { use icu_segmenter::WordSegmenter; @@ -124,21 +105,11 @@ mod tests { use super::*; #[test] - #[cfg(not(feature = "lang-zh"))] - fn fallback_to_bigrams_chinese() { - // Without lang-zh feature, should use CJK bigrams. + fn bigrams_chinese() { let tokens = segment("全文検索", "zh"); assert_eq!(tokens, vec!["全文", "文検", "検索"]); } - #[test] - #[cfg(feature = "lang-zh")] - fn dictionary_segmentation_chinese() { - // With lang-zh feature, jieba produces dictionary-based tokens. - let tokens = segment("全文検索", "zh"); - assert!(!tokens.is_empty()); - } - #[test] #[cfg(not(feature = "lang-ja"))] fn fallback_to_bigrams_japanese() { From 708329cc5ba340450e6dbb0c8eed6c71657b1d67 Mon Sep 17 00:00:00 2001 From: Farhan Syah Date: Wed, 15 Apr 2026 06:27:43 +0800 Subject: [PATCH 3/4] refactor(pgwire): split ddl/dsl.rs into per-concern modules MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit dsl.rs has grown to cover DDL parsing for search indexes, CRDT merge, FTS, vector, sparse, and fusion in a single flat file. Split it into a dsl/ directory with one module per concern: crdt_merge, fulltext_index, search_fusion, search_index, search_vector, sparse_index, vector_index, helpers, mod No logic changes — pure file reorganisation. 
--- nodedb/src/control/server/pgwire/ddl/dsl.rs | 594 ------------------ .../server/pgwire/ddl/dsl/crdt_merge.rs | 95 +++ .../server/pgwire/ddl/dsl/fulltext_index.rs | 47 ++ .../control/server/pgwire/ddl/dsl/helpers.rs | 35 ++ .../src/control/server/pgwire/ddl/dsl/mod.rs | 27 + .../server/pgwire/ddl/dsl/search_fusion.rs | 100 +++ .../server/pgwire/ddl/dsl/search_index.rs | 71 +++ .../server/pgwire/ddl/dsl/search_vector.rs | 120 ++++ .../server/pgwire/ddl/dsl/sparse_index.rs | 59 ++ .../server/pgwire/ddl/dsl/vector_index.rs | 151 +++++ 10 files changed, 705 insertions(+), 594 deletions(-) delete mode 100644 nodedb/src/control/server/pgwire/ddl/dsl.rs create mode 100644 nodedb/src/control/server/pgwire/ddl/dsl/crdt_merge.rs create mode 100644 nodedb/src/control/server/pgwire/ddl/dsl/fulltext_index.rs create mode 100644 nodedb/src/control/server/pgwire/ddl/dsl/helpers.rs create mode 100644 nodedb/src/control/server/pgwire/ddl/dsl/mod.rs create mode 100644 nodedb/src/control/server/pgwire/ddl/dsl/search_fusion.rs create mode 100644 nodedb/src/control/server/pgwire/ddl/dsl/search_index.rs create mode 100644 nodedb/src/control/server/pgwire/ddl/dsl/search_vector.rs create mode 100644 nodedb/src/control/server/pgwire/ddl/dsl/sparse_index.rs create mode 100644 nodedb/src/control/server/pgwire/ddl/dsl/vector_index.rs diff --git a/nodedb/src/control/server/pgwire/ddl/dsl.rs b/nodedb/src/control/server/pgwire/ddl/dsl.rs deleted file mode 100644 index 95f173cf..00000000 --- a/nodedb/src/control/server/pgwire/ddl/dsl.rs +++ /dev/null @@ -1,594 +0,0 @@ -//! NodeDB DSL extensions — custom SQL-like commands beyond standard SQL. -//! -//! - SEARCH <collection> USING VECTOR(<field>, ARRAY[...], <k>) -//! - SEARCH <collection> USING VECTOR(...) WITH FILTER <expr> -//! - SEARCH <collection> USING FUSION(vector=..., graph=..., top_k=...) -//! - CREATE VECTOR INDEX <name> ON <collection> [METRIC cosine|l2] [M <m>] [EF_CONSTRUCTION <ef>] -//! - CREATE FULLTEXT INDEX <name> ON <collection> (<field>) -//! 
- CRDT MERGE INTO <collection> FROM <source> TO <target> - -use std::sync::Arc; -use std::time::Duration; - -use futures::stream; -use pgwire::api::results::{DataRowEncoder, QueryResponse, Response, Tag}; -use pgwire::error::PgWireResult; - -use crate::bridge::envelope::PhysicalPlan; -use crate::bridge::physical_plan::{CrdtOp, GraphOp, VectorOp}; -use crate::control::security::identity::AuthenticatedIdentity; -use crate::control::state::SharedState; - -use super::super::types::{sqlstate_error, text_field}; - -// ── SEARCH USING VECTOR ───────────────────────────────────────────── - -/// SEARCH <collection> USING VECTOR(ARRAY[...], <k>) -/// SEARCH <collection> USING VECTOR(ARRAY[...], <k>) WITH FILTER <expr> -pub async fn search_vector( - state: &SharedState, - identity: &AuthenticatedIdentity, - sql: &str, -) -> PgWireResult<Vec<Response>> { - // Extract collection name. - let parts: Vec<&str> = sql.split_whitespace().collect(); - if parts.len() < 4 { - return Err(sqlstate_error( - "42601", - "syntax: SEARCH <collection> USING VECTOR(ARRAY[...], <k>)", - )); - } - let collection = parts[1]; - let tenant_id = identity.tenant_id; - - // Parse field name and ARRAY[...] from VECTOR(field, ARRAY[...], k) or VECTOR(ARRAY[...], k). - let vector_paren = sql.find("VECTOR(").or_else(|| sql.find("vector(")); - let vector_paren = match vector_paren { - Some(i) => i + 7, - None => { - return Err(sqlstate_error( - "42601", - "expected VECTOR(...) in SEARCH USING VECTOR", - )); - } - }; - - // Extract field name if present before ARRAY[. - let array_start = sql.find("ARRAY[").or_else(|| sql.find("array[")); - let array_start = match array_start { - Some(i) => i + 6, - None => { - return Err(sqlstate_error( - "42601", - "expected ARRAY[...] in SEARCH USING VECTOR", - )); - } - }; - - // Field name is between VECTOR( and ARRAY[ (trimmed, comma-stripped). 
- let field_name = sql[vector_paren..array_start - 6] - .trim() - .trim_end_matches(',') - .trim() - .to_string(); - - let array_end = sql[array_start..].find(']').map(|i| i + array_start); - let array_end = match array_end { - Some(i) => i, - None => { - return Err(sqlstate_error("42601", "unterminated ARRAY[")); - } - }; - - let vector_str = &sql[array_start..array_end]; - let query_vector: Vec<f32> = vector_str - .split(',') - .filter_map(|s| s.trim().parse::<f32>().ok()) - .collect(); - - if query_vector.is_empty() { - return Err(sqlstate_error("42601", "empty query vector")); - } - - // Parse top_k: number after the closing bracket. - let after_array = &sql[array_end + 1..]; - let top_k = after_array - .split(|c: char| !c.is_ascii_digit()) - .find(|s| !s.is_empty()) - .and_then(|s| s.parse::<usize>().ok()) - .unwrap_or(10); - - // Future: parse WITH FILTER predicates, evaluate against documents, build Roaring bitmap. - let filter_bitmap: Option<Arc<roaring::RoaringBitmap>> = None; - - let plan = PhysicalPlan::Vector(VectorOp::Search { - collection: collection.to_string(), - query_vector: Arc::from(query_vector.as_slice()), - top_k, - ef_search: 0, - filter_bitmap, - field_name, - rls_filters: Vec::new(), - }); - - let payload = super::sync_dispatch::dispatch_async( - state, - tenant_id, - collection, - plan, - Duration::from_secs(state.tuning.network.default_deadline_secs), - ) - .await - .map_err(|e| sqlstate_error("XX000", &e.to_string()))?; - - let schema = Arc::new(vec![text_field("result")]); - let text = crate::data::executor::response_codec::decode_payload_to_json(&payload); - let mut encoder = DataRowEncoder::new(schema.clone()); - encoder - .encode_field(&text) - .map_err(|e| sqlstate_error("XX000", &e.to_string()))?; - let row = encoder.take_row(); - - Ok(vec![Response::Query(QueryResponse::new( - schema, - stream::iter(vec![Ok(row)]), - ))]) -} - -// ── SEARCH USING FUSION ───────────────────────────────────────────── - -/// SEARCH <collection> USING FUSION(VECTOR(ARRAY[...], <k>), GRAPH(