diff --git a/Cargo.toml b/Cargo.toml
index 61731e0a..aafb3262 100644
--- a/Cargo.toml
+++ b/Cargo.toml
@@ -235,11 +235,3 @@ incremental = false
 inherits = "dev"
 debug = true
 
-# core2 0.4.0 is yanked on crates.io but no 0.4.1+ exists, and libflate /
-# libflate_lz77 (pulled in via the `jieba` feature of nodedb-fts →
-# include-flate → include-flate-compress) still require `core2 ^0.4`.
-# Point cargo at the upstream git source so fresh resolution (CI without
-# a committed lockfile) succeeds — git sources bypass the registry's yank
-# check. Safe to remove once libflate releases a bump off core2.
-[patch.crates-io]
-core2 = { git = "https://github.com/technocreatives/core2", rev = "545e84bcb0f235b12e21351e0c69767958efe2a7" }
diff --git a/nodedb-fts/Cargo.toml b/nodedb-fts/Cargo.toml
index 95a4c662..0e6ea20e 100644
--- a/nodedb-fts/Cargo.toml
+++ b/nodedb-fts/Cargo.toml
@@ -12,7 +12,10 @@ homepage.workspace = true
 [features]
 default = []
 lang-ja = ["dep:lindera"]
-lang-zh = ["dep:jieba-rs"]
+# lang-zh currently falls back to CJK bigrams. The previous dictionary-based
+# implementation pulled jieba-rs → include-flate → libflate → yanked core2,
+# which breaks `cargo publish`. Re-enable once the upstream chain is fixed.
+lang-zh = []
 lang-ko = ["dep:lindera"]
 lang-th = ["dep:icu_segmenter"]
 lang-detect = ["dep:whatlang"]
@@ -29,7 +32,6 @@ thiserror = { workspace = true }
 
 # Optional: dictionary-based CJK segmentation
 lindera = { version = "2.3", optional = true }
-jieba-rs = { version = "0.9", optional = true }
 icu_segmenter = { version = "1", optional = true }
 whatlang = { version = "0.18", optional = true }
 
diff --git a/nodedb-fts/src/analyzer/language/cjk/segmenter.rs b/nodedb-fts/src/analyzer/language/cjk/segmenter.rs
index 91447056..9f973d13 100644
--- a/nodedb-fts/src/analyzer/language/cjk/segmenter.rs
+++ b/nodedb-fts/src/analyzer/language/cjk/segmenter.rs
@@ -6,7 +6,7 @@
 //!
 //! Feature gates:
 //! - `lang-ja`: lindera with IPADIC for Japanese
-//! - `lang-zh`: jieba-rs for Chinese
+//! - `lang-zh`: currently falls back to CJK bigrams (see Cargo.toml)
 //! - `lang-ko`: lindera with ko-dic for Korean
 //! - `lang-th`: icu_segmenter for Thai
 
@@ -37,16 +37,9 @@ fn segment_japanese(text: &str) -> Vec<String> {
     }
 }
 
-/// Chinese segmentation: jieba when `lang-zh` is enabled, bigrams otherwise.
+/// Chinese segmentation: CJK bigrams (dictionary segmentation temporarily disabled).
 fn segment_chinese(text: &str) -> Vec<String> {
-    #[cfg(feature = "lang-zh")]
-    {
-        jieba_segment(text)
-    }
-    #[cfg(not(feature = "lang-zh"))]
-    {
-        tokenize_cjk(text)
-    }
+    tokenize_cjk(text)
 }
 
 /// Korean segmentation: lindera/ko-dic when `lang-ko` is enabled, bigrams otherwise.
@@ -92,18 +85,6 @@ fn lindera_segment(text: &str, _dict: &str) -> Vec<String> {
         .collect()
 }
 
-#[cfg(feature = "lang-zh")]
-fn jieba_segment(text: &str) -> Vec<String> {
-    use jieba_rs::Jieba;
-    let jieba = Jieba::new();
-    jieba
-        .cut(text, false)
-        .into_iter()
-        .map(|s| s.to_string())
-        .filter(|s| !s.trim().is_empty())
-        .collect()
-}
-
 #[cfg(feature = "lang-th")]
 fn icu_segment_thai(text: &str) -> Vec<String> {
     use icu_segmenter::WordSegmenter;
@@ -124,21 +105,11 @@ mod tests {
     use super::*;
 
     #[test]
-    #[cfg(not(feature = "lang-zh"))]
-    fn fallback_to_bigrams_chinese() {
-        // Without lang-zh feature, should use CJK bigrams.
+    fn bigrams_chinese() {
         let tokens = segment("全文検索", "zh");
         assert_eq!(tokens, vec!["全文", "文検", "検索"]);
     }
 
-    #[test]
-    #[cfg(feature = "lang-zh")]
-    fn dictionary_segmentation_chinese() {
-        // With lang-zh feature, jieba produces dictionary-based tokens.
-        let tokens = segment("全文検索", "zh");
-        assert!(!tokens.is_empty());
-    }
-
     #[test]
     #[cfg(not(feature = "lang-ja"))]
     fn fallback_to_bigrams_japanese() {
diff --git a/nodedb/src/control/server/pgwire/ddl/dsl.rs b/nodedb/src/control/server/pgwire/ddl/dsl.rs
deleted file mode 100644
index 95f173cf..00000000
--- a/nodedb/src/control/server/pgwire/ddl/dsl.rs
+++ /dev/null
@@ -1,594 +0,0 @@
-//! NodeDB DSL extensions — custom SQL-like commands beyond standard SQL.
-//!
-//! - SEARCH <collection> USING VECTOR(<field>, ARRAY[...], <k>)
-//! - SEARCH <collection> USING VECTOR(...) WITH FILTER <predicate>
-//! - SEARCH <collection> USING FUSION(vector=..., graph=..., top_k=...)
-//! - CREATE VECTOR INDEX ON <collection> [METRIC cosine|l2] [M <m>] [EF_CONSTRUCTION <ef>]
-//! - CREATE FULLTEXT INDEX ON <collection>(<field>)
-//! - CRDT MERGE INTO <collection> FROM <source> TO <target>
-
-use std::sync::Arc;
-use std::time::Duration;
-
-use futures::stream;
-use pgwire::api::results::{DataRowEncoder, QueryResponse, Response, Tag};
-use pgwire::error::PgWireResult;
-
-use crate::bridge::envelope::PhysicalPlan;
-use crate::bridge::physical_plan::{CrdtOp, GraphOp, VectorOp};
-use crate::control::security::identity::AuthenticatedIdentity;
-use crate::control::state::SharedState;
-
-use super::super::types::{sqlstate_error, text_field};
-
-// ── SEARCH USING VECTOR ─────────────────────────────────────────────
-
-/// SEARCH <collection> USING VECTOR(ARRAY[...], <k>)
-/// SEARCH <collection> USING VECTOR(ARRAY[...], <k>) WITH FILTER <predicate>
-pub async fn search_vector(
-    state: &SharedState,
-    identity: &AuthenticatedIdentity,
-    sql: &str,
-) -> PgWireResult<Vec<Response>> {
-    // Extract collection name.
-    let parts: Vec<&str> = sql.split_whitespace().collect();
-    if parts.len() < 4 {
-        return Err(sqlstate_error(
-            "42601",
-            "syntax: SEARCH <collection> USING VECTOR(ARRAY[...], <k>)",
-        ));
-    }
-    let collection = parts[1];
-    let tenant_id = identity.tenant_id;
-
-    // Parse field name and ARRAY[...] from VECTOR(field, ARRAY[...], k) or VECTOR(ARRAY[...], k).
-    let vector_paren = sql.find("VECTOR(").or_else(|| sql.find("vector("));
-    let vector_paren = match vector_paren {
-        Some(i) => i + 7,
-        None => {
-            return Err(sqlstate_error(
-                "42601",
-                "expected VECTOR(...) in SEARCH USING VECTOR",
-            ));
-        }
-    };
-
-    // Extract field name if present before ARRAY[.
-    let array_start = sql.find("ARRAY[").or_else(|| sql.find("array["));
-    let array_start = match array_start {
-        Some(i) => i + 6,
-        None => {
-            return Err(sqlstate_error(
-                "42601",
-                "expected ARRAY[...] in SEARCH USING VECTOR",
-            ));
-        }
-    };
-
-    // Field name is between VECTOR( and ARRAY[ (trimmed, comma-stripped).
-    let field_name = sql[vector_paren..array_start - 6]
-        .trim()
-        .trim_end_matches(',')
-        .trim()
-        .to_string();
-
-    let array_end = sql[array_start..].find(']').map(|i| i + array_start);
-    let array_end = match array_end {
-        Some(i) => i,
-        None => {
-            return Err(sqlstate_error("42601", "unterminated ARRAY["));
-        }
-    };
-
-    let vector_str = &sql[array_start..array_end];
-    let query_vector: Vec<f32> = vector_str
-        .split(',')
-        .filter_map(|s| s.trim().parse::<f32>().ok())
-        .collect();
-
-    if query_vector.is_empty() {
-        return Err(sqlstate_error("42601", "empty query vector"));
-    }
-
-    // Parse top_k: number after the closing bracket.
-    let after_array = &sql[array_end + 1..];
-    let top_k = after_array
-        .split(|c: char| !c.is_ascii_digit())
-        .find(|s| !s.is_empty())
-        .and_then(|s| s.parse::<usize>().ok())
-        .unwrap_or(10);
-
-    // Future: parse WITH FILTER predicates, evaluate against documents, build Roaring bitmap.
-    let filter_bitmap: Option<Arc<roaring::RoaringBitmap>> = None;
-
-    let plan = PhysicalPlan::Vector(VectorOp::Search {
-        collection: collection.to_string(),
-        query_vector: Arc::from(query_vector.as_slice()),
-        top_k,
-        ef_search: 0,
-        filter_bitmap,
-        field_name,
-        rls_filters: Vec::new(),
-    });
-
-    let payload = super::sync_dispatch::dispatch_async(
-        state,
-        tenant_id,
-        collection,
-        plan,
-        Duration::from_secs(state.tuning.network.default_deadline_secs),
-    )
-    .await
-    .map_err(|e| sqlstate_error("XX000", &e.to_string()))?;
-
-    let schema = Arc::new(vec![text_field("result")]);
-    let text = crate::data::executor::response_codec::decode_payload_to_json(&payload);
-    let mut encoder = DataRowEncoder::new(schema.clone());
-    encoder
-        .encode_field(&text)
-        .map_err(|e| sqlstate_error("XX000", &e.to_string()))?;
-    let row = encoder.take_row();
-
-    Ok(vec![Response::Query(QueryResponse::new(
-        schema,
-        stream::iter(vec![Ok(row)]),
-    ))])
-}
-
-// ── SEARCH USING FUSION ─────────────────────────────────────────────
-
-/// SEARCH <collection> USING FUSION(VECTOR(ARRAY[...], <k>), GRAPH(