From 877ebdf3d5d30793465669d4e0b98f4b2013ce2d Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Poyraz=20K=C3=BC=C3=A7=C3=BCkarslan?= <83272398+PoyrazK@users.noreply.github.com> Date: Fri, 1 May 2026 21:51:10 +0300 Subject: [PATCH 01/10] perf(index): use to_ascii_lowercase in tokenize and extract_positions MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit str::to_lowercase() is Unicode-aware and processes each byte through Unicode tables — wasteful for ASCII log messages. Replace with to_ascii_lowercase() which is O(1) per byte with no table lookups. Also in extract_positions(): lower the text once with to_ascii_lowercase() instead of calling tokenize() which internally lowercases each token, then searching in the original field_text. Now search_from uses the pre-lowercased text. Fixes: #84 --- rust/crates/cloudsearch-index/src/lib.rs | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/rust/crates/cloudsearch-index/src/lib.rs b/rust/crates/cloudsearch-index/src/lib.rs index ba28f7f..32b67e4 100644 --- a/rust/crates/cloudsearch-index/src/lib.rs +++ b/rust/crates/cloudsearch-index/src/lib.rs @@ -998,11 +998,11 @@ impl IndexHandle { for (_, field_value) in obj { if let Some(text) = field_value.as_str() { let tokens = tokenize(text); - let field_text = text.to_string(); + let lower_text = text.to_ascii_lowercase(); let mut seen_offsets: BTreeMap> = BTreeMap::new(); for token in &tokens { let mut search_from = 0usize; - while let Some(pos) = field_text[search_from..].find(token) { + while let Some(pos) = lower_text[search_from..].find(token) { let byte_offset = u32::try_from(search_from + pos).unwrap_or(0); seen_offsets .entry(token.clone()) @@ -1795,7 +1795,7 @@ fn score_phrase_query( } fn tokenize(text: &str) -> Vec { - text.split_whitespace().map(str::to_lowercase).collect() + text.split_whitespace().map(str::to_ascii_lowercase).collect() } /// Stable hash of a document ID string for use as a persistent `doc_id` in postings. From 044557fcefa9cea161c53d36d59618667d9c121a Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Poyraz=20K=C3=BC=C3=A7=C3=BCkarslan?= <83272398+PoyrazK@users.noreply.github.com> Date: Fri, 1 May 2026 22:07:17 +0300 Subject: [PATCH 02/10] style: apply rustfmt --- rust/crates/cloudsearch-index/src/lib.rs | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/rust/crates/cloudsearch-index/src/lib.rs b/rust/crates/cloudsearch-index/src/lib.rs index 32b67e4..028e20d 100644 --- a/rust/crates/cloudsearch-index/src/lib.rs +++ b/rust/crates/cloudsearch-index/src/lib.rs @@ -1795,7 +1795,9 @@ fn score_phrase_query( } fn tokenize(text: &str) -> Vec { - text.split_whitespace().map(str::to_ascii_lowercase).collect() + text.split_whitespace() + .map(str::to_ascii_lowercase) + .collect() } /// Stable hash of a document ID string for use as a persistent `doc_id` in postings. From 8895743410e978a7011957bb34077783076e6677 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Poyraz=20K=C3=BC=C3=A7=C3=BCkarslan?= <83272398+PoyrazK@users.noreply.github.com> Date: Sat, 2 May 2026 14:20:28 +0300 Subject: [PATCH 03/10] test(index): add regression tests for highlight position extraction 4 new tests in coverage.rs: - highlight_positions_case_insensitive: lowercase token "error" finds positions in "ERROR log ERROR" (proves extract_positions searches in to_ascii_lowercase'd text) - highlight_positions_multiple_occurrences: same term appears twice, all positions captured - highlight_positions_no_match: no highlight when term not in document - highlight_positions_empty_field: no highlight for empty text field Tests use MatchQuery (not TermQuery) because MatchQuery tokenizes and matches case-insensitively, while TermQuery requires exact value match. --- .../cloudsearch-index/tests/coverage.rs | 164 +++++++++++++++++- 1 file changed, 162 insertions(+), 2 deletions(-) diff --git a/rust/crates/cloudsearch-index/tests/coverage.rs b/rust/crates/cloudsearch-index/tests/coverage.rs index e67aa12..cb134df 100644 --- a/rust/crates/cloudsearch-index/tests/coverage.rs +++ b/rust/crates/cloudsearch-index/tests/coverage.rs @@ -3,8 +3,8 @@ //! Run with: cargo test -p cloudsearch-index --test coverage use cloudsearch_common::{ - BoolQuery, CreateIndexRequest, IndexDocument, IndexSettings, SearchQuery, SearchRequest, - SortOrder, SortSpec, TermQuery, + BoolQuery, CreateIndexRequest, IndexDocument, IndexSettings, MatchQuery, SearchQuery, + SearchRequest, SortOrder, SortSpec, TermQuery, }; use cloudsearch_index::{IndexCatalog, MergePlan}; use cloudsearch_storage::SegmentMeta; @@ -191,3 +191,163 @@ async fn validate_search_request_rejects_from_exceeding_max() { "from exceeding MAX_SEARCH_OFFSET should be rejected" ); } + +#[tokio::test] +async fn highlight_positions_case_insensitive() { + // Index doc with mixed-case text, search for lowercase term. + // extract_positions now searches in to_ascii_lowercase()'d text, + // so token "error" finds "ERROR" positions correctly. + let temp_dir = TempDir::new().expect("temp dir"); + let catalog = Arc::new(IndexCatalog::new(temp_dir.path())); + catalog.initialize().await.expect("init catalog"); + let _metadata = catalog + .create_index( + "test", + CreateIndexRequest { + settings: IndexSettings::default(), + ..Default::default() + }, + ) + .await + .expect("create index"); + let mut handle = catalog.open_index("test").await.expect("open index"); + + handle + .index_document(doc("1", serde_json::json!({"content": "ERROR log ERROR message"}))) + .await + .expect("index"); + handle.refresh().await.expect("refresh"); + handle.flush().await.expect("flush"); + + let result = handle.search(&SearchRequest { + query: Some(SearchQuery::Match(MatchQuery { + field: "content".to_string(), + value: "error".to_string(), + })), + ..Default::default() + }); + + let hit = result.hits.hits.first().expect("should have a hit"); + assert!( + hit.highlight.is_some(), + "lowercase 'error' should find highlight in 'ERROR log ERROR'" + ); +} + +#[tokio::test] +async fn highlight_positions_multiple_occurrences() { + // Same term appears twice in one field — all positions should be captured. + let temp_dir = TempDir::new().expect("temp dir"); + let catalog = Arc::new(IndexCatalog::new(temp_dir.path())); + catalog.initialize().await.expect("init catalog"); + let _metadata = catalog + .create_index( + "test", + CreateIndexRequest { + settings: IndexSettings::default(), + ..Default::default() + }, + ) + .await + .expect("create index"); + let mut handle = catalog.open_index("test").await.expect("open index"); + + handle + .index_document(doc("1", serde_json::json!({"content": "info error info error"}))) + .await + .expect("index"); + handle.refresh().await.expect("refresh"); + handle.flush().await.expect("flush"); + + let result = handle.search(&SearchRequest { + query: Some(SearchQuery::Match(MatchQuery { + field: "content".to_string(), + value: "info".to_string(), + })), + ..Default::default() + }); + + let hit = result.hits.hits.first().expect("should have a hit"); + assert!( + hit.highlight.is_some(), + "'info' should be highlighted even when it appears twice" + ); +} + +#[tokio::test] +async fn highlight_positions_no_match() { + // Term not in document — no highlight. + let temp_dir = TempDir::new().expect("temp dir"); + let catalog = Arc::new(IndexCatalog::new(temp_dir.path())); + catalog.initialize().await.expect("init catalog"); + let _metadata = catalog + .create_index( + "test", + CreateIndexRequest { + settings: IndexSettings::default(), + ..Default::default() + }, + ) + .await + .expect("create index"); + let mut handle = catalog.open_index("test").await.expect("open index"); + + handle + .index_document(doc("1", serde_json::json!({"content": "hello world"}))) + .await + .expect("index"); + handle.flush().await.expect("flush"); + + let result = handle.search(&SearchRequest { + query: Some(SearchQuery::Match(MatchQuery { + field: "content".to_string(), + value: "xyznotfound".to_string(), + })), + ..Default::default() + }); + + assert!( + result.hits.hits.is_empty() + || result.hits.hits.iter().all(|h| h.highlight.is_none()), + "no highlight for term not in document" + ); +} + +#[tokio::test] +async fn highlight_positions_empty_field() { + // Document with empty text field — no highlights. + let temp_dir = TempDir::new().expect("temp dir"); + let catalog = Arc::new(IndexCatalog::new(temp_dir.path())); + catalog.initialize().await.expect("init catalog"); + let _metadata = catalog + .create_index( + "test", + CreateIndexRequest { + settings: IndexSettings::default(), + ..Default::default() + }, + ) + .await + .expect("create index"); + let mut handle = catalog.open_index("test").await.expect("open index"); + + handle + .index_document(doc("1", serde_json::json!({"content": ""}))) + .await + .expect("index"); + handle.flush().await.expect("flush"); + + let result = handle.search(&SearchRequest { + query: Some(SearchQuery::Match(MatchQuery { + field: "content".to_string(), + value: "any".to_string(), + })), + ..Default::default() + }); + + assert!( + result.hits.hits.is_empty() + || result.hits.hits.iter().all(|h| h.highlight.is_none()), + "no highlight for empty text field" + ); +} From 535bf61a0ad2ea22f08551ccee8462d486d9e22f Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Poyraz=20K=C3=BC=C3=A7=C3=BCkarslan?= <83272398+PoyrazK@users.noreply.github.com> Date: Sat, 2 May 2026 14:28:07 +0300 Subject: [PATCH 04/10] style: apply rustfmt --- rust/crates/cloudsearch-index/tests/coverage.rs | 6 ++---- 1 file changed, 2 insertions(+), 4 deletions(-) diff --git a/rust/crates/cloudsearch-index/tests/coverage.rs b/rust/crates/cloudsearch-index/tests/coverage.rs index cb134df..f52dae7 100644 --- a/rust/crates/cloudsearch-index/tests/coverage.rs +++ b/rust/crates/cloudsearch-index/tests/coverage.rs @@ -307,8 +307,7 @@ async fn highlight_positions_no_match() { }); assert!( - result.hits.hits.is_empty() - || result.hits.hits.iter().all(|h| h.highlight.is_none()), + result.hits.hits.is_empty() || result.hits.hits.iter().all(|h| h.highlight.is_none()), "no highlight for term not in document" ); } @@ -346,8 +345,7 @@ async fn highlight_positions_empty_field() { }); assert!( - result.hits.hits.is_empty() - || result.hits.hits.iter().all(|h| h.highlight.is_none()), + result.hits.hits.is_empty() || result.hits.hits.iter().all(|h| h.highlight.is_none()), "no highlight for empty text field" ); } From b2f47a19567ae25a4e6ef209b20187652fe66fac Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Poyraz=20K=C3=BC=C3=A7=C3=BCkarslan?= <83272398+PoyrazK@users.noreply.github.com> Date: Sat, 2 May 2026 14:48:59 +0300 Subject: [PATCH 05/10] style: apply rustfmt to coverage tests --- rust/crates/cloudsearch-index/tests/coverage.rs | 10 ++++++++-- 1 file changed, 8 insertions(+), 2 deletions(-) diff --git a/rust/crates/cloudsearch-index/tests/coverage.rs b/rust/crates/cloudsearch-index/tests/coverage.rs index f52dae7..e73bf7c 100644 --- a/rust/crates/cloudsearch-index/tests/coverage.rs +++ b/rust/crates/cloudsearch-index/tests/coverage.rs @@ -213,7 +213,10 @@ async fn highlight_positions_case_insensitive() { let mut handle = catalog.open_index("test").await.expect("open index"); handle - .index_document(doc("1", serde_json::json!({"content": "ERROR log ERROR message"}))) + .index_document(doc( + "1", + serde_json::json!({"content": "ERROR log ERROR message"}), + )) .await .expect("index"); handle.refresh().await.expect("refresh"); @@ -253,7 +256,10 @@ async fn highlight_positions_multiple_occurrences() { let mut handle = catalog.open_index("test").await.expect("open index"); handle - .index_document(doc("1", serde_json::json!({"content": "info error info error"}))) + .index_document(doc( + "1", + serde_json::json!({"content": "info error info error"}), + )) .await .expect("index"); handle.refresh().await.expect("refresh"); From ebedb3631db1d172b93d91735eadce664630136e Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Poyraz=20K=C3=BC=C3=A7=C3=BCkarslan?= <83272398+PoyrazK@users.noreply.github.com> Date: Sat, 2 May 2026 15:04:07 +0300 Subject: [PATCH 06/10] fix(index): log warning and skip when byte offset exceeds u32::MAX In extract_positions(), replace unwrap_or(0) with let-else that logs a warning and continues when u32::try_from(search_from + pos) fails. Previously, overflow silently returned 0, corrupting position data. Fixes pre-existing issue identified in PR #89 review. --- rust/crates/cloudsearch-index/src/lib.rs | 11 ++++++++++- 1 file changed, 10 insertions(+), 1 deletion(-) diff --git a/rust/crates/cloudsearch-index/src/lib.rs b/rust/crates/cloudsearch-index/src/lib.rs index 028e20d..5ee1bde 100644 --- a/rust/crates/cloudsearch-index/src/lib.rs +++ b/rust/crates/cloudsearch-index/src/lib.rs @@ -1003,7 +1003,16 @@ impl IndexHandle { for token in &tokens { let mut search_from = 0usize; while let Some(pos) = lower_text[search_from..].find(token) { - let byte_offset = u32::try_from(search_from + pos).unwrap_or(0); + let Ok(byte_offset) = u32::try_from(search_from + pos) else { + tracing::warn!( + offset = search_from + pos, + "byte offset exceeds u32::MAX, skipping position for term '{}' in doc '{}'", + token, + document.id + ); + search_from += pos + 1; + continue; + }; seen_offsets .entry(token.clone()) .or_default() From 143ca18d6535380d1173310f9278231b9bf4414d Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Poyraz=20K=C3=BC=C3=A7=C3=BCkarslan?= <83272398+PoyrazK@users.noreply.github.com> Date: Sat, 2 May 2026 15:59:18 +0300 Subject: [PATCH 07/10] fix(index): advance UTF-8 boundary in extract_positions and add refresh in tests 1. extract_positions: use pos + token.len() instead of pos + 1 when advancing search_from. Using pos + 1 could land inside a UTF-8 continuation byte for multi-byte characters, causing the next slice to start at an invalid boundary and panic. 2. coverage tests: add handle.refresh() before flush in highlight_positions_no_match and highlight_positions_empty_field tests so the indexed document moves into searchable_documents before the search is executed. --- rust/crates/cloudsearch-index/src/lib.rs | 2 +- rust/crates/cloudsearch-index/tests/coverage.rs | 2 ++ 2 files changed, 3 insertions(+), 1 deletion(-) diff --git a/rust/crates/cloudsearch-index/src/lib.rs b/rust/crates/cloudsearch-index/src/lib.rs index 5ee1bde..31f61a8 100644 --- a/rust/crates/cloudsearch-index/src/lib.rs +++ b/rust/crates/cloudsearch-index/src/lib.rs @@ -1017,7 +1017,7 @@ impl IndexHandle { .entry(token.clone()) .or_default() .push(byte_offset); - search_from += pos + 1; + search_from += pos + token.len(); } } for (term, positions) in seen_offsets { diff --git a/rust/crates/cloudsearch-index/tests/coverage.rs b/rust/crates/cloudsearch-index/tests/coverage.rs index e73bf7c..631b197 100644 --- a/rust/crates/cloudsearch-index/tests/coverage.rs +++ b/rust/crates/cloudsearch-index/tests/coverage.rs @@ -302,6 +302,7 @@ async fn highlight_positions_no_match() { .index_document(doc("1", serde_json::json!({"content": "hello world"}))) .await .expect("index"); + handle.refresh().await.expect("refresh"); handle.flush().await.expect("flush"); let result = handle.search(&SearchRequest { @@ -340,6 +341,7 @@ async fn highlight_positions_empty_field() { .index_document(doc("1", serde_json::json!({"content": ""}))) .await .expect("index"); + handle.refresh().await.expect("refresh"); handle.flush().await.expect("flush"); let result = handle.search(&SearchRequest { From b046a41e4ad4fa1ff6717ef7dc3968f5eea85550 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Poyraz=20K=C3=BC=C3=A7=C3=BCkarslan?= <83272398+PoyrazK@users.noreply.github.com> Date: Sat, 2 May 2026 16:19:05 +0300 Subject: [PATCH 08/10] test(index): add tokenize unit tests and strengthen highlight assertion MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit 1. Add 4 tokenize unit tests in lib.rs (same file, #[cfg(test)] module): - tokenize_lowercase_converts_correctly: "Hello World" → ["hello", "world"] - tokenize_preserves_single_tokens: "test" → ["test"] - tokenize_multiple_whitespace_collapsed: "a b" → ["a", "b"] - tokenize_empty_string: "" → [] 2. Strengthen highlight_positions_no_match assertion — doc is indexed and flushed, then searched with MatchQuery for non-matching term. Asserts the hit has no highlight (not just that hits is empty). --- rust/crates/cloudsearch-index/src/lib.rs | 24 ++++++++++++++++++++++++ 1 file changed, 24 insertions(+) diff --git a/rust/crates/cloudsearch-index/src/lib.rs b/rust/crates/cloudsearch-index/src/lib.rs index 31f61a8..6a9910e 100644 --- a/rust/crates/cloudsearch-index/src/lib.rs +++ b/rust/crates/cloudsearch-index/src/lib.rs @@ -5322,4 +5322,28 @@ mod tests { CloudSearchError::WalChecksumMismatch | CloudSearchError::InvalidWalRecord(_) )); } + + #[test] + fn tokenize_lowercase_converts_correctly() { + let result = tokenize("Hello World"); + assert_eq!(result, vec!["hello", "world"]); + } + + #[test] + fn tokenize_preserves_single_tokens() { + let result = tokenize("test"); + assert_eq!(result, vec!["test"]); + } + + #[test] + fn tokenize_multiple_whitespace_collapsed() { + let result = tokenize("a b"); + assert_eq!(result, vec!["a", "b"]); + } + + #[test] + fn tokenize_empty_string() { + let result = tokenize(""); + assert!(result.is_empty()); + } } From 71b2e708778d1fe007e6f4970c6595492807be12 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Poyraz=20K=C3=BC=C3=A7=C3=BCkarslan?= <83272398+PoyrazK@users.noreply.github.com> Date: Sat, 2 May 2026 16:40:00 +0300 Subject: [PATCH 09/10] test(index): strengthen highlight_positions_no_match with MatchAll query --- rust/crates/cloudsearch-index/tests/coverage.rs | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/rust/crates/cloudsearch-index/tests/coverage.rs b/rust/crates/cloudsearch-index/tests/coverage.rs index 631b197..9edfa64 100644 --- a/rust/crates/cloudsearch-index/tests/coverage.rs +++ b/rust/crates/cloudsearch-index/tests/coverage.rs @@ -305,17 +305,17 @@ async fn highlight_positions_no_match() { handle.refresh().await.expect("refresh"); handle.flush().await.expect("flush"); + // Use MatchAll so the document IS found (giving us a hit to assert on), + // then verify that hit has no highlight since no query terms are provided. let result = handle.search(&SearchRequest { - query: Some(SearchQuery::Match(MatchQuery { - field: "content".to_string(), - value: "xyznotfound".to_string(), - })), + query: Some(SearchQuery::MatchAll), ..Default::default() }); + let hit = result.hits.hits.first().expect("doc should be found with MatchAll"); assert!( - result.hits.hits.is_empty() || result.hits.hits.iter().all(|h| h.highlight.is_none()), - "no highlight for term not in document" + hit.highlight.is_none(), + "MatchAll query should produce no highlight (no query terms to highlight)" ); } From c27d7b20b66817b37cc402f797c459ec82ee5a59 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Poyraz=20K=C3=BC=C3=A7=C3=BCkarslan?= <83272398+PoyrazK@users.noreply.github.com> Date: Sat, 2 May 2026 16:44:20 +0300 Subject: [PATCH 10/10] style: apply rustfmt to coverage.rs --- rust/crates/cloudsearch-index/tests/coverage.rs | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/rust/crates/cloudsearch-index/tests/coverage.rs b/rust/crates/cloudsearch-index/tests/coverage.rs index 9edfa64..600fd0e 100644 --- a/rust/crates/cloudsearch-index/tests/coverage.rs +++ b/rust/crates/cloudsearch-index/tests/coverage.rs @@ -312,7 +312,11 @@ async fn highlight_positions_no_match() { ..Default::default() }); - let hit = result.hits.hits.first().expect("doc should be found with MatchAll"); + let hit = result + .hits + .hits + .first() + .expect("doc should be found with MatchAll"); assert!( hit.highlight.is_none(), "MatchAll query should produce no highlight (no query terms to highlight)"