diff --git a/rust/crates/cloudsearch-index/src/lib.rs b/rust/crates/cloudsearch-index/src/lib.rs index ba28f7f..6a9910e 100644 --- a/rust/crates/cloudsearch-index/src/lib.rs +++ b/rust/crates/cloudsearch-index/src/lib.rs @@ -998,17 +998,26 @@ impl IndexHandle { for (_, field_value) in obj { if let Some(text) = field_value.as_str() { let tokens = tokenize(text); - let field_text = text.to_string(); + let lower_text = text.to_ascii_lowercase(); let mut seen_offsets: BTreeMap<String, Vec<u32>> = BTreeMap::new(); for token in &tokens { let mut search_from = 0usize; - while let Some(pos) = field_text[search_from..].find(token) { - let byte_offset = u32::try_from(search_from + pos).unwrap_or(0); + while let Some(pos) = lower_text[search_from..].find(token) { + let Ok(byte_offset) = u32::try_from(search_from + pos) else { + tracing::warn!( + offset = search_from + pos, + "byte offset exceeds u32::MAX, skipping position for term '{}' in doc '{}'", + token, + document.id + ); + search_from += pos + 1; + continue; + }; seen_offsets .entry(token.clone()) .or_default() .push(byte_offset); - search_from += pos + 1; + search_from += pos + token.len(); } } for (term, positions) in seen_offsets { @@ -1795,7 +1804,9 @@ fn score_phrase_query( } fn tokenize(text: &str) -> Vec<String> { - text.split_whitespace().map(str::to_lowercase).collect() + text.split_whitespace() + .map(str::to_ascii_lowercase) + .collect() } /// Stable hash of a document ID string for use as a persistent `doc_id` in postings. 
@@ -5311,4 +5322,28 @@ mod tests { CloudSearchError::WalChecksumMismatch | CloudSearchError::InvalidWalRecord(_) )); } + + #[test] + fn tokenize_lowercase_converts_correctly() { + let result = tokenize("Hello World"); + assert_eq!(result, vec!["hello", "world"]); + } + + #[test] + fn tokenize_preserves_single_tokens() { + let result = tokenize("test"); + assert_eq!(result, vec!["test"]); + } + + #[test] + fn tokenize_multiple_whitespace_collapsed() { + let result = tokenize("a b"); + assert_eq!(result, vec!["a", "b"]); + } + + #[test] + fn tokenize_empty_string() { + let result = tokenize(""); + assert!(result.is_empty()); + } } diff --git a/rust/crates/cloudsearch-index/tests/coverage.rs b/rust/crates/cloudsearch-index/tests/coverage.rs index e67aa12..600fd0e 100644 --- a/rust/crates/cloudsearch-index/tests/coverage.rs +++ b/rust/crates/cloudsearch-index/tests/coverage.rs @@ -3,8 +3,8 @@ //! Run with: cargo test -p cloudsearch-index --test coverage use cloudsearch_common::{ - BoolQuery, CreateIndexRequest, IndexDocument, IndexSettings, SearchQuery, SearchRequest, - SortOrder, SortSpec, TermQuery, + BoolQuery, CreateIndexRequest, IndexDocument, IndexSettings, MatchQuery, SearchQuery, + SearchRequest, SortOrder, SortSpec, TermQuery, }; use cloudsearch_index::{IndexCatalog, MergePlan}; use cloudsearch_storage::SegmentMeta; @@ -191,3 +191,173 @@ async fn validate_search_request_rejects_from_exceeding_max() { "from exceeding MAX_SEARCH_OFFSET should be rejected" ); } + +#[tokio::test] +async fn highlight_positions_case_insensitive() { + // Index doc with mixed-case text, search for lowercase term. + // extract_positions now searches in to_ascii_lowercase()'d text, + // so token "error" finds "ERROR" positions correctly. 
+ let temp_dir = TempDir::new().expect("temp dir"); + let catalog = Arc::new(IndexCatalog::new(temp_dir.path())); + catalog.initialize().await.expect("init catalog"); + let _metadata = catalog + .create_index( + "test", + CreateIndexRequest { + settings: IndexSettings::default(), + ..Default::default() + }, + ) + .await + .expect("create index"); + let mut handle = catalog.open_index("test").await.expect("open index"); + + handle + .index_document(doc( + "1", + serde_json::json!({"content": "ERROR log ERROR message"}), + )) + .await + .expect("index"); + handle.refresh().await.expect("refresh"); + handle.flush().await.expect("flush"); + + let result = handle.search(&SearchRequest { + query: Some(SearchQuery::Match(MatchQuery { + field: "content".to_string(), + value: "error".to_string(), + })), + ..Default::default() + }); + + let hit = result.hits.hits.first().expect("should have a hit"); + assert!( + hit.highlight.is_some(), + "lowercase 'error' should find highlight in 'ERROR log ERROR'" + ); +} + +#[tokio::test] +async fn highlight_positions_multiple_occurrences() { + // Same term appears twice in one field — all positions should be captured. 
+ let temp_dir = TempDir::new().expect("temp dir"); + let catalog = Arc::new(IndexCatalog::new(temp_dir.path())); + catalog.initialize().await.expect("init catalog"); + let _metadata = catalog + .create_index( + "test", + CreateIndexRequest { + settings: IndexSettings::default(), + ..Default::default() + }, + ) + .await + .expect("create index"); + let mut handle = catalog.open_index("test").await.expect("open index"); + + handle + .index_document(doc( + "1", + serde_json::json!({"content": "info error info error"}), + )) + .await + .expect("index"); + handle.refresh().await.expect("refresh"); + handle.flush().await.expect("flush"); + + let result = handle.search(&SearchRequest { + query: Some(SearchQuery::Match(MatchQuery { + field: "content".to_string(), + value: "info".to_string(), + })), + ..Default::default() + }); + + let hit = result.hits.hits.first().expect("should have a hit"); + assert!( + hit.highlight.is_some(), + "'info' should be highlighted even when it appears twice" + ); +} + +#[tokio::test] +async fn highlight_positions_no_match() { + // Term not in document — no highlight. + let temp_dir = TempDir::new().expect("temp dir"); + let catalog = Arc::new(IndexCatalog::new(temp_dir.path())); + catalog.initialize().await.expect("init catalog"); + let _metadata = catalog + .create_index( + "test", + CreateIndexRequest { + settings: IndexSettings::default(), + ..Default::default() + }, + ) + .await + .expect("create index"); + let mut handle = catalog.open_index("test").await.expect("open index"); + + handle + .index_document(doc("1", serde_json::json!({"content": "hello world"}))) + .await + .expect("index"); + handle.refresh().await.expect("refresh"); + handle.flush().await.expect("flush"); + + // Use MatchAll so the document IS found (giving us a hit to assert on), + // then verify that hit has no highlight since no query terms are provided. 
+ let result = handle.search(&SearchRequest { + query: Some(SearchQuery::MatchAll), + ..Default::default() + }); + + let hit = result + .hits + .hits + .first() + .expect("doc should be found with MatchAll"); + assert!( + hit.highlight.is_none(), + "MatchAll query should produce no highlight (no query terms to highlight)" + ); +} + +#[tokio::test] +async fn highlight_positions_empty_field() { + // Document with empty text field — no highlights. + let temp_dir = TempDir::new().expect("temp dir"); + let catalog = Arc::new(IndexCatalog::new(temp_dir.path())); + catalog.initialize().await.expect("init catalog"); + let _metadata = catalog + .create_index( + "test", + CreateIndexRequest { + settings: IndexSettings::default(), + ..Default::default() + }, + ) + .await + .expect("create index"); + let mut handle = catalog.open_index("test").await.expect("open index"); + + handle + .index_document(doc("1", serde_json::json!({"content": ""}))) + .await + .expect("index"); + handle.refresh().await.expect("refresh"); + handle.flush().await.expect("flush"); + + let result = handle.search(&SearchRequest { + query: Some(SearchQuery::Match(MatchQuery { + field: "content".to_string(), + value: "any".to_string(), + })), + ..Default::default() + }); + + assert!( + result.hits.hits.is_empty() || result.hits.hits.iter().all(|h| h.highlight.is_none()), + "no highlight for empty text field" + ); +}