Skip to content
Merged
45 changes: 40 additions & 5 deletions rust/crates/cloudsearch-index/src/lib.rs
Original file line number Diff line number Diff line change
Expand Up @@ -998,17 +998,26 @@ impl IndexHandle {
for (_, field_value) in obj {
if let Some(text) = field_value.as_str() {
let tokens = tokenize(text);
let field_text = text.to_string();
let lower_text = text.to_ascii_lowercase();
let mut seen_offsets: BTreeMap<String, Vec<u32>> = BTreeMap::new();
for token in &tokens {
let mut search_from = 0usize;
while let Some(pos) = field_text[search_from..].find(token) {
let byte_offset = u32::try_from(search_from + pos).unwrap_or(0);
while let Some(pos) = lower_text[search_from..].find(token) {
let Ok(byte_offset) = u32::try_from(search_from + pos) else {
tracing::warn!(
offset = search_from + pos,
"byte offset exceeds u32::MAX, skipping position for term '{}' in doc '{}'",
token,
document.id
);
search_from += pos + 1;
continue;
};
seen_offsets
.entry(token.clone())
.or_default()
.push(byte_offset);
search_from += pos + 1;
search_from += pos + token.len();
}
}
for (term, positions) in seen_offsets {
Expand Down Expand Up @@ -1795,7 +1804,9 @@ fn score_phrase_query(
}

fn tokenize(text: &str) -> Vec<String> {
text.split_whitespace().map(str::to_lowercase).collect()
text.split_whitespace()
.map(str::to_ascii_lowercase)
.collect()
}

/// Stable hash of a document ID string for use as a persistent `doc_id` in postings.
Expand Down Expand Up @@ -5311,4 +5322,28 @@ mod tests {
CloudSearchError::WalChecksumMismatch | CloudSearchError::InvalidWalRecord(_)
));
}

#[test]
fn tokenize_lowercase_converts_correctly() {
let result = tokenize("Hello World");
assert_eq!(result, vec!["hello", "world"]);
}

#[test]
fn tokenize_preserves_single_tokens() {
let result = tokenize("test");
assert_eq!(result, vec!["test"]);
}

#[test]
fn tokenize_multiple_whitespace_collapsed() {
let result = tokenize("a b");
assert_eq!(result, vec!["a", "b"]);
}

#[test]
fn tokenize_empty_string() {
let result = tokenize("");
assert!(result.is_empty());
}
}
174 changes: 172 additions & 2 deletions rust/crates/cloudsearch-index/tests/coverage.rs
Original file line number Diff line number Diff line change
Expand Up @@ -3,8 +3,8 @@
//! Run with: cargo test -p cloudsearch-index --test coverage

use cloudsearch_common::{
BoolQuery, CreateIndexRequest, IndexDocument, IndexSettings, SearchQuery, SearchRequest,
SortOrder, SortSpec, TermQuery,
BoolQuery, CreateIndexRequest, IndexDocument, IndexSettings, MatchQuery, SearchQuery,
SearchRequest, SortOrder, SortSpec, TermQuery,
};
use cloudsearch_index::{IndexCatalog, MergePlan};
use cloudsearch_storage::SegmentMeta;
Expand Down Expand Up @@ -191,3 +191,173 @@ async fn validate_search_request_rejects_from_exceeding_max() {
"from exceeding MAX_SEARCH_OFFSET should be rejected"
);
}

#[tokio::test]
async fn highlight_positions_case_insensitive() {
// Index doc with mixed-case text, search for lowercase term.
// extract_positions now searches in to_ascii_lowercase()'d text,
// so token "error" finds "ERROR" positions correctly.
let temp_dir = TempDir::new().expect("temp dir");
let catalog = Arc::new(IndexCatalog::new(temp_dir.path()));
catalog.initialize().await.expect("init catalog");
let _metadata = catalog
.create_index(
"test",
CreateIndexRequest {
settings: IndexSettings::default(),
..Default::default()
},
)
.await
.expect("create index");
let mut handle = catalog.open_index("test").await.expect("open index");

handle
.index_document(doc(
"1",
serde_json::json!({"content": "ERROR log ERROR message"}),
))
.await
.expect("index");
handle.refresh().await.expect("refresh");
handle.flush().await.expect("flush");

let result = handle.search(&SearchRequest {
query: Some(SearchQuery::Match(MatchQuery {
field: "content".to_string(),
value: "error".to_string(),
})),
..Default::default()
});

let hit = result.hits.hits.first().expect("should have a hit");
assert!(
hit.highlight.is_some(),
"lowercase 'error' should find highlight in 'ERROR log ERROR'"
);
}

#[tokio::test]
async fn highlight_positions_multiple_occurrences() {
// Same term appears twice in one field — all positions should be captured.
let temp_dir = TempDir::new().expect("temp dir");
let catalog = Arc::new(IndexCatalog::new(temp_dir.path()));
catalog.initialize().await.expect("init catalog");
let _metadata = catalog
.create_index(
"test",
CreateIndexRequest {
settings: IndexSettings::default(),
..Default::default()
},
)
.await
.expect("create index");
let mut handle = catalog.open_index("test").await.expect("open index");

handle
.index_document(doc(
"1",
serde_json::json!({"content": "info error info error"}),
))
.await
.expect("index");
handle.refresh().await.expect("refresh");
handle.flush().await.expect("flush");

let result = handle.search(&SearchRequest {
query: Some(SearchQuery::Match(MatchQuery {
field: "content".to_string(),
value: "info".to_string(),
})),
..Default::default()
});

let hit = result.hits.hits.first().expect("should have a hit");
assert!(
hit.highlight.is_some(),
"'info' should be highlighted even when it appears twice"
);
}

#[tokio::test]
async fn highlight_positions_no_match() {
// Term not in document — no highlight.
let temp_dir = TempDir::new().expect("temp dir");
let catalog = Arc::new(IndexCatalog::new(temp_dir.path()));
catalog.initialize().await.expect("init catalog");
let _metadata = catalog
.create_index(
"test",
CreateIndexRequest {
settings: IndexSettings::default(),
..Default::default()
},
)
.await
.expect("create index");
let mut handle = catalog.open_index("test").await.expect("open index");

handle
.index_document(doc("1", serde_json::json!({"content": "hello world"})))
.await
.expect("index");
handle.refresh().await.expect("refresh");
handle.flush().await.expect("flush");

// Use MatchAll so the document IS found (giving us a hit to assert on),
// then verify that hit has no highlight since no query terms are provided.
let result = handle.search(&SearchRequest {
query: Some(SearchQuery::MatchAll),
..Default::default()
});

let hit = result
.hits
.hits
.first()
.expect("doc should be found with MatchAll");
assert!(
hit.highlight.is_none(),
"MatchAll query should produce no highlight (no query terms to highlight)"
);
}
Comment thread
coderabbitai[bot] marked this conversation as resolved.

#[tokio::test]
async fn highlight_positions_empty_field() {
// Document with empty text field — no highlights.
let temp_dir = TempDir::new().expect("temp dir");
let catalog = Arc::new(IndexCatalog::new(temp_dir.path()));
catalog.initialize().await.expect("init catalog");
let _metadata = catalog
.create_index(
"test",
CreateIndexRequest {
settings: IndexSettings::default(),
..Default::default()
},
)
.await
.expect("create index");
let mut handle = catalog.open_index("test").await.expect("open index");

handle
.index_document(doc("1", serde_json::json!({"content": ""})))
.await
.expect("index");
handle.refresh().await.expect("refresh");
handle.flush().await.expect("flush");

let result = handle.search(&SearchRequest {
query: Some(SearchQuery::Match(MatchQuery {
field: "content".to_string(),
value: "any".to_string(),
})),
..Default::default()
});

assert!(
result.hits.hits.is_empty() || result.hits.hits.iter().all(|h| h.highlight.is_none()),
"no highlight for empty text field"
);
}