From 8d8c2c023b61ee4a3485e05b064e70d6a3a2ee9f Mon Sep 17 00:00:00 2001 From: Joan Antoni RE Date: Thu, 16 Apr 2026 12:09:15 +0200 Subject: [PATCH 1/7] Run extracted text search in a tokio blocking thread --- nidx/src/searcher/shard_text.rs | 20 ++++++++++++++++++-- 1 file changed, 18 insertions(+), 2 deletions(-) diff --git a/nidx/src/searcher/shard_text.rs b/nidx/src/searcher/shard_text.rs index 568dc8cda4..3b10f59bf2 100644 --- a/nidx/src/searcher/shard_text.rs +++ b/nidx/src/searcher/shard_text.rs @@ -23,6 +23,7 @@ use std::time::Instant; use nidx_protos::{ExtractedTextsRequest, ExtractedTextsResponse}; use nidx_text::{FieldUid, ParagraphUid, TextSearcher}; +use tracing::Span; use crate::errors::{NidxError, NidxResult}; use crate::searcher::index_cache::IndexCache; @@ -47,8 +48,24 @@ pub async fn extracted_texts( return Err(NidxError::NotFound); }; let index = index_cache.get(&text_index_id).await?; - let searcher: &TextSearcher = index.as_ref().into(); + let span = Span::current(); + let extracted_texts = tokio::task::spawn_blocking(move || { + span.in_scope(|| { + let searcher: &TextSearcher = index.as_ref().into(); + blocking_extracted_texts(searcher, request) + }) + }) + .await??; + + tracing::debug!("Extracted texts took {:?}", start.elapsed()); + Ok(extracted_texts) +} + +fn blocking_extracted_texts( + searcher: &TextSearcher, + request: ExtractedTextsRequest, +) -> NidxResult { let mut extracted_texts = ExtractedTextsResponse::default(); if !request.field_ids.is_empty() { @@ -85,6 +102,5 @@ pub async fn extracted_texts( } } - tracing::info!("Extracted texts took {:?}µs", start.elapsed().as_micros()); Ok(extracted_texts) } From d8403cbb9d56cff8da4dc3cb8516ca73d0066e25 Mon Sep 17 00:00:00 2001 From: Joan Antoni RE Date: Thu, 16 Apr 2026 14:33:07 +0200 Subject: [PATCH 2/7] Optimizations to get paragraph text - Directly get paragraph text instead of cloning the whole field text and then cutting - Reuse the same chars iterator for multiple paragraphs on the 
same field --- nidx/nidx_text/src/lib.rs | 86 ++++++++++++++++++++++------------- nidx/nidx_text/src/reader.rs | 88 +++++++++++++++++++++++++++++++++++- 2 files changed, 141 insertions(+), 33 deletions(-) diff --git a/nidx/nidx_text/src/lib.rs b/nidx/nidx_text/src/lib.rs index 3dd6703997..e70d7c59e5 100644 --- a/nidx/nidx_text/src/lib.rs +++ b/nidx/nidx_text/src/lib.rs @@ -85,7 +85,7 @@ pub struct FieldUid { } // Unique id for a field, equivalent to {rid}/{field_type}/{field_id}[/{split}]/{paragraph_start}-{paragraph_end} -#[derive(Clone, PartialEq, Eq, Hash)] +#[derive(Clone, PartialEq, Eq, Hash, PartialOrd, Ord)] pub struct ParagraphUid { pub rid: String, pub field_type: String, @@ -224,37 +224,7 @@ impl TextSearcher { &self, paragraph_uids: Vec, ) -> anyhow::Result>> { - let mut paragraph_fields = HashMap::new(); - for paragraph_id in paragraph_uids { - let field_id = FieldUid::from(paragraph_id.clone()); - paragraph_fields - .entry(field_id) - .and_modify(|v: &mut Vec| v.push(paragraph_id.clone())) - .or_insert(vec![paragraph_id]); - } - - let fields_text = self - .reader - .get_fields_text(paragraph_fields.keys().cloned().collect())?; - - let mut paragraphs_text = HashMap::new(); - - for (field_id, field_text) in fields_text { - if let Some(paragraphs) = paragraph_fields.remove(&field_id) { - for paragraph_id in paragraphs { - let paragraph_text = field_text.as_ref().map(|field_text| { - field_text - .chars() - .skip(paragraph_id.paragraph_start as usize) - .take((paragraph_id.paragraph_end - paragraph_id.paragraph_start) as usize) - .collect() - }); - paragraphs_text.insert(paragraph_id, paragraph_text); - } - } - } - - Ok(paragraphs_text) + self.reader.get_paragraphs_text(paragraph_uids) } pub fn iterator(&self, request: &StreamRequest) -> anyhow::Result + use<>> { @@ -303,3 +273,55 @@ impl Display for ParagraphUid { } } } + +#[cfg(test)] +mod tests { + use super::*; + + #[test] + fn test_paragraph_uid_sorting() { + let mut paragraphs = vec![ + ParagraphUid 
{ + rid: "rid".to_string(), + field_type: "a".to_string(), + field_name: "title".to_string(), + split: None, + paragraph_start: 400, + paragraph_end: 500, + }, + ParagraphUid { + rid: "rid".to_string(), + field_type: "a".to_string(), + field_name: "title".to_string(), + split: None, + paragraph_start: 501, + paragraph_end: 555, + }, + ParagraphUid { + rid: "arid".to_string(), + field_type: "a".to_string(), + field_name: "title".to_string(), + split: None, + paragraph_start: 1000, + paragraph_end: 1020, + }, + ParagraphUid { + rid: "rid".to_string(), + field_type: "a".to_string(), + field_name: "title".to_string(), + split: None, + paragraph_start: 0, + paragraph_end: 20, + }, + ]; + paragraphs.sort(); + assert_eq!(paragraphs[0].rid, "arid"); + assert_eq!(paragraphs[0].paragraph_start, 1000); + assert_eq!(paragraphs[1].rid, "rid"); + assert_eq!(paragraphs[1].paragraph_start, 0); + assert_eq!(paragraphs[2].rid, "rid"); + assert_eq!(paragraphs[2].paragraph_start, 400); + assert_eq!(paragraphs[3].rid, "rid"); + assert_eq!(paragraphs[3].paragraph_start, 501); + } +} diff --git a/nidx/nidx_text/src/reader.rs b/nidx/nidx_text/src/reader.rs index 7081b6c3cb..f8b178d684 100644 --- a/nidx/nidx_text/src/reader.rs +++ b/nidx/nidx_text/src/reader.rs @@ -23,7 +23,7 @@ use std::time::*; use crate::schema::{datetime_utc_to_timestamp, decode_field_id, encode_field_id_bytes}; use crate::search_query::filter_to_query; -use crate::{DocumentSearchRequest, FieldUid, prefilter::*}; +use crate::{DocumentSearchRequest, FieldUid, ParagraphUid, prefilter::*}; use super::schema::TextSchema; use super::search_query; @@ -525,6 +525,92 @@ impl TextReaderService { Ok(texts) } + + pub fn get_paragraphs_text( + &self, + paragraph_uids: Vec, + ) -> anyhow::Result>> { + let mut field_paragraph_ids = HashMap::new(); + for paragraph_id in paragraph_uids { + let field_id = FieldUid::from(paragraph_id.clone()); + field_paragraph_ids + .entry(field_id) + .and_modify(|v: &mut Vec| 
v.push(paragraph_id.clone())) + .or_insert(vec![paragraph_id]); + } + + // we store a doc per field, so we expect at most the number of unique fields + let limit = field_paragraph_ids.len(); + + // due to implementation details, we use here a BooleanQuery as it's + // around 2 orders of magnitude faster than a TermSetQuery + let mut subqueries: Vec> = vec![]; + for field_uid in field_paragraph_ids.keys() { + subqueries.push(Box::new(TermQuery::new( + Term::from_field_bytes( + self.schema.encoded_field_id_bytes, + &encode_field_id_bytes( + Uuid::parse_str(&field_uid.rid)?, + &format!("{}/{}", field_uid.field_type, field_uid.field_name), + ), + ), + IndexRecordOption::Basic, + ))); + } + let query: Box = Box::new(BooleanQuery::union(subqueries)); + let collector = TopDocs::with_limit(limit).order_by_score(); + let searcher = self.reader.searcher(); + let results = searcher.search(&query, &collector)?; + + let mut paragraphs_text = HashMap::new(); + for (_score, doc_id) in results { + let doc = searcher.doc::(doc_id)?; + + let Some(text) = doc.get_first(self.schema.text).map(|value| value.as_str().unwrap()) else { + // can't do anything without extracted text + continue; + }; + let rid = String::from_utf8( + doc.get_first(self.schema.uuid) + .expect("document doesn't appear to have uuid.") + .as_bytes() + .unwrap() + .to_vec(), + ) + .unwrap(); + let field = decode_facet( + doc.get_first(self.schema.field) + .expect("document doesn't appear to have field.") + .as_facet() + .unwrap(), + ) + .to_path_string(); + + let parts: Vec<_> = field.split('/').collect(); // e.g. /a/title + let field_uid = FieldUid { + rid, + field_type: parts[1].to_string(), + field_name: parts[2].to_string(), + split: parts.get(3).map(|x| x.to_string()), + }; + + if let Some(paragraphs) = field_paragraph_ids.remove(&field_uid) { + // iterate the text by unicode characters only once, reusing the same iterator for + // all paragraphs on the field. 
This is more useful for multiple paragraphs per + // field on a large text + let mut paragraph_chars = text.chars(); + let mut skip = 0; + for paragraph_id in paragraphs.into_iter().sorted() { + skip = paragraph_id.paragraph_start as usize - skip; + let take = (paragraph_id.paragraph_end - paragraph_id.paragraph_start) as usize; + let paragraph_text = paragraph_chars.by_ref().skip(skip).take(take).collect(); + paragraphs_text.insert(paragraph_id, Some(paragraph_text)); + } + } + } + + Ok(paragraphs_text) + } } pub struct BatchProducer { From b35c1bb669ee8cafa1a4c873d02e0c166a091aa8 Mon Sep 17 00:00:00 2001 From: Joan Antoni RE Date: Thu, 16 Apr 2026 14:47:20 +0200 Subject: [PATCH 3/7] Clippy on test --- nidx/nidx_text/src/lib.rs | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/nidx/nidx_text/src/lib.rs b/nidx/nidx_text/src/lib.rs index e70d7c59e5..771811ea09 100644 --- a/nidx/nidx_text/src/lib.rs +++ b/nidx/nidx_text/src/lib.rs @@ -280,7 +280,7 @@ mod tests { #[test] fn test_paragraph_uid_sorting() { - let mut paragraphs = vec![ + let mut paragraphs = [ ParagraphUid { rid: "rid".to_string(), field_type: "a".to_string(), From 7d1a19f3a93c1e1738222e287a6d04b739123255 Mon Sep 17 00:00:00 2001 From: Joan Antoni RE Date: Thu, 16 Apr 2026 15:58:54 +0200 Subject: [PATCH 4/7] Extract fields query logic --- nidx/nidx_text/src/reader.rs | 71 +++++++++++++++--------------------- 1 file changed, 30 insertions(+), 41 deletions(-) diff --git a/nidx/nidx_text/src/reader.rs b/nidx/nidx_text/src/reader.rs index f8b178d684..30beb30e43 100644 --- a/nidx/nidx_text/src/reader.rs +++ b/nidx/nidx_text/src/reader.rs @@ -469,29 +469,10 @@ impl TextReaderService { } pub fn get_fields_text(&self, field_uids: Vec) -> anyhow::Result>> { - let limit = field_uids.len(); - - // due to implementation details, we use here a BooleanQuery as it's - // around 2 orders of magnitude faster than a TermSetQuery - let mut subqueries: Vec> = vec![]; - for uid in field_uids { - 
subqueries.push(Box::new(TermQuery::new( - Term::from_field_bytes( - self.schema.encoded_field_id_bytes, - &encode_field_id_bytes( - Uuid::parse_str(&uid.rid)?, - &format!("{}/{}", uid.field_type, uid.field_name), - ), - ), - IndexRecordOption::Basic, - ))); - } - let query: Box = Box::new(BooleanQuery::union(subqueries)); - let collector = TopDocs::with_limit(limit).order_by_score(); let searcher = self.reader.searcher(); + let results = self.search_fields(searcher.clone(), field_uids.iter())?; let mut texts = HashMap::new(); - let results = searcher.search(&query, &collector)?; for (_score, doc_id) in results { let doc = searcher.doc::(doc_id)?; let doc_value = doc.get_first(self.schema.text); @@ -539,28 +520,8 @@ impl TextReaderService { .or_insert(vec![paragraph_id]); } - // we store a doc per field, so we expect at most the number of unique fields - let limit = field_paragraph_ids.len(); - - // due to implementation details, we use here a BooleanQuery as it's - // around 2 orders of magnitude faster than a TermSetQuery - let mut subqueries: Vec> = vec![]; - for field_uid in field_paragraph_ids.keys() { - subqueries.push(Box::new(TermQuery::new( - Term::from_field_bytes( - self.schema.encoded_field_id_bytes, - &encode_field_id_bytes( - Uuid::parse_str(&field_uid.rid)?, - &format!("{}/{}", field_uid.field_type, field_uid.field_name), - ), - ), - IndexRecordOption::Basic, - ))); - } - let query: Box = Box::new(BooleanQuery::union(subqueries)); - let collector = TopDocs::with_limit(limit).order_by_score(); let searcher = self.reader.searcher(); - let results = searcher.search(&query, &collector)?; + let results = self.search_fields(searcher.clone(), field_paragraph_ids.keys())?; let mut paragraphs_text = HashMap::new(); for (_score, doc_id) in results { @@ -611,6 +572,34 @@ impl TextReaderService { Ok(paragraphs_text) } + + fn search_fields<'a>( + &self, + searcher: Searcher, + field_uids: impl Iterator, + ) -> anyhow::Result> { + // due to implementation details, 
we use here a BooleanQuery as it's + // around 2 orders of magnitude faster than a TermSetQuery + let mut subqueries: Vec> = vec![]; + for field_uid in field_uids { + subqueries.push(Box::new(TermQuery::new( + Term::from_field_bytes( + self.schema.encoded_field_id_bytes, + &encode_field_id_bytes( + Uuid::parse_str(&field_uid.rid)?, + &format!("{}/{}", field_uid.field_type, field_uid.field_name), + ), + ), + IndexRecordOption::Basic, + ))); + } + // we store a doc per field, so we expect at most the number of unique fields + let limit = subqueries.len(); + let query: Box = Box::new(BooleanQuery::union(subqueries)); + let collector = TopDocs::with_limit(limit).order_by_score(); + let results = searcher.search(&query, &collector)?; + Ok(results) + } } pub struct BatchProducer { From 534498516662d735c4dd14a85ab098f483fa22fc Mon Sep 17 00:00:00 2001 From: Joan Antoni RE Date: Thu, 16 Apr 2026 16:27:11 +0200 Subject: [PATCH 5/7] Fix implementation of extract paragraphs from an iterator --- nidx/nidx_text/src/lib.rs | 54 +------------------------- nidx/nidx_text/src/reader.rs | 74 ++++++++++++++++++++++++++++++++---- 2 files changed, 67 insertions(+), 61 deletions(-) diff --git a/nidx/nidx_text/src/lib.rs b/nidx/nidx_text/src/lib.rs index 771811ea09..40e152a73f 100644 --- a/nidx/nidx_text/src/lib.rs +++ b/nidx/nidx_text/src/lib.rs @@ -85,7 +85,7 @@ pub struct FieldUid { } // Unique id for a field, equivalent to {rid}/{field_type}/{field_id}[/{split}]/{paragraph_start}-{paragraph_end} -#[derive(Clone, PartialEq, Eq, Hash, PartialOrd, Ord)] +#[derive(Clone, Debug, PartialEq, Eq, Hash)] pub struct ParagraphUid { pub rid: String, pub field_type: String, @@ -273,55 +273,3 @@ impl Display for ParagraphUid { } } } - -#[cfg(test)] -mod tests { - use super::*; - - #[test] - fn test_paragraph_uid_sorting() { - let mut paragraphs = [ - ParagraphUid { - rid: "rid".to_string(), - field_type: "a".to_string(), - field_name: "title".to_string(), - split: None, - paragraph_start: 400, - 
paragraph_end: 500, - }, - ParagraphUid { - rid: "rid".to_string(), - field_type: "a".to_string(), - field_name: "title".to_string(), - split: None, - paragraph_start: 501, - paragraph_end: 555, - }, - ParagraphUid { - rid: "arid".to_string(), - field_type: "a".to_string(), - field_name: "title".to_string(), - split: None, - paragraph_start: 1000, - paragraph_end: 1020, - }, - ParagraphUid { - rid: "rid".to_string(), - field_type: "a".to_string(), - field_name: "title".to_string(), - split: None, - paragraph_start: 0, - paragraph_end: 20, - }, - ]; - paragraphs.sort(); - assert_eq!(paragraphs[0].rid, "arid"); - assert_eq!(paragraphs[0].paragraph_start, 1000); - assert_eq!(paragraphs[1].rid, "rid"); - assert_eq!(paragraphs[1].paragraph_start, 0); - assert_eq!(paragraphs[2].rid, "rid"); - assert_eq!(paragraphs[2].paragraph_start, 400); - assert_eq!(paragraphs[3].rid, "rid"); - assert_eq!(paragraphs[3].paragraph_start, 501); - } -} diff --git a/nidx/nidx_text/src/reader.rs b/nidx/nidx_text/src/reader.rs index 30beb30e43..977b7fd78b 100644 --- a/nidx/nidx_text/src/reader.rs +++ b/nidx/nidx_text/src/reader.rs @@ -555,17 +555,13 @@ impl TextReaderService { split: parts.get(3).map(|x| x.to_string()), }; - if let Some(paragraphs) = field_paragraph_ids.remove(&field_uid) { + if let Some(paragraph_ids) = field_paragraph_ids.remove(&field_uid) { // iterate the text by unicode characters only once, reusing the same iterator for // all paragraphs on the field. 
This is more useful for multiple paragraphs per // field on a large text - let mut paragraph_chars = text.chars(); - let mut skip = 0; - for paragraph_id in paragraphs.into_iter().sorted() { - skip = paragraph_id.paragraph_start as usize - skip; - let take = (paragraph_id.paragraph_end - paragraph_id.paragraph_start) as usize; - let paragraph_text = paragraph_chars.by_ref().skip(skip).take(take).collect(); - paragraphs_text.insert(paragraph_id, Some(paragraph_text)); + let mut paragraphs = Self::extract_paragraphs(paragraph_ids.into_iter(), text.chars()); + for (k, v) in paragraphs.drain() { + paragraphs_text.insert(k, v); } } } @@ -600,6 +596,25 @@ impl TextReaderService { let results = searcher.search(&query, &collector)?; Ok(results) } + + fn extract_paragraphs( + ids: impl Iterator, + mut text: std::str::Chars<'_>, + ) -> HashMap> { + let mut skip = 0; + + let mut paragraphs = HashMap::new(); + + for paragraph_id in ids.sorted_by_key(|id| (id.paragraph_start, id.paragraph_end)) { + skip = paragraph_id.paragraph_start as usize - skip; + let take = (paragraph_id.paragraph_end - paragraph_id.paragraph_start) as usize; + let paragraph = text.by_ref().skip(skip).take(take).collect(); + skip = paragraph_id.paragraph_end as usize; + paragraphs.insert(paragraph_id, Some(paragraph)); + } + + paragraphs + } } pub struct BatchProducer { @@ -663,3 +678,46 @@ impl Iterator for BatchProducer { Some(items) } } + +#[cfg(test)] +mod tests { + use super::*; + + #[test] + fn extract_paragraphs_using_a_single_iterator() { + let text = "This is my test text"; + let word_positions = [(0, 4), (5, 7), (8, 10), (11, 15), (16, 20)]; + let words: Vec = word_positions + .into_iter() + .map(|(start, end)| ParagraphUid { + rid: "rid".to_string(), + field_type: "a".to_string(), + field_name: "title".to_string(), + split: None, + paragraph_start: start, + paragraph_end: end, + }) + .collect(); + let paragraphs = TextReaderService::extract_paragraphs( + [ + words[3].clone(), + 
words[1].clone(), + words[4].clone(), + words[0].clone(), + words[2].clone(), + ] + .into_iter(), + text.chars(), + ); + assert_eq!( + paragraphs, + HashMap::from_iter([ + (words[0].clone(), Some("This".to_string())), + (words[1].clone(), Some("is".to_string())), + (words[2].clone(), Some("my".to_string())), + (words[3].clone(), Some("test".to_string())), + (words[4].clone(), Some("text".to_string())), + ]) + ); + } +} From 010a80d90c43ef0beb17d4a82314514e1b2f2d8d Mon Sep 17 00:00:00 2001 From: Joan Antoni RE Date: Fri, 17 Apr 2026 10:57:17 +0200 Subject: [PATCH 6/7] Overlapping paragraph extraction --- nidx/nidx_text/src/reader.rs | 91 +++++++++++++++++++++++++++++++++--- 1 file changed, 84 insertions(+), 7 deletions(-) diff --git a/nidx/nidx_text/src/reader.rs b/nidx/nidx_text/src/reader.rs index 977b7fd78b..7536b2e7aa 100644 --- a/nidx/nidx_text/src/reader.rs +++ b/nidx/nidx_text/src/reader.rs @@ -601,16 +601,62 @@ impl TextReaderService { ids: impl Iterator, mut text: std::str::Chars<'_>, ) -> HashMap> { + let mut paragraphs = HashMap::new(); + + // sort paragraph_ids by (start, end) to avoid the need of already read chars from the text + let mut ids = ids.sorted_by_key(|id| (id.paragraph_start, id.paragraph_end)); + + let Some(first) = ids.next() else { + return paragraphs; + }; + let mut window = std::ops::Range { + start: first.paragraph_start, + end: first.paragraph_end, + }; + let mut window_paragraphs = vec![first]; + let mut skip = 0; - let mut paragraphs = HashMap::new(); + for paragraph_id in ids { + if paragraph_id.paragraph_start < window.end { + // This paragraph overlaps with the window. We can't be sure if there will be more + // in the future, so we widen the window and continue + window.end = std::cmp::max(window.end, paragraph_id.paragraph_end); + window_paragraphs.push(paragraph_id); + } else { + // A non-overlapping paragraph means we won't find any other paragraph that needs + // the text from the window. 
We then read the window and extract the paragraphs + skip = window.start - skip; + let take = window.end - window.start; + let chunk: Vec = text.by_ref().skip(skip as usize).take(take as usize).collect(); + skip = window.end; + + for id in window_paragraphs.drain(..) { + let start = (id.paragraph_start - window.start) as usize; + let end = (id.paragraph_end - window.start) as usize; + let paragraph: String = chunk[start..end].iter().collect(); + paragraphs.insert(id, Some(paragraph)); + } + + // As the new paragraph could overlap with future ones, we reset the window with it + window = std::ops::Range { + start: paragraph_id.paragraph_start, + end: paragraph_id.paragraph_end, + }; + window_paragraphs.push(paragraph_id); + } + } + + // with no more paragraphs, we can finish with the window + skip = window.start - skip; + let take = window.end - window.start; + let chunk: Vec = text.by_ref().skip(skip as usize).take(take as usize).collect(); - for paragraph_id in ids.sorted_by_key(|id| (id.paragraph_start, id.paragraph_end)) { - skip = paragraph_id.paragraph_start as usize - skip; - let take = (paragraph_id.paragraph_end - paragraph_id.paragraph_start) as usize; - let paragraph = text.by_ref().skip(skip).take(take).collect(); - skip = paragraph_id.paragraph_end as usize; - paragraphs.insert(paragraph_id, Some(paragraph)); + for id in window_paragraphs.drain(..) 
{ + let start = (id.paragraph_start - window.start) as usize; + let end = (id.paragraph_end - window.start) as usize; + let paragraph: String = chunk[start..end].iter().collect(); + paragraphs.insert(id, Some(paragraph)); } paragraphs @@ -698,12 +744,39 @@ mod tests { paragraph_end: end, }) .collect(); + + // longer paragraphs overlapping with the words above + let overlapping_positions = [ + (0, 7), + // intersects with the above but has content outside + (5, 15), + // same as above + (5, 15), + // subset of the above + (8, 15), + ]; + let overlapping: Vec = overlapping_positions + .into_iter() + .map(|(start, end)| ParagraphUid { + rid: "rid".to_string(), + field_type: "a".to_string(), + field_name: "title".to_string(), + split: None, + paragraph_start: start, + paragraph_end: end, + }) + .collect(); + let paragraphs = TextReaderService::extract_paragraphs( [ + overlapping[2].clone(), words[3].clone(), + overlapping[3].clone(), words[1].clone(), + overlapping[0].clone(), words[4].clone(), words[0].clone(), + overlapping[1].clone(), words[2].clone(), ] .into_iter(), @@ -713,8 +786,12 @@ mod tests { paragraphs, HashMap::from_iter([ (words[0].clone(), Some("This".to_string())), + (overlapping[0].clone(), Some("This is".to_string())), (words[1].clone(), Some("is".to_string())), + (overlapping[1].clone(), Some("is my test".to_string())), + (overlapping[2].clone(), Some("is my test".to_string())), (words[2].clone(), Some("my".to_string())), + (overlapping[3].clone(), Some("my test".to_string())), (words[3].clone(), Some("test".to_string())), (words[4].clone(), Some("text".to_string())), ]) From 3c9931bf14e7d8f0756d49734e15ec57dce73480 Mon Sep 17 00:00:00 2001 From: Joan Antoni RE Date: Mon, 20 Apr 2026 10:40:19 +0200 Subject: [PATCH 7/7] Clamp end of paragraph --- nidx/nidx_text/src/reader.rs | 23 +++++++++++++++++++++-- 1 file changed, 21 insertions(+), 2 deletions(-) diff --git a/nidx/nidx_text/src/reader.rs b/nidx/nidx_text/src/reader.rs index 7536b2e7aa..f1326c5f52 
100644 --- a/nidx/nidx_text/src/reader.rs +++ b/nidx/nidx_text/src/reader.rs @@ -633,7 +633,8 @@ impl TextReaderService { for id in window_paragraphs.drain(..) { - let start = (id.paragraph_start - window.start) as usize; - let end = (id.paragraph_end - window.start) as usize; + // clamp both bounds to chunk size, we don't have more text + let start = std::cmp::min((id.paragraph_start - window.start) as usize, chunk.len()); + let end = std::cmp::min((id.paragraph_end - window.start) as usize, chunk.len()); let paragraph: String = chunk[start..end].iter().collect(); paragraphs.insert(id, Some(paragraph)); } @@ -654,7 +655,7 @@ impl TextReaderService { for id in window_paragraphs.drain(..) { - let start = (id.paragraph_start - window.start) as usize; - let end = (id.paragraph_end - window.start) as usize; + let start = std::cmp::min((id.paragraph_start - window.start) as usize, chunk.len()); + let end = std::cmp::min((id.paragraph_end - window.start) as usize, chunk.len()); let paragraph: String = chunk[start..end].iter().collect(); paragraphs.insert(id, Some(paragraph)); } @@ -767,16 +768,31 @@ mod tests { }) .collect(); + let out_of_bounds: Vec = [(8, 100), (16, 100), (200, 300)] + .into_iter() + .map(|(start, end)| ParagraphUid { + rid: "rid".to_string(), + field_type: "a".to_string(), + field_name: "title".to_string(), + split: None, + paragraph_start: start, + paragraph_end: end, + }) + .collect(); + let paragraphs = TextReaderService::extract_paragraphs( [ overlapping[2].clone(), words[3].clone(), overlapping[3].clone(), + out_of_bounds[2].clone(), words[1].clone(), overlapping[0].clone(), + out_of_bounds[1].clone(), words[4].clone(), words[0].clone(), overlapping[1].clone(), + out_of_bounds[0].clone(), words[2].clone(), ] .into_iter(), @@ -792,8 +808,11 @@ mod tests { (overlapping[2].clone(), Some("is my test".to_string())), (words[2].clone(), Some("my".to_string())), (overlapping[3].clone(), Some("my test".to_string())), + (out_of_bounds[0].clone(), Some("my test text".to_string())), (words[3].clone(), Some("test".to_string())), (words[4].clone(), Some("text".to_string())), + (out_of_bounds[1].clone(), Some("text".to_string())), + 
(out_of_bounds[2].clone(), Some("".to_string())), ]) ); }