def _build_parent_summary(
    self,
    parts: List[str],
    *,
    fallback: str = "",
    max_chars: int = 4000,
    max_parts: int = 2,
    soft_char_budget: int = 600,
) -> str:
    """Build a compact persisted parent summary/body from existing derived text.

    Each entry of ``parts`` is stringified, stripped and has every run of
    whitespace collapsed to a single space. Empty entries and
    case-insensitive duplicates are skipped. Collection stops as soon as
    ``max_parts`` excerpts were kept or their combined length reaches
    ``soft_char_budget``; at least one usable excerpt is always kept.

    Args:
        parts: Candidate texts in priority order. ``None`` (for the whole
            argument or for single entries) is treated as "no text".
        fallback: Text used only when ``parts`` yields no excerpt. ``None``
            is tolerated and treated as empty.
        max_chars: Hard cap on the length of the returned string. Values
            below zero are treated as zero.
        max_parts: Maximum number of excerpts to keep.
        soft_char_budget: Stop collecting once the kept excerpts contain at
            least this many characters in total.

    Returns:
        The kept excerpts joined by a blank line, truncated to
        ``max_chars`` and stripped; ``""`` when nothing usable was found.
    """

    def _squash(value) -> str:
        # ``value or ""`` maps None (and other falsy values) to "" so that
        # neither a missing part nor a missing fallback can raise.
        return re.sub(r"\s+", " ", str(value or "").strip())

    excerpts: List[str] = []
    seen = set()
    collected_chars = 0

    for raw in parts or ():
        text = _squash(raw)
        if not text:
            continue
        key = text.lower()
        if key in seen:
            continue
        seen.add(key)
        excerpts.append(text)
        collected_chars += len(text)
        if len(excerpts) >= max_parts or collected_chars >= soft_char_budget:
            break

    if not excerpts:
        fallback_text = _squash(fallback)
        if fallback_text:
            excerpts.append(fallback_text)

    # Clamp the cap: a negative slice bound would cut characters off the end
    # instead of limiting the length.
    return "\n\n".join(excerpts)[: max(0, max_chars)].strip()
text_body=video_body, + content_type="video", + user_id=user_id, + session_id=session_id, + project_id=project_id, + profile=profile, + memory_role="root", + memory_root_path=logical_path, + tags=video_tags or None, + ) + indexed_video_embeddings = 1 + except Exception as summary_exc: + logger.warning( + "index_video: fallback summary embedding failed for %s: %s", + actual_path, + summary_exc, + ) indexed_frames = 0 indexed_transcripts = 0 @@ -1388,6 +1441,38 @@ def index_document_file( memory_root_path=logical_path, ) + document_summary = self._build_parent_summary( + [section.text for section in artifacts.sections], + fallback=document_title, + ) + if document_summary: + try: + root_vector = embed_func(document_summary) + self._backend.insert_embedding( + content_hash=document_hash, + seq=0, + pos=0, + vector=root_vector.tolist() if hasattr(root_vector, "tolist") else list(root_vector), + model=model, + collection=collection, + file_path=logical_path, + title=document_title, + text_body=document_summary, + content_type=artifacts.document_type, + user_id=user_id, + session_id=session_id, + project_id=project_id, + profile=profile, + memory_role="root", + memory_root_path=logical_path, + ) + except Exception as exc: + logger.warning( + "index_document_file: root summary embedding failed for %s; continuing with child assets: %s", + actual_path, + exc, + ) + # Track temp dirs from PDF vision fallback for cleanup _temp_dirs_to_clean: set = set() diff --git a/tests/test_storage.py b/tests/test_storage.py index 72ef376..aabfe60 100644 --- a/tests/test_storage.py +++ b/tests/test_storage.py @@ -1447,9 +1447,21 @@ def failing_video_embed(_path: str): for row in child_embedding_rows: self.assertEqual(json.loads(row.get("tags") or "[]"), expected_tags) + root_embedding_rows = self.backend._embeddings_table.search().where( + f"collection = 'test' AND file_path = '{logical_path}'" + ).to_list() + self.assertEqual(len(root_embedding_rows), 1) + self.assertIn("Technical 
def test_index_document_file_continues_when_root_summary_embedding_fails(self):
    """Child sections are still indexed when embedding the root summary raises."""
    document_path = os.path.join(self.temp_dir, "notes.pdf")
    logical_path = str(Path(document_path).expanduser().resolve())
    with open(document_path, "wb") as handle:
        handle.write(b"%PDF-1.4 mock")

    section_texts = (
        "First section about memory retrieval.",
        "Second section about multimodal evidence.",
    )
    sections = [
        SimpleNamespace(
            logical_path=f"{logical_path}::section:{number:04d}",
            title=f"notes section {number}",
            text=body,
            section_type="section",
            index=number,
            content_type="text",
            image_path=None,
        )
        for number, body in enumerate(section_texts, start=1)
    ]
    fake_artifacts = SimpleNamespace(
        document_type="pdf",
        extractor="unit-test",
        sections=sections,
    )

    def embed_except_summary(text: str):
        # Only a text containing both section bodies is rejected; the test
        # relies on that being the root summary and on each section being
        # embedded separately.
        if all(body in text for body in section_texts):
            raise RuntimeError("summary embed failed")
        return mock_embed(text)

    extractor_target = "recallforge.storage.indexing_ops.extract_document_artifacts"
    with patch(extractor_target, return_value=fake_artifacts):
        result = self.backend.index_document_file(
            path=document_path,
            collection="test",
            embed_func=embed_except_summary,
            model="mock-embedder",
        )

    self.assertEqual(result["indexed_sections"], 2)
    first_child_path = f"{logical_path}::section:0001"
    self.assertIsNotNone(self.backend.find_document("test", first_child_path))

    # No embedding row may exist for the root path after the failed summary embed.
    root_filter = f"collection = 'test' AND file_path = '{logical_path}'"
    root_embedding_rows = self.backend._embeddings_table.search().where(root_filter).to_list()
    self.assertEqual(len(root_embedding_rows), 0)