brianmeyer · brianmeyer · Mar 27, 2026 · Mar 23, 2026
diff --git a/src/recallforge/storage/indexing_ops.py b/src/recallforge/storage/indexing_ops.py
@@ -195,6 +195,28 @@ def _generate_media_tags(self, embed_func, source_text: str, media_kind: str) ->
             logger.debug("index_%s: tag generation returned no usable tags", media_kind)
         return tags
 
+    def _build_parent_summary(self, parts: List[str], *, fallback: str = "", max_chars: int = 4000) -> str:
+        """Return a short parent summary/body: the first unique non-empty parts, or ``fallback`` if none."""
+        whitespace_run = re.compile(r"\s+")
+        chosen: List[str] = []
+        seen_keys: set[str] = set()
+        total_chars = 0
+        for candidate in parts:
+            normalized = whitespace_run.sub(" ", str(candidate or "").strip())
+            key = normalized.lower()  # duplicates are detected case-insensitively
+            if not normalized or key in seen_keys:
+                continue
+            seen_keys.add(key)
+            chosen.append(normalized)
+            total_chars += len(normalized)
+            # Stop once two excerpts, or at least 600 characters, have been collected.
+            if len(chosen) >= 2 or total_chars >= 600:
+                break
+        if fallback and not chosen:  # nothing usable in parts: use the caller-supplied text
+            chosen.append(whitespace_run.sub(" ", fallback.strip()))
+        # Excerpts are joined as blank-line separated paragraphs and capped at max_chars.
+        return "\n\n".join(chosen)[:max_chars].strip()
+
     def index_document(
         self,
         path: str,
@@ -1211,8 +1233,10 @@ def index_video(
             frame_paths=frame_paths,
             enabled=caption_media,
         )
-        parts = [part for part in (video_caption, transcript_summary) if part]
-        video_body = "\n\n".join(parts)[:4000]
+        video_body = self._build_parent_summary(
+            [video_caption, transcript_summary],
+            fallback=resolved_title,
+        )
         video_tag_backend = self._select_generation_backend(embed_video_func, embed_image_func)
         video_tags = (
             self._generate_media_tags(video_tag_backend, video_body, "video")
@@ -1274,6 +1298,35 @@ def index_video(
                 actual_path,
                 e,
             )
+            if video_body:
+                try:
+                    summary_vector = embed_text_func(video_body)
+                    self._backend.insert_embedding(
+                        content_hash=content_hash,
+                        seq=0,
+                        pos=0,
+                        vector=summary_vector.tolist() if hasattr(summary_vector, "tolist") else list(summary_vector),
+                        model=model,
+                        collection=collection,
+                        file_path=logical_path,
+                        title=resolved_title,
+                        text_body=video_body,
+                        content_type="video",
+                        user_id=user_id,
+                        session_id=session_id,
+                        project_id=project_id,
+                        profile=profile,
+                        memory_role="root",
+                        memory_root_path=logical_path,
+                        tags=video_tags or None,
+                    )
+                    indexed_video_embeddings = 1
+                except Exception as summary_exc:
+                    logger.warning(
+                        "index_video: fallback summary embedding failed for %s: %s",
+                        actual_path,
+                        summary_exc,
+                    )
 
         indexed_frames = 0
         indexed_transcripts = 0
@@ -1388,6 +1441,38 @@ def index_document_file(
             memory_root_path=logical_path,
         )
 
+        document_summary = self._build_parent_summary(
+            [section.text for section in artifacts.sections],
+            fallback=document_title,
+        )
+        if document_summary:
+            try:
+                root_vector = embed_func(document_summary)
+                self._backend.insert_embedding(
+                    content_hash=document_hash,
+                    seq=0,
+                    pos=0,
+                    vector=root_vector.tolist() if hasattr(root_vector, "tolist") else list(root_vector),
+                    model=model,
+                    collection=collection,
+                    file_path=logical_path,
+                    title=document_title,
+                    text_body=document_summary,
+                    content_type=artifacts.document_type,
+                    user_id=user_id,
+                    session_id=session_id,
+                    project_id=project_id,
+                    profile=profile,
+                    memory_role="root",
+                    memory_root_path=logical_path,
+                )
+            except Exception as exc:
+                logger.warning(
+                    "index_document_file: root summary embedding failed for %s; continuing with child assets: %s",
+                    actual_path,
+                    exc,
+                )
+
         # Track temp dirs from PDF vision fallback for cleanup
         _temp_dirs_to_clean: set = set()
 

diff --git a/tests/test_storage.py b/tests/test_storage.py
@@ -1447,9 +1447,21 @@ def failing_video_embed(_path: str):
         for row in child_embedding_rows:
             self.assertEqual(json.loads(row.get("tags") or "[]"), expected_tags)
 
+        root_embedding_rows = self.backend._embeddings_table.search().where(
+            f"collection = 'test' AND file_path = '{logical_path}'"
+        ).to_list()
+        self.assertEqual(len(root_embedding_rows), 1)
+        self.assertIn("Technical explainer video", root_embedding_rows[0].get("text_body") or "")
+        self.assertEqual(json.loads(root_embedding_rows[0].get("tags") or "[]"), expected_tags)
+
+        memories = self.backend.list_memories(collection="test", limit=10)
+        self.assertEqual(len(memories), 1)
+        self.assertTrue((memories[0].get("summary") or "").startswith("Technical explainer video"))
+
         memory = self.backend.get_memory(path=logical_path, collection="test")
         self.assertIsNotNone(memory)
         self.assertEqual(memory["tags"], expected_tags)
+        self.assertTrue((memory.get("summary") or "").startswith("Technical explainer video"))
 
     def test_index_document_file_creates_root_memory_and_links_sections(self):
         document_path = os.path.join(self.temp_dir, "report.pdf")
@@ -1492,6 +1504,69 @@ def test_index_document_file_creates_root_memory_and_links_sections(self):
         self.assertEqual(child_doc.memory_role, "child")
         self.assertEqual(child_doc.memory_root_path, logical_path)
 
+        root_embedding_rows = self.backend._embeddings_table.search().where(
+            f"collection = 'test' AND file_path = '{logical_path}'"
+        ).to_list()
+        self.assertEqual(len(root_embedding_rows), 1)
+        self.assertIn("Budget and launch notes", root_embedding_rows[0].get("text_body") or "")
+
+        memory = self.backend.get_memory(path=logical_path, collection="test")
+        self.assertIsNotNone(memory)
+        self.assertTrue((memory.get("summary") or "").startswith("Budget and launch notes"))
+
+    def test_index_document_file_continues_when_root_summary_embedding_fails(self):
+        # A failing root-summary embed must not prevent the section documents from being indexed.
+        first_text = "First section about memory retrieval."
+        second_text = "Second section about multimodal evidence."
+        pdf_path = os.path.join(self.temp_dir, "notes.pdf")
+        root_path = str(Path(pdf_path).expanduser().resolve())
+        with open(pdf_path, "wb") as handle:
+            handle.write(b"%PDF-1.4 mock")
+
+        def make_section(number: int, text: str):
+            # Build one fake extracted-section namespace for the patched extractor.
+            return SimpleNamespace(
+                logical_path=f"{root_path}::section:{number:04d}",
+                title=f"notes section {number}",
+                text=text,
+                section_type="section",
+                index=number,
+                content_type="text",
+                image_path=None,
+            )
+
+        stub_artifacts = SimpleNamespace(
+            document_type="pdf",
+            extractor="unit-test",
+            sections=[make_section(1, first_text), make_section(2, second_text)],
+        )
+
+        def embed_except_summary(text: str):
+            # Fail only for text that contains both section bodies (the joined root summary).
+            if first_text in text and second_text in text:
+                raise RuntimeError("summary embed failed")
+            return mock_embed(text)
+
+        with patch("recallforge.storage.indexing_ops.extract_document_artifacts", return_value=stub_artifacts):
+            outcome = self.backend.index_document_file(
+                path=pdf_path,
+                collection="test",
+                embed_func=embed_except_summary,
+                model="mock-embedder",
+            )
+
+        # Both sections are still reported as indexed and the first child is retrievable.
+        self.assertEqual(outcome["indexed_sections"], 2)
+        self.assertIsNotNone(self.backend.find_document("test", f"{root_path}::section:0001"))
+
+        # No embedding row may exist for the root path itself.
+        root_rows = (
+            self.backend._embeddings_table.search()
+            .where(f"collection = 'test' AND file_path = '{root_path}'")
+            .to_list()
+        )
+        self.assertEqual(len(root_rows), 0)
+
     def test_index_document_file_preserves_ocr_text_for_image_only_pages(self):
         document_path = os.path.join(self.temp_dir, "scan.pdf")
         logical_path = str(Path(document_path).expanduser().resolve())