Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
89 changes: 87 additions & 2 deletions src/recallforge/storage/indexing_ops.py
Original file line number Diff line number Diff line change
Expand Up @@ -195,6 +195,28 @@ def _generate_media_tags(self, embed_func, source_text: str, media_kind: str) ->
logger.debug("index_%s: tag generation returned no usable tags", media_kind)
return tags

def _build_parent_summary(self, parts: List[str], *, fallback: str = "", max_chars: int = 4000) -> str:
    """Build a compact persisted parent summary/body from existing derived text.

    Every candidate is coerced to ``str``, stripped, and has internal
    whitespace runs collapsed to a single space. Empty candidates and
    case-insensitive duplicates are skipped. Collection stops as soon as two
    excerpts have been kept or the kept excerpts total at least 600
    characters. If no candidate yields an excerpt, ``fallback`` is normalized
    the same way and used instead.

    Args:
        parts: Candidate texts in priority order. ``None`` (for the whole
            argument or for individual entries) and other falsy entries are
            ignored.
        fallback: Text used only when ``parts`` produced no excerpt. It is
            coerced with ``str`` so a non-string title does not raise.
        max_chars: Upper bound on the length of the result. ``None`` disables
            truncation; zero or negative values yield an empty string.

    Returns:
        The kept excerpts joined by a blank line, truncated to ``max_chars``
        and stripped of surrounding whitespace. May be empty.
    """
    max_excerpts = 2
    soft_char_budget = 600

    def _normalize(value: object) -> str:
        # One normalization path for both parts and fallback keeps them consistent.
        return re.sub(r"\s+", " ", str(value or "").strip())

    excerpts: List[str] = []
    seen = set()
    kept_chars = 0

    for raw in parts or ():
        text = _normalize(raw)
        if not text:
            continue
        lowered = text.lower()
        if lowered in seen:
            continue
        seen.add(lowered)
        excerpts.append(text)
        kept_chars += len(text)
        if len(excerpts) >= max_excerpts or kept_chars >= soft_char_budget:
            break

    if not excerpts:
        fallback_text = _normalize(fallback)
        if fallback_text:
            excerpts.append(fallback_text)

    # A negative bound would slice from the end of the string; clamp it to zero.
    limit = None if max_chars is None else max(0, max_chars)
    return "\n\n".join(excerpts)[:limit].strip()

def index_document(
self,
path: str,
Expand Down Expand Up @@ -1211,8 +1233,10 @@ def index_video(
frame_paths=frame_paths,
enabled=caption_media,
)
parts = [part for part in (video_caption, transcript_summary) if part]
video_body = "\n\n".join(parts)[:4000]
video_body = self._build_parent_summary(
[video_caption, transcript_summary],
fallback=resolved_title,
)
video_tag_backend = self._select_generation_backend(embed_video_func, embed_image_func)
video_tags = (
self._generate_media_tags(video_tag_backend, video_body, "video")
Expand Down Expand Up @@ -1274,6 +1298,35 @@ def index_video(
actual_path,
e,
)
if video_body:
try:
summary_vector = embed_text_func(video_body)
self._backend.insert_embedding(
content_hash=content_hash,
seq=0,
pos=0,
vector=summary_vector.tolist() if hasattr(summary_vector, "tolist") else list(summary_vector),
model=model,
collection=collection,
file_path=logical_path,
title=resolved_title,
text_body=video_body,
content_type="video",
user_id=user_id,
session_id=session_id,
project_id=project_id,
profile=profile,
memory_role="root",
memory_root_path=logical_path,
tags=video_tags or None,
)
indexed_video_embeddings = 1
except Exception as summary_exc:
logger.warning(
"index_video: fallback summary embedding failed for %s: %s",
actual_path,
summary_exc,
)

indexed_frames = 0
indexed_transcripts = 0
Expand Down Expand Up @@ -1388,6 +1441,38 @@ def index_document_file(
memory_root_path=logical_path,
)

document_summary = self._build_parent_summary(
[section.text for section in artifacts.sections],
fallback=document_title,
)
if document_summary:
try:
root_vector = embed_func(document_summary)
self._backend.insert_embedding(
content_hash=document_hash,
seq=0,
pos=0,
vector=root_vector.tolist() if hasattr(root_vector, "tolist") else list(root_vector),
model=model,
collection=collection,
file_path=logical_path,
title=document_title,
text_body=document_summary,
content_type=artifacts.document_type,
user_id=user_id,
session_id=session_id,
project_id=project_id,
profile=profile,
memory_role="root",
memory_root_path=logical_path,
)
except Exception as exc:
logger.warning(
"index_document_file: root summary embedding failed for %s; continuing with child assets: %s",
actual_path,
exc,
)

# Track temp dirs from PDF vision fallback for cleanup
_temp_dirs_to_clean: set = set()

Expand Down
75 changes: 75 additions & 0 deletions tests/test_storage.py
Original file line number Diff line number Diff line change
Expand Up @@ -1447,9 +1447,21 @@ def failing_video_embed(_path: str):
for row in child_embedding_rows:
self.assertEqual(json.loads(row.get("tags") or "[]"), expected_tags)

root_embedding_rows = self.backend._embeddings_table.search().where(
f"collection = 'test' AND file_path = '{logical_path}'"
).to_list()
self.assertEqual(len(root_embedding_rows), 1)
self.assertIn("Technical explainer video", root_embedding_rows[0].get("text_body") or "")
self.assertEqual(json.loads(root_embedding_rows[0].get("tags") or "[]"), expected_tags)

memories = self.backend.list_memories(collection="test", limit=10)
self.assertEqual(len(memories), 1)
self.assertTrue((memories[0].get("summary") or "").startswith("Technical explainer video"))

memory = self.backend.get_memory(path=logical_path, collection="test")
self.assertIsNotNone(memory)
self.assertEqual(memory["tags"], expected_tags)
self.assertTrue((memory.get("summary") or "").startswith("Technical explainer video"))

def test_index_document_file_creates_root_memory_and_links_sections(self):
document_path = os.path.join(self.temp_dir, "report.pdf")
Expand Down Expand Up @@ -1492,6 +1504,69 @@ def test_index_document_file_creates_root_memory_and_links_sections(self):
self.assertEqual(child_doc.memory_role, "child")
self.assertEqual(child_doc.memory_root_path, logical_path)

root_embedding_rows = self.backend._embeddings_table.search().where(
f"collection = 'test' AND file_path = '{logical_path}'"
).to_list()
self.assertEqual(len(root_embedding_rows), 1)
self.assertIn("Budget and launch notes", root_embedding_rows[0].get("text_body") or "")

memory = self.backend.get_memory(path=logical_path, collection="test")
self.assertIsNotNone(memory)
self.assertTrue((memory.get("summary") or "").startswith("Budget and launch notes"))

def test_index_document_file_continues_when_root_summary_embedding_fails(self):
    """A failing root-summary embed must not stop child sections from being indexed.

    The extractor is patched to return two text sections, and the embed
    function raises only for input that contains both section texts. The
    test then checks that both sections are reported as indexed, that the
    first child document can be found, and that no embedding row exists
    for the root path.
    """
    document_path = os.path.join(self.temp_dir, "notes.pdf")
    logical_path = str(Path(document_path).expanduser().resolve())
    with open(document_path, "wb") as pdf_file:
        pdf_file.write(b"%PDF-1.4 mock")

    first_text = "First section about memory retrieval."
    second_text = "Second section about multimodal evidence."

    section_one = SimpleNamespace(
        logical_path=f"{logical_path}::section:0001",
        title="notes section 1",
        text=first_text,
        section_type="section",
        index=1,
        content_type="text",
        image_path=None,
    )
    section_two = SimpleNamespace(
        logical_path=f"{logical_path}::section:0002",
        title="notes section 2",
        text=second_text,
        section_type="section",
        index=2,
        content_type="text",
        image_path=None,
    )
    fake_artifacts = SimpleNamespace(
        document_type="pdf",
        extractor="unit-test",
        sections=[section_one, section_two],
    )

    def embed_except_summary(text: str):
        # Raise only when both section texts are present in one input;
        # presumably that is the combined root summary (confirm against
        # how index_document_file embeds individual sections).
        contains_both = first_text in text and second_text in text
        if not contains_both:
            return mock_embed(text)
        raise RuntimeError("summary embed failed")

    extractor_patch = patch(
        "recallforge.storage.indexing_ops.extract_document_artifacts",
        return_value=fake_artifacts,
    )
    with extractor_patch:
        outcome = self.backend.index_document_file(
            path=document_path,
            collection="test",
            embed_func=embed_except_summary,
            model="mock-embedder",
        )

    # Child indexing is unaffected by the root failure.
    self.assertEqual(outcome["indexed_sections"], 2)
    first_child = self.backend.find_document("test", f"{logical_path}::section:0001")
    self.assertIsNotNone(first_child)

    # No embedding row may exist for the root path itself.
    root_filter = f"collection = 'test' AND file_path = '{logical_path}'"
    root_rows = self.backend._embeddings_table.search().where(root_filter).to_list()
    self.assertEqual(len(root_rows), 0)

def test_index_document_file_preserves_ocr_text_for_image_only_pages(self):
document_path = os.path.join(self.temp_dir, "scan.pdf")
logical_path = str(Path(document_path).expanduser().resolve())
Expand Down
Loading