Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
17 changes: 16 additions & 1 deletion src/recallforge/search.py
Original file line number Diff line number Diff line change
Expand Up @@ -1530,6 +1530,18 @@ def search_batch(
if not batch_queries:
return []

def _merge_tags(items: List[Any]) -> Optional[List[str]]:
merged: List[str] = []
seen: set[str] = set()
for item in items:
for tag in getattr(item, "tags", None) or []:
cleaned = str(tag or "").strip().lower()
if not cleaned or cleaned in seen:
continue
seen.add(cleaned)
merged.append(cleaned)
return merged or None

def run_single_query(q: BatchQuery) -> List[tuple]:
"""Run a single query and return (result, score) tuples."""
mode = q.mode or "hybrid"
Expand Down Expand Up @@ -1582,11 +1594,14 @@ def run_single_query(q: BatchQuery) -> List[tuple]:
if filepath not in merged:
merged[filepath] = {
'result': result,
'results': [result],
'rrf_score': 0.0,
'query_indices': set(),
'query_scores': {},
'best_score': 0.0,
}
else:
merged[filepath]['results'].append(result)
Comment on lines +1603 to +1604
Copy link
Copy Markdown

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

P2 Badge Preserve query order when merging duplicate result tags

The new tag-merge path appends duplicate hits in whatever order `all_results` is iterated, but `all_results` is populated via `as_completed`, so completion timing can reorder queries between runs. In batches where multiple queries hit the same filepath with different tags, `tags=_merge_tags(data['results'])` will therefore produce different tag orders for identical inputs, which breaks reproducibility and can make deterministic-order assertions flaky.

Useful? React with 👍 / 👎.


# RRF contribution: rank-based, not insertion-order-based
merged[filepath]['rrf_score'] += weight / (rrf_k + rank + 1)
Expand All @@ -1612,7 +1627,7 @@ def run_single_query(q: BatchQuery) -> List[tuple]:
score=data['rrf_score'],
source=','.join(str(i) for i in sorted(data['query_indices'])),
query_scores=data['query_scores'],
tags=getattr(result, "tags", None),
tags=_merge_tags(data['results']),
))

final_results.sort(key=lambda x: x.score, reverse=True)
Expand Down
4 changes: 4 additions & 0 deletions src/recallforge/storage/indexing_ops.py
Original file line number Diff line number Diff line change
Expand Up @@ -132,6 +132,10 @@ def _parse_generated_media_tags(self, raw: str) -> List[str]:
if not text:
return []

fenced_match = re.match(r"^```(?:[A-Za-z0-9_+-]+)?\s*\n?(.*?)\n?```$", text, flags=re.DOTALL)
if fenced_match:
text = fenced_match.group(1).strip()

candidates: List[str] = []
if text.startswith("[") and text.endswith("]"):
try:
Expand Down
63 changes: 62 additions & 1 deletion tests/test_search_batch.py
Original file line number Diff line number Diff line change
Expand Up @@ -525,6 +525,67 @@ def mock_search(self, query):
self.assertIn(0, results[0].query_scores)
self.assertIn(1, results[0].query_scores)

def test_same_document_merges_tags_deterministically(self):
    """Two queries that hit the same file yield one result with merged tags.

    The merged tag list is expected in stable first-seen order, with the
    tag shared by both hits appearing only once.
    """
    backend = StubBackend()
    storage = StubStorage()

    def make_hit(score, tags):
        # Ad-hoc class whose class attributes stand in for the result fields;
        # only the score and the tags differ between the two canned hits.
        return type('HybridResult', (), {
            'filepath': 'shared.md',
            'display_path': 'shared.md',
            'title': 'shared.md',
            'context': None,
            'hash': 'h1',
            'docid': 'd1',
            'collection': 'test',
            'modified_at': '2026-01-01',
            'body_length': 100,
            'body': 'shared content',
            'score': score,
            'source': 'hybrid',
            'tags': tags,
        })

    per_call_hits = [
        [make_hit(0.8, ['alpha', 'shared'])],
        [make_hit(0.9, ['shared', 'beta'])],
    ]
    calls_made = [0]

    def fake_search(self, query):
        # Hand out the next canned result list on each invocation.
        position = calls_made[0]
        calls_made[0] = position + 1
        return per_call_hits[position]

    init_patch = patch.object(HybridSearcher, '__init__', lambda self, **kwargs: None)
    search_patch = patch.object(HybridSearcher, 'search', fake_search)
    with init_patch, search_patch:
        results = search_batch(
            ["query one", "query two"],
            backend=backend,
            storage=storage,
            limit=10,
        )

    self.assertEqual(len(results), 1)
    merged_hit = results[0]
    self.assertEqual(merged_hit.tags, ["alpha", "shared", "beta"])


if __name__ == "__main__":
unittest.main()
unittest.main()
22 changes: 22 additions & 0 deletions tests/test_storage.py
Original file line number Diff line number Diff line change
Expand Up @@ -1340,6 +1340,28 @@ def test_memory_lookup_surfaces_media_tags(self):
["neural network", "diagram", "hidden layers"],
)

def test_generated_media_tags_strip_fenced_json(self):
    """A JSON tag list wrapped in a Markdown code fence is stored unwrapped."""
    captioner = CaptioningEmbedder()

    def reply_with_fenced_json(_prompt: str, max_tokens: int = 60) -> str:
        # Canned generator output: the tag array inside a ```json fence.
        return '```json\n["diagram", "hidden layers", "neural network"]\n```'

    captioner.generate_text = reply_with_fenced_json

    self.backend.index_image(
        path=self.image_path,
        collection="test",
        embed_func=captioner,
        caption_media=True,
    )

    image_query = self.backend._embeddings_table.search().where("content_type = 'image'")
    image_rows = image_query.to_list()
    self.assertEqual(len(image_rows), 1)

    stored_tags = json.loads(image_rows[0].get("tags") or "[]")
    self.assertEqual(
        stored_tags,
        ["diagram", "hidden layers", "neural network"],
    )

def test_index_video_keeps_parent_memory_and_links_children(self):
embedder = CaptioningEmbedder()
logical_path = str(Path(self.video_path).expanduser().resolve())
Expand Down
Loading