diff --git a/src/recallforge/search.py b/src/recallforge/search.py index a3b87ce..6caf39b 100644 --- a/src/recallforge/search.py +++ b/src/recallforge/search.py @@ -1530,6 +1530,18 @@ def search_batch( if not batch_queries: return [] + def _merge_tags(items: List[Any]) -> Optional[List[str]]: + merged: List[str] = [] + seen: set[str] = set() + for item in items: + for tag in getattr(item, "tags", None) or []: + cleaned = str(tag or "").strip().lower() + if not cleaned or cleaned in seen: + continue + seen.add(cleaned) + merged.append(cleaned) + return merged or None + def run_single_query(q: BatchQuery) -> List[tuple]: """Run a single query and return (result, score) tuples.""" mode = q.mode or "hybrid" @@ -1582,11 +1594,14 @@ def run_single_query(q: BatchQuery) -> List[tuple]: if filepath not in merged: merged[filepath] = { 'result': result, + 'results': [result], 'rrf_score': 0.0, 'query_indices': set(), 'query_scores': {}, 'best_score': 0.0, } + else: + merged[filepath]['results'].append(result) # RRF contribution: rank-based, not insertion-order-based merged[filepath]['rrf_score'] += weight / (rrf_k + rank + 1) @@ -1612,7 +1627,7 @@ def run_single_query(q: BatchQuery) -> List[tuple]: score=data['rrf_score'], source=','.join(str(i) for i in sorted(data['query_indices'])), query_scores=data['query_scores'], - tags=getattr(result, "tags", None), + tags=_merge_tags(data['results']), )) final_results.sort(key=lambda x: x.score, reverse=True) diff --git a/src/recallforge/storage/indexing_ops.py b/src/recallforge/storage/indexing_ops.py index 8537c4a..5c23bd6 100644 --- a/src/recallforge/storage/indexing_ops.py +++ b/src/recallforge/storage/indexing_ops.py @@ -132,6 +132,10 @@ def _parse_generated_media_tags(self, raw: str) -> List[str]: if not text: return [] + fenced_match = re.match(r"^```(?:[A-Za-z0-9_+-]+)?\s*\n?(.*?)\n?```$", text, flags=re.DOTALL) + if fenced_match: + text = fenced_match.group(1).strip() + candidates: List[str] = [] if text.startswith("[") and text.endswith("]"): try: diff --git a/tests/test_search_batch.py b/tests/test_search_batch.py index 2b0ec6b..ea038b6 100644 --- a/tests/test_search_batch.py +++ b/tests/test_search_batch.py @@ -525,6 +525,67 @@ def mock_search(self, query): self.assertIn(0, results[0].query_scores) self.assertIn(1, results[0].query_scores) + def test_same_document_merges_tags_deterministically(self): + """Duplicate hits should merge tag sets in stable first-seen order.""" + backend = StubBackend() + storage = StubStorage() + + results_list = [ + [ + type('HybridResult', (), { + 'filepath': 'shared.md', + 'display_path': 'shared.md', + 'title': 'shared.md', + 'context': None, + 'hash': 'h1', + 'docid': 'd1', + 'collection': 'test', + 'modified_at': '2026-01-01', + 'body_length': 100, + 'body': 'shared content', + 'score': 0.8, + 'source': 'hybrid', + 'tags': ['alpha', 'shared'], + }), + ], + [ + type('HybridResult', (), { + 'filepath': 'shared.md', + 'display_path': 'shared.md', + 'title': 'shared.md', + 'context': None, + 'hash': 'h1', + 'docid': 'd1', + 'collection': 'test', + 'modified_at': '2026-01-01', + 'body_length': 100, + 'body': 'shared content', + 'score': 0.9, + 'source': 'hybrid', + 'tags': ['shared', 'beta'], + }), + ], + ] + + call_idx = [0] + + def mock_search(self, query): + idx = call_idx[0] + call_idx[0] += 1 + return results_list[idx] + + with patch.object(HybridSearcher, '__init__', lambda self, **kwargs: None): + with patch.object(HybridSearcher, 'search', mock_search): + results = search_batch( + ["query one", "query two"], + backend=backend, + storage=storage, + limit=10, + ) + + self.assertEqual(len(results), 1) + self.assertEqual(results[0].tags, ["alpha", "shared", "beta"]) + if __name__ == "__main__": - unittest.main() \ No newline at end of file + unittest.main() diff --git a/tests/test_storage.py b/tests/test_storage.py index 3c990d3..be35123 100644 --- a/tests/test_storage.py +++ b/tests/test_storage.py @@ -1340,6 +1340,28 @@ def test_memory_lookup_surfaces_media_tags(self): ["neural network", "diagram", "hidden layers"], ) + def test_generated_media_tags_strip_fenced_json(self): + embedder = CaptioningEmbedder() + + def fenced_json(_prompt: str, max_tokens: int = 60) -> str: + return '```json\n["diagram", "hidden layers", "neural network"]\n```' + + embedder.generate_text = fenced_json + + self.backend.index_image( + path=self.image_path, + collection="test", + embed_func=embedder, + caption_media=True, + ) + + rows = self.backend._embeddings_table.search().where("content_type = 'image'").to_list() + self.assertEqual(len(rows), 1) + self.assertEqual( + json.loads(rows[0].get("tags") or "[]"), + ["diagram", "hidden layers", "neural network"], + ) + def test_index_video_keeps_parent_memory_and_links_children(self): embedder = CaptioningEmbedder() logical_path = str(Path(self.video_path).expanduser().resolve())