From 6bea2f0acf8843612db8c3030ffc6dd5600bf712 Mon Sep 17 00:00:00 2001
From: Christophe Goudet <goudetdata@gmail.com>
Date: Sat, 4 Apr 2026 14:24:50 +0200
Subject: [PATCH 1/4] minimal endpoint

---
 .env.template                                 |  2 +
 .gitignore                                    |  9 +++--
 eu_fact_force/app/settings.py                 |  7 ++++
 .../data_collection/default_search.json       | 37 +++++++++++++++++++
 eu_fact_force/ingestion/urls.py               |  1 +
 eu_fact_force/ingestion/views.py              | 20 ++++++++++
 6 files changed, 72 insertions(+), 4 deletions(-)
 create mode 100644 eu_fact_force/ingestion/data_collection/default_search.json

diff --git a/.env.template b/.env.template
index 6e776a5..677c235 100644
--- a/.env.template
+++ b/.env.template
@@ -13,6 +13,8 @@ SECRET_KEY=
 # DEBUG: Enable debug mode (tracebacks, detailed errors). Default is true.
 # In production you must set this to false (DEBUG=0 or DEBUG=false).
 DEBUG=1
+# If true (1, true or yes), the search/<keyword>/ endpoint returns a fixed JSON fixture.
+FLAG_RETRIEVE_DEFAULT_JSON=0
 
 # -----------------------------------------------------------------------------
 # Database (required)
diff --git a/.gitignore b/.gitignore
index 1d8cf59..de02f97 100644
--- a/.gitignore
+++ b/.gitignore
@@ -172,8 +172,9 @@ eu_fact_force/ingestion/parsing/output/benchmark_results_extended.csv
 eu_fact_force/ingestion/parsing/output/extraction_scores.csv
 eu_fact_force/ingestion/parsing/output/analysis/
 s3/
-eu_fact_force/exploration/docling/results/html/
-eu_fact_force/exploration/docling/results/json/
-eu_fact_force/exploration/docling/results/md/
+eu_fact_force/exploration/
 annotated_pdf/
-eu_fact_force/exploration/docling/results/annotated_pdf/
+
+# docker volumes
+postgres_data/
+rustfs_data
\ No newline at end of file
diff --git a/eu_fact_force/app/settings.py b/eu_fact_force/app/settings.py
index d094c99..74cd9d7 100644
--- a/eu_fact_force/app/settings.py
+++ b/eu_fact_force/app/settings.py
@@ -187,3 +187,10 @@ def _get_databases():
             "BACKEND": "django.contrib.staticfiles.storage.StaticFilesStorage",
         },
     }
+
+
+FLAG_RETRIEVE_DEFAULT_JSON = os.getenv("FLAG_RETRIEVE_DEFAULT_JSON", "0").lower() in (
+    "true",
+    "1",
+    "yes",
+)
diff --git a/eu_fact_force/ingestion/data_collection/default_search.json b/eu_fact_force/ingestion/data_collection/default_search.json
new file mode 100644
index 0000000..03a6de4
--- /dev/null
+++ b/eu_fact_force/ingestion/data_collection/default_search.json
@@ -0,0 +1,37 @@
+{
+    "status": "success",
+    "narrative": "vaccine_autism",
+    "chunks": [
+        {
+            "type": "text",
+            "content": "...",
+            "score": 0.94,
+            "metadata": {
+                "document_id": "<document_id>",
+                "page": 0
+            }
+        }
+    ],
+    "documents": {
+        "<document_id>": {
+            "link": "https://.../",
+            "title": "...",
+            "date": "...",
+            "journal": "...",
+            "authors": [
+                "...",
+                "..."
+            ],
+            "doi": "...",
+            "abstract": "...",
+            "keywords": [
+                "keyword_1",
+                "keyword_2"
+            ],
+            "evidence": {
+                "name": "...",
+                "rank": 0
+            }
+        }
+    }
+}
\ No newline at end of file
diff --git a/eu_fact_force/ingestion/urls.py b/eu_fact_force/ingestion/urls.py
index 0a3a66c..92b1876 100644
--- a/eu_fact_force/ingestion/urls.py
+++ b/eu_fact_force/ingestion/urls.py
@@ -5,4 +5,5 @@
 app_name = "ingestion"
 urlpatterns = [
     path("ingest/", views.ingest, name="ingest"),
+    path("search/<str:keyword>/", views.search, name="search"),
 ]
diff --git a/eu_fact_force/ingestion/views.py b/eu_fact_force/ingestion/views.py
index 675555e..9c4ab20 100644
--- a/eu_fact_force/ingestion/views.py
+++ b/eu_fact_force/ingestion/views.py
@@ -1,8 +1,18 @@
+import json
+from pathlib import Path
+
+from django.http import JsonResponse
 from django.shortcuts import render
 
+from eu_fact_force.app.settings import FLAG_RETRIEVE_DEFAULT_JSON
+
 from .forms import IngestForm
 from .services import run_pipeline
 
+_DEFAULT_SEARCH_PATH = (
+    Path(__file__).resolve().parent / "data_collection" / "default_search.json"
+)
+
 
 def ingest(request):
     """Accept a DOI via form, run the pipeline, display success and count."""
@@ -31,3 +41,13 @@ def ingest(request):
         else:
             context["form"] = form
     return render(request, "ingestion/ingest.html", context)
+
+
+def search(request, keyword: str):
+    """Return the search JSON for keyword (default fixture if the flag is set)."""
+    _ = keyword
+    if FLAG_RETRIEVE_DEFAULT_JSON:
+        return JsonResponse(
+            json.loads(_DEFAULT_SEARCH_PATH.read_text(encoding="utf-8"))
+        )
+    return JsonResponse({"status": "success", "narrative": keyword})

From a414b68f748310519f915fb9f6c4cb0a5c8bdd4b Mon Sep 17 00:00:00 2001
From: Christophe Goudet <goudetdata@gmail.com>
Date: Sat, 4 Apr 2026 14:45:00 +0200
Subject: [PATCH 2/4] valid search

---
 .../data_collection/prompts/vaccine_autism.md |  1 +
 eu_fact_force/ingestion/search.py             | 51 ++++++++++++++++++-
 eu_fact_force/ingestion/views.py              | 21 +++++++-
 3 files changed, 71 insertions(+), 2 deletions(-)
 create mode 100644 eu_fact_force/ingestion/data_collection/prompts/vaccine_autism.md

diff --git a/eu_fact_force/ingestion/data_collection/prompts/vaccine_autism.md b/eu_fact_force/ingestion/data_collection/prompts/vaccine_autism.md
new file mode 100644
index 0000000..e34a010
--- /dev/null
+++ b/eu_fact_force/ingestion/data_collection/prompts/vaccine_autism.md
@@ -0,0 +1 @@
+I want quantitative results about the link between vaccines and autism.
\ No newline at end of file
diff --git a/eu_fact_force/ingestion/search.py b/eu_fact_force/ingestion/search.py
index 70608be..783e93e 100644
--- a/eu_fact_force/ingestion/search.py
+++ b/eu_fact_force/ingestion/search.py
@@ -1,8 +1,22 @@
 """Semantic search over ingested document chunks using pgvector."""
 
+from pathlib import Path
+
+from pgvector.django import CosineDistance
+
 from eu_fact_force.ingestion.embedding import embed_query
 from eu_fact_force.ingestion.models import DocumentChunk
-from pgvector.django import CosineDistance
+
+_PROMPTS_DIR = Path(__file__).resolve().parent / "data_collection" / "prompts"
+
+
+class NarrativeNotFoundError(FileNotFoundError):
+    """No prompts/<narrative>.md for the given narrative keyword."""
+
+
+def list_prompt_keywords() -> list[str]:
+    """Basenames of narrative prompts (one .md file per keyword), sorted."""
+    return sorted(p.stem for p in _PROMPTS_DIR.glob("*.md"))
 
 
 def search_chunks(query: str, k: int = 10) -> list[tuple[DocumentChunk, float]]:
@@ -26,3 +40,38 @@ def search_chunks(query: str, k: int = 10) -> list[tuple[DocumentChunk, float]]:
         .order_by("distance")[:k]
     )
     return [(chunk, float(chunk.distance)) for chunk in qs]
+
+
+def search_narrative(narrative: str, k: int = 10) -> list[tuple[DocumentChunk, float]]:
+    prompt = _PROMPTS_DIR / f"{narrative}.md"
+    if not prompt.exists():
+        raise NarrativeNotFoundError(f"Prompt file not found: {prompt}")
+    return search_chunks(prompt.read_text(), k)
+
+
+def chunks_context(top_chunks: list[DocumentChunk, float]) -> dict:
+    chunks = [
+        {
+            "type": "text",
+            "content": chunk.content,
+            "score": score,
+            "metadata": {"document_id": chunk.source_file.id, "page": -1},
+        }
+        for chunk, score in top_chunks
+    ]
+
+    documents = {}
+    for chunk, _ in top_chunks:
+        source_file = chunk.source_file
+        if source_file.id in documents:
+            continue
+        meta = source_file.metadata
+        documents[source_file.id] = {
+            "id": source_file.id,
+            "doi": source_file.doi,
+            "tags_pubmed": meta.tags_pubmed,
+        }
+    return {
+        "chunks": chunks,
+        "documents": documents,
+    }
diff --git a/eu_fact_force/ingestion/views.py b/eu_fact_force/ingestion/views.py
index 9c4ab20..9c0f97d 100644
--- a/eu_fact_force/ingestion/views.py
+++ b/eu_fact_force/ingestion/views.py
@@ -5,6 +5,12 @@
 from django.shortcuts import render
 
 from eu_fact_force.app.settings import FLAG_RETRIEVE_DEFAULT_JSON
+from eu_fact_force.ingestion.search import (
+    NarrativeNotFoundError,
+    chunks_context,
+    list_prompt_keywords,
+    search_narrative,
+)
 
 from .forms import IngestForm
 from .services import run_pipeline
@@ -50,4 +56,17 @@ def search(request, keyword: str):
         return JsonResponse(
             json.loads(_DEFAULT_SEARCH_PATH.read_text(encoding="utf-8"))
         )
-    return JsonResponse({"status": "success", "narrative": keyword})
+    try:
+        chunks = search_narrative(keyword)
+    except NarrativeNotFoundError:
+        return JsonResponse(
+            {
+                "error": f"Unknown narrative keyword {keyword!r}; no matching prompt.",
+                "keywords": list_prompt_keywords(),
+            },
+            status=404,
+        )
+
+    return JsonResponse(
+        {"status": "success", "narrative": keyword, **chunks_context(chunks)}
+    )

From 13517d793715f89a31861fe9f143793647e08bd4 Mon Sep 17 00:00:00 2001
From: Christophe Goudet <goudetdata@gmail.com>
Date: Sat, 4 Apr 2026 14:53:12 +0200
Subject: [PATCH 3/4] add unit test

---
 tests/factories.py             | 10 ++++-
 tests/ingestion/test_search.py | 67 ++++++++++++++++++++++++++++++++--
 2 files changed, 73 insertions(+), 4 deletions(-)

diff --git a/tests/factories.py b/tests/factories.py
index 1876f43..c26b5fd 100644
--- a/tests/factories.py
+++ b/tests/factories.py
@@ -3,7 +3,7 @@
 import factory
 from factory.django import DjangoModelFactory
 
-from eu_fact_force.ingestion.models import DocumentChunk, SourceFile
+from eu_fact_force.ingestion.models import DocumentChunk, FileMetadata, SourceFile
 
 
 class SourceFileFactory(DjangoModelFactory):
@@ -15,6 +15,14 @@ class Meta:
     status = SourceFile.Status.STORED
 
 
+class FileMetadataFactory(DjangoModelFactory):
+    class Meta:
+        model = FileMetadata
+
+    source_file = factory.SubFactory(SourceFileFactory)
+    tags_pubmed = factory.LazyFunction(list)
+
+
 class DocumentChunkFactory(DjangoModelFactory):
     class Meta:
         model = DocumentChunk
diff --git a/tests/ingestion/test_search.py b/tests/ingestion/test_search.py
index d23536b..d558160 100644
--- a/tests/ingestion/test_search.py
+++ b/tests/ingestion/test_search.py
@@ -8,8 +8,14 @@
 from eu_fact_force.ingestion import search as search_module
 from eu_fact_force.ingestion import services as services_module
 from eu_fact_force.ingestion.chunking import MAX_CHUNK_CHARS
-from eu_fact_force.ingestion.models import DocumentChunk, EMBEDDING_DIMENSIONS, SourceFile
+from eu_fact_force.ingestion.models import (
+    EMBEDDING_DIMENSIONS,
+    DocumentChunk,
+    SourceFile,
+)
+from eu_fact_force.ingestion.search import chunks_context
 from eu_fact_force.ingestion.services import run_pipeline
+from tests.factories import DocumentChunkFactory, FileMetadataFactory, SourceFileFactory
 
 PROJECT_ROOT = Path(__file__).resolve().parents[2]
 
@@ -132,8 +138,10 @@ def test_pipeline_then_search_returns_chunks_ordered_by_similarity(
         # Vectors so cosine distance order is well-defined: p1 closest, then p2, then p3.
         def _add_known_embeddings(chunks):
             near = _one_hot_vector(0)
-            mid = [SECOND_CLOSEST_VEC_ALIGNED] + [SECOND_CLOSEST_VEC_OFF] + [0.0] * (
-                EMBEDDING_DIMENSIONS - 2
+            mid = (
+                [SECOND_CLOSEST_VEC_ALIGNED]
+                + [SECOND_CLOSEST_VEC_OFF]
+                + [0.0] * (EMBEDDING_DIMENSIONS - 2)
             )
             far = _one_hot_vector(1)
             vecs = [near, mid, far]
@@ -151,3 +159,56 @@ def _add_known_embeddings(chunks):
         contents = [r[0].content for r in results]
         assert contents == [p1, p2, p3]
         assert all(r[0].source_file_id == source_file.pk for r in results)
+
+
+@pytest.mark.django_db
+class TestChunksContext:
+    def test_empty_top_chunks(self):
+        assert chunks_context([]) == {"chunks": [], "documents": {}}
+
+    def test_two_chunks_single_source_file(self):
+        source = SourceFileFactory(doi="doi/single", s3_key="key/single")
+        FileMetadataFactory(source_file=source, tags_pubmed=["mesh:a"])
+        chunk_a = DocumentChunkFactory(source_file=source, content="first")
+        chunk_b = DocumentChunkFactory(source_file=source, content="second")
+
+        result = chunks_context([(chunk_a, 0.9), (chunk_b, 0.8)])
+
+        assert result["chunks"] == [
+            {
+                "type": "text",
+                "content": "first",
+                "score": 0.9,
+                "metadata": {"document_id": source.id, "page": -1},
+            },
+            {
+                "type": "text",
+                "content": "second",
+                "score": 0.8,
+                "metadata": {"document_id": source.id, "page": -1},
+            },
+        ]
+        assert result["documents"] == {
+            source.id: {
+                "id": source.id,
+                "doi": "doi/single",
+                "tags_pubmed": ["mesh:a"],
+            }
+        }
+
+    def test_two_chunks_two_source_files(self):
+        src1 = SourceFileFactory(doi="doi/one", s3_key="k1")
+        FileMetadataFactory(source_file=src1, tags_pubmed=["t1"])
+        src2 = SourceFileFactory(doi="doi/two", s3_key="k2")
+        FileMetadataFactory(source_file=src2, tags_pubmed=["t2", "t3"])
+
+        c1 = DocumentChunkFactory(source_file=src1, content="alpha", order=0)
+        c2 = DocumentChunkFactory(source_file=src2, content="beta", order=0)
+
+        result = chunks_context([(c1, 0.1), (c2, 0.2)])
+
+        assert [x["content"] for x in result["chunks"]] == ["alpha", "beta"]
+        assert result["documents"] == {
+            src1.id: {"id": src1.id, "doi": "doi/one", "tags_pubmed": ["t1"]},
+            src2.id: {"id": src2.id, "doi": "doi/two", "tags_pubmed": ["t2", "t3"]},
+        }

From 67597d557f47a192831c6b692a53a17144532709 Mon Sep 17 00:00:00 2001
From: Christophe Goudet <goudetdata@gmail.com>
Date: Wed, 8 Apr 2026 17:22:13 +0200
Subject: [PATCH 4/4] fix typo

---
 eu_fact_force/ingestion/search.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/eu_fact_force/ingestion/search.py b/eu_fact_force/ingestion/search.py
index 783e93e..b9888eb 100644
--- a/eu_fact_force/ingestion/search.py
+++ b/eu_fact_force/ingestion/search.py
@@ -49,7 +49,7 @@ def search_narrative(narrative: str, k: int = 10) -> list[tuple[DocumentChunk, f
     return search_chunks(prompt.read_text(), k)
 
 
-def chunks_context(top_chunks: list[DocumentChunk, float]) -> dict:
+def chunks_context(top_chunks: list[tuple[DocumentChunk, float]]) -> dict:
     chunks = [
         {
             "type": "text",