HolobiomicsLab · lfnothias · Jun 19, 2026 · Jun 19, 2026
diff --git a/config.example.yml b/config.example.yml
@@ -24,6 +24,10 @@ llm:
   default_provider: "anthropic"
   default_model: "claude-sonnet-4-5"
   default_timeout_s: 60  # applied to every LLM call that doesn't set its own timeout
+  # Ollama only: context window forwarded to the local model. Ollama defaults
+  # num_ctx to 2048, which truncates long RAG synthesis prompts and makes the
+  # model return nothing. Raise to fit your prompts (larger values use more RAM).
+  ollama_num_ctx: 8192
 
   # ── Free-tier auto-rotation (zero-cost alternative) ──────────────────────
   # Set free_auto_mode: true + add your OPENROUTER_API_KEY (free account at

diff --git a/src/perspicacite/config/schema.py b/src/perspicacite/config/schema.py
@@ -499,6 +499,13 @@ class LLMConfig(BaseModel):
     # v1 core/core.py get_response: truncate mandatory + base system prompt to this length (chars)
     max_context_window: int = Field(default=10000, ge=2000, le=500000)
 
+    # Ollama only: context window in tokens (num_ctx) forwarded to the local
+    # model. Ollama silently defaults num_ctx to 2048, which truncates long RAG
+    # synthesis prompts (assembled up to ~context.max_tokens) so the model
+    # emits nothing; a large enough value keeps the prompt inside the window.
+    # Larger values use more RAM; ignored by non-Ollama providers. Minimum 512.
+    ollama_num_ctx: int = Field(default=8192, ge=512)
+
     embedding_models_per_type: dict[str, str] = Field(
         default_factory=dict,
         description=(

diff --git a/src/perspicacite/llm/client.py b/src/perspicacite/llm/client.py
@@ -517,6 +517,19 @@ def _build_model_string(self, provider: str, model: str) -> str:
         # For Minimax, the actual API call uses minimax/{model} format directly
         return f"{provider}/{model}"
 
+    def _provider_extra_params(self, provider: str) -> dict[str, Any]:
+        """Provider-specific kwargs merged into the LiteLLM completion call.
+
+        For Ollama, forward ``num_ctx`` so the local model's context window is
+        large enough for the assembled prompt. Ollama silently defaults
+        ``num_ctx`` to 2048, which truncates long RAG synthesis prompts and
+        makes the model return empty output; LiteLLM passes this through to
+        Ollama's ``options.num_ctx``. Empty for every other provider.
+        """
+        if provider == "ollama":
+            return {"num_ctx": int(self.config.ollama_num_ctx)}
+        return {}
+
     @retry(
         # F1 (audit 2026-05-15): never retry on deterministic-fail errors
         # — auth errors won't suddenly become valid; budget breaches won't
@@ -677,6 +690,9 @@ async def _complete_primary(
                 "max_tokens": max_tokens,
                 "timeout": _effective_timeout,
             }
+            # Ollama: forward num_ctx so the local context window fits the prompt
+            # (Ollama silently defaults to 2048 → truncated/empty output).
+            completion_kwargs.update(self._provider_extra_params(provider))
 
             # TODO: Minimax implementation needs fixes
             # There are response parsing issues with the Anthropic-compatible API

diff --git a/tests/unit/test_ollama_num_ctx.py b/tests/unit/test_ollama_num_ctx.py
@@ -0,0 +1,35 @@
+"""Ollama num_ctx forwarding (local-model context window fix).
+
+Ollama silently defaults num_ctx to 2048, which truncates long RAG synthesis
+prompts and makes the model emit empty output. The client must forward the
+configured num_ctx for the Ollama provider (and nothing for others).
+"""
+import unittest
+
+from perspicacite.config.schema import LLMConfig
+from perspicacite.llm.client import AsyncLLMClient
+
+
+class TestOllamaNumCtx(unittest.TestCase):
+    def test_config_default(self):
+        assert LLMConfig().ollama_num_ctx == 8192
+
+    def test_config_override(self):
+        assert LLMConfig(ollama_num_ctx=16384).ollama_num_ctx == 16384
+
+    def test_ollama_provider_gets_num_ctx(self):
+        client = AsyncLLMClient(LLMConfig())
+        assert client._provider_extra_params("ollama") == {"num_ctx": 8192}
+
+    def test_ollama_respects_config_value(self):
+        client = AsyncLLMClient(LLMConfig(ollama_num_ctx=4096))
+        assert client._provider_extra_params("ollama") == {"num_ctx": 4096}
+
+    def test_non_ollama_providers_unchanged(self):
+        client = AsyncLLMClient(LLMConfig())
+        for provider in ("openai", "anthropic", "deepseek", "minimax"):
+            assert client._provider_extra_params(provider) == {}
+
+
+if __name__ == "__main__":
+    unittest.main()  # lets this test module be run directly as a script