diff --git a/config.example.yml b/config.example.yml index b56bacc..2f213af 100644 --- a/config.example.yml +++ b/config.example.yml @@ -24,6 +24,10 @@ llm: default_provider: "anthropic" default_model: "claude-sonnet-4-5" default_timeout_s: 60 # applied to every LLM call that doesn't set its own timeout + # Ollama only: context window forwarded to the local model. Ollama defaults + # num_ctx to 2048, which truncates long RAG synthesis prompts and makes the + # model return nothing. Raise to fit your prompts (more RAM = larger value). + ollama_num_ctx: 8192 # ── Free-tier auto-rotation (zero-cost alternative) ────────────────────── # Set free_auto_mode: true + add your OPENROUTER_API_KEY (free account at diff --git a/src/perspicacite/config/schema.py b/src/perspicacite/config/schema.py index f9f048a..9bc1eb1 100644 --- a/src/perspicacite/config/schema.py +++ b/src/perspicacite/config/schema.py @@ -499,6 +499,13 @@ class LLMConfig(BaseModel): # v1 core/core.py get_response: truncate mandatory + base system prompt to this length (chars) max_context_window: int = Field(default=10000, ge=2000, le=500000) + # Ollama only: context window (num_ctx) forwarded to the local model. + # Ollama silently defaults num_ctx to 2048, which truncates long RAG + # synthesis prompts (assembled up to ~context.max_tokens) so the model + # emits nothing. Forwarding this keeps the prompt inside the window. + # Larger values use more RAM; ignored by non-Ollama providers. 
def _provider_extra_params(self, provider: str) -> dict[str, Any]:
    """Return provider-specific kwargs to merge into the LiteLLM completion call.

    For Ollama, forward ``num_ctx`` so the local model's context window is
    large enough for the assembled prompt. Ollama silently defaults
    ``num_ctx`` to 2048, which truncates long RAG synthesis prompts and makes
    the model return empty output; LiteLLM passes the value through to
    Ollama's ``options.num_ctx``.

    Args:
        provider: Provider name as used in the LiteLLM model prefix
            (e.g. ``"ollama"``, ``"ollama_chat"``, ``"openai"``). Matching
            ignores case and surrounding whitespace.

    Returns:
        ``{"num_ctx": <int>}`` taken from ``self.config.ollama_num_ctx`` for
        Ollama providers; an empty dict for every other provider (including
        non-string values such as ``None``).
    """
    # Only strings can name a provider; anything else falls through to the
    # empty result instead of raising on ``.strip()``.
    normalized = provider.strip().lower() if isinstance(provider, str) else provider

    # LiteLLM exposes Ollama under two prefixes ("ollama" and "ollama_chat");
    # both talk to the same local server and need the enlarged context window.
    if normalized in ("ollama", "ollama_chat"):
        return {"num_ctx": int(self.config.ollama_num_ctx)}
    return {}
class TestOllamaNumCtx(unittest.TestCase):
    """Check the ``ollama_num_ctx`` setting and its forwarding by the LLM client.

    Ollama silently defaults ``num_ctx`` to 2048, which truncates long RAG
    synthesis prompts and makes the model emit empty output. The client must
    forward the configured value for the Ollama provider and nothing for
    the others.
    """

    def test_config_default(self):
        """The schema default is 8192."""
        self.assertEqual(LLMConfig().ollama_num_ctx, 8192)

    def test_config_override(self):
        """An explicit value replaces the default."""
        self.assertEqual(LLMConfig(ollama_num_ctx=16384).ollama_num_ctx, 16384)

    def test_config_rejects_value_below_minimum(self):
        """Values under the schema's ``ge=512`` bound fail validation."""
        # pydantic's ValidationError is a ValueError subclass, so no extra
        # import is needed to assert on it.
        with self.assertRaises(ValueError):
            LLMConfig(ollama_num_ctx=511)

    def test_ollama_provider_gets_num_ctx(self):
        """The Ollama provider receives the default ``num_ctx``."""
        client = AsyncLLMClient(LLMConfig())
        self.assertEqual(client._provider_extra_params("ollama"), {"num_ctx": 8192})

    def test_ollama_respects_config_value(self):
        """The forwarded value tracks the configured one."""
        client = AsyncLLMClient(LLMConfig(ollama_num_ctx=4096))
        self.assertEqual(client._provider_extra_params("ollama"), {"num_ctx": 4096})

    def test_non_ollama_providers_unchanged(self):
        """Other providers get no extra kwargs."""
        client = AsyncLLMClient(LLMConfig())
        for provider in ("openai", "anthropic", "deepseek", "minimax"):
            # subTest reports every failing provider instead of stopping at
            # the first one.
            with self.subTest(provider=provider):
                self.assertEqual(client._provider_extra_params(provider), {})