Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
4 changes: 4 additions & 0 deletions config.example.yml
Original file line number Diff line number Diff line change
Expand Up @@ -24,6 +24,10 @@ llm:
default_provider: "anthropic"
default_model: "claude-sonnet-4-5"
default_timeout_s: 60 # applied to every LLM call that doesn't set its own timeout
# Ollama only: context window (num_ctx) forwarded to the local model. Ollama
# defaults num_ctx to 2048, which truncates long RAG synthesis prompts and
# makes the model return nothing. Raise it to fit your prompts; larger values
# use more RAM.
ollama_num_ctx: 8192

# ── Free-tier auto-rotation (zero-cost alternative) ──────────────────────
# Set free_auto_mode: true + add your OPENROUTER_API_KEY (free account at
Expand Down
7 changes: 7 additions & 0 deletions src/perspicacite/config/schema.py
Original file line number Diff line number Diff line change
Expand Up @@ -499,6 +499,13 @@ class LLMConfig(BaseModel):
# v1 core/core.py get_response: truncate mandatory + base system prompt to this length (chars)
max_context_window: int = Field(default=10000, ge=2000, le=500000)

# Ollama only: context window (num_ctx) forwarded to the local model.
# Ollama silently defaults num_ctx to 2048, which truncates long RAG
# synthesis prompts (assembled up to ~context.max_tokens) so the model
# emits nothing. Forwarding this keeps the prompt inside the window.
# Larger values use more RAM; ignored by non-Ollama providers.
ollama_num_ctx: int = Field(default=8192, ge=512)

embedding_models_per_type: dict[str, str] = Field(
default_factory=dict,
description=(
Expand Down
16 changes: 16 additions & 0 deletions src/perspicacite/llm/client.py
Original file line number Diff line number Diff line change
Expand Up @@ -517,6 +517,19 @@ def _build_model_string(self, provider: str, model: str) -> str:
# For Minimax, the actual API call uses minimax/{model} format directly
return f"{provider}/{model}"

def _provider_extra_params(self, provider: str) -> dict[str, Any]:
    """Return provider-specific kwargs to merge into the LiteLLM completion call.

    Ollama silently defaults ``num_ctx`` to 2048, which truncates long RAG
    synthesis prompts and makes the model return empty output. For Ollama
    providers the configured ``ollama_num_ctx`` is forwarded as ``num_ctx``;
    LiteLLM passes it through to Ollama's ``options.num_ctx``.

    Both LiteLLM provider prefixes that talk to an Ollama server are
    covered: ``ollama`` and ``ollama_chat``. Without the second one, a
    model addressed as ``ollama_chat/<model>`` would still run with the
    2048-token default.

    Args:
        provider: Provider name (e.g. ``"ollama"``, ``"anthropic"``).

    Returns:
        ``{"num_ctx": <int>}`` for Ollama providers, an empty dict for
        every other provider. A new dict is returned on each call.
    """
    if provider in ("ollama", "ollama_chat"):
        return {"num_ctx": int(self.config.ollama_num_ctx)}
    return {}

@retry(
# F1 (audit 2026-05-15): never retry on deterministic-fail errors
# — auth errors won't suddenly become valid; budget breaches won't
Expand Down Expand Up @@ -677,6 +690,9 @@ async def _complete_primary(
"max_tokens": max_tokens,
"timeout": _effective_timeout,
}
# Ollama: forward num_ctx so the local context window fits the prompt
# (Ollama silently defaults to 2048 → truncated/empty output).
completion_kwargs.update(self._provider_extra_params(provider))

# TODO: Minimax implementation needs fixes
# There are response parsing issues with the Anthropic-compatible API
Expand Down
35 changes: 35 additions & 0 deletions tests/unit/test_ollama_num_ctx.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,35 @@
"""Ollama num_ctx forwarding (local-model context window fix).

Ollama silently defaults num_ctx to 2048, which truncates long RAG synthesis
prompts and makes the model emit empty output. The client must forward the
configured num_ctx for the Ollama provider (and nothing for others).
"""
import unittest

from perspicacite.config.schema import LLMConfig
from perspicacite.llm.client import AsyncLLMClient


class TestOllamaNumCtx(unittest.TestCase):
    """Checks the ``ollama_num_ctx`` setting and how the client forwards it.

    Uses ``TestCase`` assertion methods rather than bare ``assert`` so the
    checks still run when the file is executed with ``python -O`` (which
    strips ``assert`` statements) and so failures show both values.
    """

    def test_config_default(self):
        """The setting defaults to 8192 when not configured."""
        self.assertEqual(LLMConfig().ollama_num_ctx, 8192)

    def test_config_override(self):
        """An explicitly configured value is kept as given."""
        self.assertEqual(LLMConfig(ollama_num_ctx=16384).ollama_num_ctx, 16384)

    def test_ollama_provider_gets_num_ctx(self):
        """With a default config, the Ollama provider receives num_ctx=8192."""
        client = AsyncLLMClient(LLMConfig())
        self.assertEqual(
            client._provider_extra_params("ollama"), {"num_ctx": 8192}
        )

    def test_ollama_respects_config_value(self):
        """The forwarded num_ctx follows the configured value."""
        client = AsyncLLMClient(LLMConfig(ollama_num_ctx=4096))
        self.assertEqual(
            client._provider_extra_params("ollama"), {"num_ctx": 4096}
        )

    def test_non_ollama_providers_unchanged(self):
        """Other providers get no extra params."""
        client = AsyncLLMClient(LLMConfig())
        for provider in ("openai", "anthropic", "deepseek", "minimax"):
            # subTest reports every failing provider instead of stopping
            # at the first one.
            with self.subTest(provider=provider):
                self.assertEqual(client._provider_extra_params(provider), {})


# Allow running this test module directly as a script, outside pytest.
if __name__ == "__main__":
    unittest.main()
Loading