diff --git a/backend/.env.example b/backend/.env.example index 0ece358..cfb2507 100644 --- a/backend/.env.example +++ b/backend/.env.example @@ -6,13 +6,26 @@ # `python api.py` when running the backend on the host. # ---- LLM provider (profile-driven via LiteLLM) ---- -# Picks an entry from backend/API/config.py::_LLM_PROFILES. -# Currently only `deepinfra` ships out of the box. +# Flip LLM_PROFILE to switch the whole stack. Six profiles ship in +# backend/API/config.py::_LLM_PROFILES — see the README for the full table. +# gemini | deepinfra | nim | together | local | local_gemma LLM_PROFILE=deepinfra -DEEPINFRA_API_KEY=your-deepinfra-api-key-here -# Optional: override the model string for the active profile -# AGENT_MODEL=deepinfra/google/gemma-4-31B-it +# Provider keys — uncomment/fill the one(s) matching the active LLM_PROFILE: +DEEPINFRA_API_KEY=your-deepinfra-api-key-here +# GEMINI_API_KEY= +# NVIDIA_NIM_API_KEY= +# TOGETHER_API_KEY= + +# Local OpenAI-compatible servers (llama.cpp, vLLM, LM Studio, etc.). +# Inside Docker, use host.docker.internal instead of localhost. +# LOCAL_API_KEY=sk-noauth # any non-empty value +# LOCAL_API_BASE=http://host.docker.internal:8003/v1 # for LLM_PROFILE=local +# LOCAL_API_BASE_GEMMA=http://host.docker.internal:8002/v1 # for LLM_PROFILE=local_gemma + +# Optional overrides (rarely needed — profile defaults usually suffice): +# AGENT_MODEL= # override the active profile's model id +# AGENT_API_BASE= # global base_url override # ---- Agent loop ---- AGENT_MAX_TURNS=15 diff --git a/backend/API/agent.py b/backend/API/agent.py index f9674a7..c85869d 100644 --- a/backend/API/agent.py +++ b/backend/API/agent.py @@ -28,10 +28,13 @@ def _create_model() -> LitellmModel: - return LitellmModel( - model=settings.resolved_agent_model, - api_key=settings.resolved_api_key, - ) + kwargs: dict = { + "model": settings.resolved_agent_model, + "api_key": settings.resolved_api_key, + } + if settings.resolved_api_base: + kwargs["base_url"] = settings.resolved_api_base + return LitellmModel(**kwargs) def _build_instructions(user_task: str, output_schema: list[dict], scrape_backend: str) -> str: diff --git a/backend/API/config.py b/backend/API/config.py index d72d886..4260d71 100644 --- a/backend/API/config.py +++ b/backend/API/config.py @@ -12,9 +12,40 @@ # Adding a provider = one dict entry. No code change elsewhere. _LLM_PROFILES: dict[str, dict[str, str]] = { + "gemini": { + "model": "gemini/gemini-3.1-flash-lite-preview", + "api_key_env": "GEMINI_API_KEY", + "api_base_env": "", + }, "deepinfra": { "model": "deepinfra/google/gemma-4-31B-it", "api_key_env": "DEEPINFRA_API_KEY", + "api_base_env": "", + }, + "nim": { + "model": "nvidia_nim/google/gemma-4-31b-it", + "api_key_env": "NVIDIA_NIM_API_KEY", + "api_base_env": "", + }, + "together": { + "model": "together_ai/google/gemma-4-31B-it", + "api_key_env": "TOGETHER_API_KEY", + "api_base_env": "", + }, + "local": { + # llama.cpp llama-server (OpenAI-compatible) hosting Qwen on :8003. + # The "openai/" prefix tells LiteLLM to route via api_base. + "model": "openai/qwen", + "api_key_env": "LOCAL_API_KEY", + "api_base_env": "LOCAL_API_BASE", + }, + "local_gemma": { + # OpenAI-compatible server hosting Gemma 4 31B (AWQ) on :8002. + # Edit this string if your server exposes the model under a different + # name, or override at runtime with AGENT_MODEL. 
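+        # For reference, one hypothetical way to stand up such a server with
+        # vLLM (command and flags are illustrative, not part of this repo):
+        #   vllm serve cyankiwi/gemma-4-31B-it-AWQ-4bit --port 8002
+        # The model name the server registers must match what follows the
+        # "openai/" prefix in the id below.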
+ "model": "openai/cyankiwi/gemma-4-31B-it-AWQ-4bit", + "api_key_env": "LOCAL_API_KEY", + "api_base_env": "LOCAL_API_BASE_GEMMA", }, } @@ -25,8 +56,9 @@ class Settings(BaseSettings): # --- Master switch --- llm_profile: str = "deepinfra" - # --- Optional model override (blank = use profile default) --- + # --- Optional overrides (blank = use profile default) --- agent_model: str = "" + agent_api_base: str = "" # --- Agent loop --- agent_max_turns: int = 15 @@ -57,8 +89,15 @@ def resolved_agent_model(self) -> str: @property def resolved_api_key(self) -> str: - env = _LLM_PROFILES[self.llm_profile]["api_key_env"] - return os.getenv(env, "") + env = _LLM_PROFILES[self.llm_profile].get("api_key_env", "") + return os.getenv(env, "") if env else "" + + @property + def resolved_api_base(self) -> str: + if self.agent_api_base: + return self.agent_api_base + env = _LLM_PROFILES[self.llm_profile].get("api_base_env", "") + return os.getenv(env, "") if env else "" @property def running_in_docker(self) -> bool: @@ -79,6 +118,7 @@ def log_config_summary(self, logger: Optional[logging.Logger] = None) -> None: logger.info(" LLM profile: %s", self.llm_profile) logger.info(" Resolved model: %s", self.resolved_agent_model) logger.info(" API key set: %s", bool(self.resolved_api_key)) + logger.info(" API base: %s", self.resolved_api_base or "(provider default)") logger.info(" Firecrawl key set: %s", bool(self.firecrawl_api_key)) logger.info(" Running in Docker: %s", self.running_in_docker) logger.info(" Browser-visible supported: %s", self.browser_visible_supported) diff --git a/backend/README.md b/backend/README.md index 4d59ad8..941fca0 100644 --- a/backend/README.md +++ b/backend/README.md @@ -8,7 +8,7 @@ The backend exposes a small HTTP API. Each call accepts one row of input data, a ### Key Features -- **Profile-based LLM provider** — `LLM_PROFILE=deepinfra` runs `deepinfra/google/gemma-4-31B-it`. Adding a provider is one entry in `_LLM_PROFILES` (see `config.py`). +- **Profile-based LLM provider** — `LLM_PROFILE` flips the whole stack: ships with six profiles (`gemini`, `deepinfra`, `nim`, `together`, `local`, `local_gemma`). Cloud providers (Gemini 3.1 Flash Lite, DeepInfra, NVIDIA NIM, Together) and OpenAI-compatible local servers (llama.cpp, vLLM, LM Studio) all work via the same dict in `config.py`. - **Two scrape backends, picked per request**: - `local` (default) — Playwright (`search_google`) + crawl4ai (`visit_webpage`). Chromium baked into the image. - `firecrawl` — Firecrawl MCP (`firecrawl_scrape`, `firecrawl_search`). @@ -58,7 +58,7 @@ backend/ ### Prerequisites - Python 3.13+ -- DeepInfra API key (default profile) — or any LiteLLM-supported provider after adding it to `_LLM_PROFILES` +- API key for one of the six built-in profiles (`gemini`, `deepinfra`, `nim`, `together`, `local`, `local_gemma`) — or any LiteLLM-supported provider after adding it to `_LLM_PROFILES` - Firecrawl API key (only required if you'll use `scrape_backend=firecrawl`) - For host run: `playwright install chromium` after `pip install` @@ -71,17 +71,32 @@ pip install -r requirements.txt playwright install chromium # only needed for host runs ``` -2. **Configure environment** (create `.env`): +2. 
**Configure environment** (create `.env` — pick the profile you want): ```bash -# Required — LLM provider profile + key +# Required — LLM provider profile + matching API key LLM_PROFILE=deepinfra DEEPINFRA_API_KEY=your-deepinfra-api-key +# GEMINI_API_KEY=your-gemini-api-key # for LLM_PROFILE=gemini +# NVIDIA_NIM_API_KEY=your-nim-api-key # for LLM_PROFILE=nim +# TOGETHER_API_KEY=your-together-api-key # for LLM_PROFILE=together + +# Local OpenAI-compatible server (llama.cpp / vLLM / LM Studio): +# LLM_PROFILE=local # Qwen on :8003 by default +# LOCAL_API_KEY=sk-noauth # any non-empty value +# LOCAL_API_BASE=http://localhost:8003/v1 +# (inside Docker: http://host.docker.internal:8003/v1) + +# LLM_PROFILE=local_gemma # Gemma 4 31B AWQ on :8002 +# LOCAL_API_KEY=sk-noauth +# LOCAL_API_BASE_GEMMA=http://localhost:8002/v1 +# AGENT_MODEL= # only needed if your server uses a different model id # Required if you'll use the firecrawl backend FIRECRAWL_API_KEY=your-firecrawl-api-key # Optional overrides -# AGENT_MODEL=deepinfra/google/gemma-4-31B-it +# AGENT_MODEL= # override the profile's model string +# AGENT_API_BASE= # global base_url override # AGENT_MAX_TURNS=15 # FIRECRAWL_MAX_CONTENT_LENGTH=20000 # DEBUG_MODE=true @@ -114,10 +129,17 @@ wsl -- bash -c 'cd /mnt/c/.../knowledge-robot && \ | Variable | Required | Default | Description | |----------|----------|---------|-------------| -| `LLM_PROFILE` | Yes | `deepinfra` | Picks an entry from `_LLM_PROFILES` in `config.py` | +| `LLM_PROFILE` | Yes | `deepinfra` | Picks an entry from `_LLM_PROFILES` in `config.py`. Ships with `gemini` / `deepinfra` / `nim` / `together` / `local` / `local_gemma` | +| `GEMINI_API_KEY` | Yes (for `gemini` profile) | - | Google AI Studio key | | `DEEPINFRA_API_KEY` | Yes (for `deepinfra` profile) | - | DeepInfra API key | +| `NVIDIA_NIM_API_KEY` | Yes (for `nim` profile) | - | NVIDIA NIM API key | +| `TOGETHER_API_KEY` | Yes (for `together` profile) | - | Together AI API key | +| `LOCAL_API_KEY` | Yes (for `local` / `local_gemma`) | - | Any non-empty string; the OpenAI client validates it client-side, the local server typically ignores it | +| `LOCAL_API_BASE` | Yes (for `local` profile) | - | OpenAI-compatible URL, e.g. `http://localhost:8003/v1` (or `http://host.docker.internal:8003/v1` from inside Docker) | +| `LOCAL_API_BASE_GEMMA` | Yes (for `local_gemma` profile) | - | OpenAI-compatible URL for the Gemma server, e.g. `http://localhost:8002/v1` | | `FIRECRAWL_API_KEY` | When `scrape_backend=firecrawl` | - | Firecrawl API key | -| `AGENT_MODEL` | No | (profile default) | Override the profile's model string, e.g. `deepinfra/google/gemma-4-31B-it` | +| `AGENT_MODEL` | No | (profile default) | Override the profile's model string, e.g. `openai/cyankiwi/gemma-4-31B-it-AWQ-4bit` | +| `AGENT_API_BASE` | No | (profile default) | Global `base_url` override (wins over the per-profile `*_API_BASE`) | | `AGENT_MAX_TURNS` | No | `15` | Max tool calls per row | | `AGENT_SUBAGENT_MAX_TURNS` | No | `30` | Max tool calls inside a subagent | | `TOOL_OUTPUT_MAX_CHARS` | No | `8000` | Per-tool truncation cap for local-backend tools | @@ -137,23 +159,33 @@ wsl -- bash -c 'cd /mnt/c/.../knowledge-robot && \ ### Supported LLM Providers -The default profile is `deepinfra`. 
To add a provider, append to `_LLM_PROFILES` in [API/config.py](API/config.py): +Six profiles ship in [API/config.py](API/config.py): + +| Profile | Default model | Required env | +|---------|---------------|--------------| +| `gemini` | `gemini/gemini-3.1-flash-lite-preview` | `GEMINI_API_KEY` | +| `deepinfra` | `deepinfra/google/gemma-4-31B-it` | `DEEPINFRA_API_KEY` | +| `nim` | `nvidia_nim/google/gemma-4-31b-it` | `NVIDIA_NIM_API_KEY` | +| `together` | `together_ai/google/gemma-4-31B-it` | `TOGETHER_API_KEY` | +| `local` | `openai/qwen` (llama.cpp Qwen on :8003) | `LOCAL_API_KEY` + `LOCAL_API_BASE` | +| `local_gemma` | `openai/cyankiwi/gemma-4-31B-it-AWQ-4bit` (Gemma 4 31B AWQ on :8002) | `LOCAL_API_KEY` + `LOCAL_API_BASE_GEMMA` | + +Set `LLM_PROFILE=` in `.env` to switch — that's it, no other vars to flip. If your local server exposes the model under a different name, edit the `model` field of the profile in [API/config.py](API/config.py) (or override at runtime with `AGENT_MODEL`). + +To add another LiteLLM-supported provider, append to `_LLM_PROFILES`: ```python _LLM_PROFILES = { - "deepinfra": { - "model": "deepinfra/google/gemma-4-31B-it", - "api_key_env": "DEEPINFRA_API_KEY", + # ... existing six ... + "anthropic": { + "model": "anthropic/claude-sonnet-4-5", + "api_key_env": "ANTHROPIC_API_KEY", + "api_base_env": "", }, - # Examples (not currently wired): - # "gemini": {"model": "gemini/gemini-2.5-flash", "api_key_env": "GEMINI_API_KEY"}, - # "anthropic":{"model": "anthropic/claude-3-5-sonnet-latest", "api_key_env": "ANTHROPIC_API_KEY"}, - # "openai": {"model": "openai/gpt-4o-mini", "api_key_env": "OPENAI_API_KEY"}, - # "nim": {"model": "nvidia_nim/google/gemma-4-31b-it", "api_key_env": "NVIDIA_NIM_API_KEY"}, } ``` -Set `LLM_PROFILE=` in `.env` to switch. **Note**: `ModelSettings()` is bare in this codebase — passing `reasoning_effort` raises `UnsupportedParamsError` on DeepInfra/Gemma. If you want reasoning on a Gemini-class profile, set `litellm.drop_params=True` at module load. +**Note**: `ModelSettings()` is bare in this codebase — passing `reasoning_effort` raises `UnsupportedParamsError` on DeepInfra/Gemma. If you want reasoning on a Gemini-class profile, set `litellm.drop_params=True` at module load. ## API Endpoints @@ -265,7 +297,7 @@ Module-level `process_row()` validates the request, builds the dynamic Pydantic ### 2. Agent factory + dispatcher (`agent.py`) - `set_tracing_disabled(disabled=True)` at module load (silences OpenAI tracing service spam). -- `_create_model()` returns a `LitellmModel` from `settings.resolved_agent_model` + `settings.resolved_api_key`. +- `_create_model()` returns a `LitellmModel` from `settings.resolved_agent_model` + `settings.resolved_api_key`, plus `base_url=settings.resolved_api_base` when the active profile defines an `api_base_env` (used by `local` / `local_gemma`). - `_build_instructions(user_task, output_schema, scrape_backend)` returns the system prompt — branches by backend. - `_run_firecrawl(...)` and `_run_local(...)` build the Agent with the right tool list and run it. Both end with `StopAtTools(["submit_result"])`. - `run_agent(...)` is the public dispatcher; raises `ValueError` for anything other than `"firecrawl"` / `"local"`. @@ -290,6 +322,7 @@ Module-level `process_row()` validates the request, builds the dynamic Pydantic `Settings(BaseSettings)` from `pydantic-settings`. Reads `.env` once at import. Singleton `settings` exposed via `get_config()` shim. 
Computed properties:
- `resolved_agent_model` — `agent_model` override OR `_LLM_PROFILES[llm_profile]["model"]`
- `resolved_api_key` — env var named by `_LLM_PROFILES[llm_profile]["api_key_env"]`
+- `resolved_api_base` — `agent_api_base` override OR env var named by `_LLM_PROFILES[llm_profile]["api_base_env"]` (empty for cloud profiles, set for `local` / `local_gemma`)
- `running_in_docker` — checks `/.dockerenv` or `RUNNING_IN_DOCKER=true`
- `browser_visible_supported` — true on host, true in WSLg overlay, false in plain Docker

diff --git a/docker-compose.local.yml b/docker-compose.local.yml
index a010e1c..ed01762 100644
--- a/docker-compose.local.yml
+++ b/docker-compose.local.yml
@@ -10,6 +10,12 @@ services:
       - "8080:8080"
     env_file:
       - ./backend/.env
+    extra_hosts:
+      # Lets the container reach a local LLM server running on the host
+      # via http://host.docker.internal:<port>/v1 (LLM_PROFILE=local* path).
+      # Docker Desktop on Windows/Mac resolves this automatically; this
+      # explicit mapping makes the same setup work on native Linux too.
+      - "host.docker.internal:host-gateway"
     environment:
      # Development settings (override defaults from env_file if needed)
      - DEBUG_MODE=true
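A quick end-to-end check of the resolution chain this patch adds. This is a minimal sketch, assuming it is run from `backend/` with the `local_gemma` values from `.env.example` and no conflicting `.env` overrides; `settings`, `resolved_agent_model`, and `resolved_api_base` come from `API/config.py` above.

```python
# Minimal sketch: exercise the profile -> model/key/base_url resolution that
# this patch adds. The port and model id mirror .env.example's local_gemma
# block; adjust them to whatever your own server exposes.
import logging
import os

# Set env before importing config, since Settings reads env/.env once at import.
os.environ["LLM_PROFILE"] = "local_gemma"
os.environ["LOCAL_API_KEY"] = "sk-noauth"  # any non-empty value works
os.environ["LOCAL_API_BASE_GEMMA"] = "http://localhost:8002/v1"

from API.config import settings

# Profile default is used because AGENT_MODEL is unset:
assert settings.resolved_agent_model == "openai/cyankiwi/gemma-4-31B-it-AWQ-4bit"
# Per-profile base URL is used because AGENT_API_BASE is unset:
assert settings.resolved_api_base == "http://localhost:8002/v1"

logging.basicConfig(level=logging.INFO)
settings.log_config_summary()  # includes: "API base: http://localhost:8002/v1"
```

With those values resolved, `_create_model()` in `agent.py` passes `base_url` through to `LitellmModel`, so the `openai/...` model id is routed to the local endpoint instead of api.openai.com.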