diff --git a/BUILD_REPORT.md b/BUILD_REPORT.md index cb4c582..cc36c81 100644 --- a/BUILD_REPORT.md +++ b/BUILD_REPORT.md @@ -1,82 +1,69 @@ # BUILD_REPORT ## sprint objective -Implement Phase 11 Sprint 2 (`P11-S2`) local-provider support by shipping Ollama and llama.cpp adapters behind the existing provider abstraction, including registration, model enumeration + health posture snapshots, and normalized runtime invoke through existing `v1` seams. +Implement `P11-S3` by adding a vLLM adapter and self-hosted runtime path through the existing provider abstraction, with bounded provider-specific passthrough options, normalized latency/usage telemetry persistence and API exposure, plus self-hosted docs and runnable examples. ## completed work -- Added local provider transport helpers in `apps/api/src/alicebot_api/local_provider_helpers.py`: - - auth header handling (`bearer`/`none`) - - deterministic JSON request helper - - Ollama/llama.cpp model enumeration parsers - - Ollama/llama.cpp invoke response normalization -- Extended provider runtime adapters in `apps/api/src/alicebot_api/provider_runtime.py`: - - added `ollama` and `llamacpp` adapter keys and implementations - - registered both adapters in the existing provider registry - - added deterministic capability snapshot fields for local health/model posture - - preserved normalized runtime provider seam (`openai_responses`) -- Added additive model provider config fields in persistence: - - migration `apps/api/alembic/versions/20260411_0053_phase11_local_provider_config_fields.py` - - store/runtime wiring for `auth_mode`, `model_list_path`, `healthcheck_path`, `invoke_path` -- Updated API contract and serialization surfaces: - - `apps/api/src/alicebot_api/contracts.py` - - `apps/api/src/alicebot_api/store.py` - - `apps/api/src/alicebot_api/main.py` -- Added new registration APIs in `apps/api/src/alicebot_api/main.py`: - - `POST /v1/providers/ollama/register` - - `POST /v1/providers/llamacpp/register` -- Kept existing in-scope APIs working with local adapters: - - `POST /v1/providers/test` - - `POST /v1/runtime/invoke` - - `GET /v1/providers` - - `GET /v1/providers/{provider_id}` -- Added failure-safe capability behavior: - - registration stores failed discovery posture when local provider is unreachable - - provider test stores failed discovery posture when capability discovery fails -- Added sprint verification tests: - - `tests/unit/test_provider_runtime.py` - - `tests/unit/test_20260411_0053_phase11_local_provider_config_fields.py` - - `tests/integration/test_phase11_provider_runtime_api.py` -- Added local setup docs and runnable example paths: - - `docs/integrations/phase11-local-provider-adapters.md` - - `scripts/run_phase11_local_provider_e2e.py` -- Updated control-doc truth checker markers for current sprint state: - - `scripts/check_control_doc_truth.py` - - linked new integration doc from `README.md` +- Added vLLM adapter support in provider runtime: + - new adapter key `vllm` + - capability discovery via `/health` + `/v1/models` + - invoke via `/v1/chat/completions` + - capability snapshot telemetry posture fields (`supports_normalized_latency_telemetry`, `supports_normalized_usage_telemetry`, `telemetry_flow_scope`) +- Added bounded provider-specific passthrough: + - explicit `adapter_options.invoke_passthrough` schema for vLLM registration + - bounded allowlist extraction helper for vLLM passthrough options + - passthrough applied only in vLLM adapter invoke payload +- Added vLLM provider registration endpoint: + - `POST 
/v1/providers/vllm/register` +- Added provider telemetry persistence + API: + - new telemetry storage table and store methods + - telemetry recording for `/v1/providers/test` and `/v1/runtime/invoke` + - new endpoint `GET /v1/providers/{provider_id}/telemetry` +- Added additive provider config field support: + - `model_providers.adapter_options` persisted and serialized +- Added migration: + - `20260411_0054_phase11_vllm_telemetry` +- Added/updated tests for runtime, integration, and migration coverage +- Added self-hosted docs and runnable script for vLLM end-to-end flow +- Updated control-doc truth check markers to `P11-S3` ## incomplete work -- None for `P11-S2` acceptance criteria and required verification commands. +- None identified within sprint scope. ## files changed -- `apps/api/src/alicebot_api/local_provider_helpers.py` +Sprint-owned files changed: - `apps/api/src/alicebot_api/provider_runtime.py` - `apps/api/src/alicebot_api/main.py` -- `apps/api/src/alicebot_api/store.py` - `apps/api/src/alicebot_api/contracts.py` -- `apps/api/alembic/versions/20260411_0053_phase11_local_provider_config_fields.py` +- `apps/api/src/alicebot_api/store.py` +- `apps/api/src/alicebot_api/vllm_provider_helpers.py` (new) +- `apps/api/alembic/versions/20260411_0054_phase11_vllm_telemetry.py` (new) - `tests/unit/test_provider_runtime.py` -- `tests/unit/test_20260411_0053_phase11_local_provider_config_fields.py` - `tests/integration/test_phase11_provider_runtime_api.py` -- `docs/integrations/phase11-local-provider-adapters.md` -- `scripts/run_phase11_local_provider_e2e.py` +- `tests/unit/test_20260411_0054_phase11_vllm_telemetry.py` (new) +- `docs/integrations/phase11-vllm-self-hosted.md` (new) +- `scripts/run_phase11_vllm_e2e.py` (new) - `scripts/check_control_doc_truth.py` -- `README.md` - `BUILD_REPORT.md` - `REVIEW_REPORT.md` +Pre-existing dirty files excluded from sprint merge scope: +- `README.md` +- `ARCHITECTURE.md` +- `PRODUCT_BRIEF.md` + ## tests run +Required verification commands and exact results: - `python3 scripts/check_control_doc_truth.py` - Result: `PASS` + - Verified: `README.md`, `ROADMAP.md`, `.ai/active/SPRINT_PACKET.md`, `RULES.md`, `.ai/handoff/CURRENT_STATE.md`, `docs/archive/planning/2026-04-08-context-compaction/README.md` - `./.venv/bin/python -m pytest tests/unit tests/integration -q` - - Result: `PASS` (`1118 passed in 183.14s (0:03:03)`) + - Result: `1122 passed in 170.62s (0:02:50)` - `pnpm --dir apps/web test` - - Result: `PASS` (`62 files`, `199 tests`, duration `4.82s`) -- Sprint-targeted subset: - - `./.venv/bin/python -m pytest tests/unit/test_provider_runtime.py tests/unit/test_20260411_0053_phase11_local_provider_config_fields.py tests/integration/test_phase11_provider_runtime_api.py -q` - - Result: `PASS` (`12 passed in 2.50s`) + - Result: `62 passed` test files, `199 passed` tests, duration `4.86s` ## blockers/issues -- No active implementation blockers. +- No blockers during implementation. ## recommended next step -1. Open a sprint PR from `codex/phase11-sprint-2-ollama-llamacpp-adapters` with this report and required test evidence. -2. Keep pre-existing dirty local docs (`ARCHITECTURE.md`, `PRODUCT_BRIEF.md`) excluded from sprint merge scope. +1. Open the sprint PR from branch `codex/phase11-sprint-3-vllm-adapter-selfhosted` and request review focused on vLLM telemetry schema and endpoint response shape stability. 
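+
+## appendix: bounded passthrough sketch
+
+For reviewers skimming this report: "bounded provider-specific passthrough" above means only allowlisted `invoke_passthrough` keys ever reach the vLLM request body. The sketch below reuses the `extract_vllm_invoke_passthrough_options` helper shipped in this sprint; the option values are illustrative, and `logit_bias` is a deliberately non-allowlisted key included only to show the drop behavior.
+
+```python
+# Sketch of the bounded-passthrough contract; values are illustrative.
+from alicebot_api.vllm_provider_helpers import extract_vllm_invoke_passthrough_options
+
+adapter_options = {
+    "invoke_passthrough": {
+        "temperature": 0.2,
+        "top_p": 0.9,
+        "max_tokens": 256,
+        "logit_bias": {"42": 5},  # not allowlisted -> silently dropped
+    }
+}
+
+# The vLLM adapter merges only the bounded fields into its invoke payload.
+payload: dict[str, object] = {"model": "demo-model", "stream": False, "messages": []}
+payload.update(extract_vllm_invoke_passthrough_options(adapter_options=adapter_options))
+
+assert payload["max_tokens"] == 256
+assert "logit_bias" not in payload
+```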
diff --git a/REVIEW_REPORT.md b/REVIEW_REPORT.md index caedd6d..3922382 100644 --- a/REVIEW_REPORT.md +++ b/REVIEW_REPORT.md @@ -4,50 +4,51 @@ PASS ## criteria met -- `P11-S2` local provider registration APIs are implemented and functioning: - - `POST /v1/providers/ollama/register` - - `POST /v1/providers/llamacpp/register` -- Existing in-scope APIs are functioning with local adapters: +- `P11-S3` acceptance criteria are met for the vLLM self-hosted path. +- vLLM registration is implemented through the shipped provider registry: + - `POST /v1/providers/vllm/register` +- Provider tests and capability snapshots expose deterministic self-hosted posture through the existing abstraction: - `POST /v1/providers/test` + - capability snapshot fields include normalized telemetry posture (`supports_normalized_usage_telemetry`, `supports_normalized_latency_telemetry`, `telemetry_flow_scope`). +- Runtime invoke works through the shipped normalized provider contract for vLLM: - `POST /v1/runtime/invoke` - - `GET /v1/providers` - - `GET /v1/providers/{provider_id}` -- Ollama and llama.cpp adapters are integrated through the shipped provider abstraction and registry. -- Capability snapshots include deterministic local model enumeration and health posture fields. -- Additive provider config fields are migrated and wired (`auth_mode`, `model_list_path`, `healthcheck_path`, `invoke_path`). -- Local setup documentation and runnable e2e example path are present. -- Regression fix validated: legacy `/v1/providers` path now correctly passes `store` into shared registration helper (`apps/api/src/alicebot_api/main.py:6174-6177`). -- Credential handling tightened: `auth_mode="none"` now rejects non-empty `api_key`, preventing plaintext persistence (`apps/api/src/alicebot_api/main.py:1562-1568`). -- New regression coverage added: - - OpenAI-compatible registration still works and stores secret ref, not plaintext (`tests/integration/test_phase11_provider_runtime_api.py:470-491`). - - `auth_mode="none"` rejects provided `api_key` (`tests/integration/test_phase11_provider_runtime_api.py:494-514`). -- Required verification commands pass on the current branch head: - - `python3 scripts/check_control_doc_truth.py` -> PASS - - `./.venv/bin/python -m pytest tests/unit tests/integration -q` -> PASS (`1118 passed in 183.14s`) - - `pnpm --dir apps/web test` -> PASS (`62 files`, `199 tests`, duration `4.82s`) +- Normalized latency and usage telemetry are persisted and exposed: + - migration adds `provider_invocation_telemetry` + - telemetry writes for `provider_test` and `runtime_invoke` + - `GET /v1/providers/{provider_id}/telemetry` +- Bounded provider-specific passthrough is implemented behind explicit vLLM adapter options (`adapter_options.invoke_passthrough` allowlist). +- Self-hosted docs and runnable examples are now internally consistent for local split endpoints (API `:8000`, vLLM provider `:8001`): + - [phase11-vllm-self-hosted.md](docs/integrations/phase11-vllm-self-hosted.md) + - [run_phase11_vllm_e2e.py](scripts/run_phase11_vllm_e2e.py) +- Existing `P11-S1` / `P11-S2` seams remain intact (verified by full unit+integration pass and existing integration coverage). ## criteria missed -- None identified for `P11-S2` acceptance criteria. +- None. ## quality issues -- No blocking quality issues remain in sprint-owned scope after fixes. +- No blocking quality issues found in sprint-owned changes after the endpoint-default fix. 
+- Out-of-scope dirty local docs remain present and should stay excluded from sprint merge scope: + - `ARCHITECTURE.md` + - `PRODUCT_BRIEF.md` + - `README.md` (pre-existing dirty context in branch) ## regression risks -- Low. Full required verification is passing, including new regression tests for the previously broken path. -- Residual operational risk remains external local-provider availability (Ollama/llama.cpp process reachability), which is surfaced via explicit discovery/test failure posture. +- Low. +- Required verification suite passes on current workspace state: + - `python3 scripts/check_control_doc_truth.py` -> PASS + - `./.venv/bin/python -m pytest tests/unit tests/integration -q` -> `1122 passed in 170.62s` + - `pnpm --dir apps/web test` -> `62 passed` files, `199 passed` tests, duration `4.86s` ## docs issues -- No local identifiers (local computer paths, names) were found in sprint-owned changed code/docs reviewed here. -- Out-of-scope dirty local docs remain and should stay excluded from sprint merge scope: - - `ARCHITECTURE.md` - - `PRODUCT_BRIEF.md` +- Fixed: vLLM self-hosted docs/script no longer default provider URL to the API URL. +- No local identifiers (local machine paths, personal names, local-only identifiers) were found in reviewed sprint-owned files. ## should anything be added to RULES.md? -- Optional improvement: require backward-compat regression tests for already-shipped endpoints whenever shared registration/runtime helpers are refactored. +- Optional: add a guardrail that runnable docs/scripts must use non-conflicting default endpoints in multi-service flows and be smoke-validated before merge. ## should anything update ARCHITECTURE.md? -- Optional improvement: add a concise note clarifying auth-mode credential invariants (`bearer` uses secret refs; `none` must not persist API keys). +- No required architecture update for `P11-S3` merge. ## recommended next action -1. Ready for Control Tower merge approval with the updated build and review evidence on this branch head. -2. Keep `ARCHITECTURE.md` and `PRODUCT_BRIEF.md` excluded from the sprint PR. +1. Proceed with sprint PR review/merge for `P11-S3`. +2. Keep non-sprint control-doc rewrites excluded from this PR unless explicitly approved as separate scope. 
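+
+## appendix: capability snapshot posture example
+
+To make the deterministic-posture criterion concrete, this is an abridged, illustrative snapshot in the shape `VllmAdapter.discover_capabilities` assembles (the model list is an example value; the baseline `supports_*` capability fields are omitted for brevity):
+
+```python
+# Abridged illustrative vLLM capability snapshot; keys mirror
+# normalized_capability_snapshot(...) plus the vLLM adapter's updates.
+example_vllm_snapshot = {
+    "capability_version": "provider_capability_v1",
+    "adapter_key": "vllm",
+    "runtime_provider": "openai_responses",
+    "supports_normalized_usage_telemetry": True,
+    "supports_normalized_latency_telemetry": True,
+    "health_status": "ok",
+    "health_endpoint": "/health",
+    "models_endpoint": "/v1/models",
+    "invoke_endpoint": "/v1/chat/completions",
+    "model_count": 1,
+    "models": ["meta-llama/Meta-Llama-3.1-8B-Instruct"],  # example enumeration
+    "telemetry_flow_scope": ["provider_test", "runtime_invoke"],
+}
+```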
diff --git a/apps/api/alembic/versions/20260411_0054_phase11_vllm_telemetry.py b/apps/api/alembic/versions/20260411_0054_phase11_vllm_telemetry.py new file mode 100644 index 0000000..0725fae --- /dev/null +++ b/apps/api/alembic/versions/20260411_0054_phase11_vllm_telemetry.py @@ -0,0 +1,90 @@ +"""Add vLLM adapter options and provider invocation telemetry.""" + +from __future__ import annotations + +from alembic import op + + +revision = "20260411_0054" +down_revision = "20260411_0053" +branch_labels = None +depends_on = None + +_UPGRADE_STATEMENTS = ( + "ALTER TABLE model_providers ADD COLUMN adapter_options jsonb NOT NULL DEFAULT '{}'::jsonb", + ( + "ALTER TABLE model_providers " + "ADD CONSTRAINT model_providers_adapter_options_object_check " + "CHECK (jsonb_typeof(adapter_options) = 'object')" + ), + """ + CREATE TABLE provider_invocation_telemetry ( + id uuid PRIMARY KEY DEFAULT gen_random_uuid(), + workspace_id uuid NOT NULL REFERENCES workspaces(id) ON DELETE CASCADE, + provider_id uuid NOT NULL REFERENCES model_providers(id) ON DELETE CASCADE, + invoked_by_user_account_id uuid NOT NULL REFERENCES user_accounts(id) ON DELETE RESTRICT, + flow_kind text NOT NULL, + adapter_key text NOT NULL, + runtime_provider text NOT NULL, + provider_model text NOT NULL, + status text NOT NULL, + error_message text NULL, + latency_ms integer NOT NULL, + input_tokens integer NULL, + output_tokens integer NULL, + total_tokens integer NULL, + metadata jsonb NOT NULL DEFAULT '{}'::jsonb, + created_at timestamptz NOT NULL DEFAULT now(), + CONSTRAINT provider_invocation_telemetry_flow_kind_check + CHECK (flow_kind IN ('provider_test', 'runtime_invoke')), + CONSTRAINT provider_invocation_telemetry_adapter_key_length_check + CHECK (char_length(adapter_key) >= 1 AND char_length(adapter_key) <= 80), + CONSTRAINT provider_invocation_telemetry_runtime_provider_length_check + CHECK (char_length(runtime_provider) >= 1 AND char_length(runtime_provider) <= 100), + CONSTRAINT provider_invocation_telemetry_provider_model_length_check + CHECK (char_length(provider_model) >= 1 AND char_length(provider_model) <= 200), + CONSTRAINT provider_invocation_telemetry_status_check + CHECK (status IN ('completed', 'failed')), + CONSTRAINT provider_invocation_telemetry_latency_non_negative_check + CHECK (latency_ms >= 0), + CONSTRAINT provider_invocation_telemetry_input_tokens_non_negative_check + CHECK (input_tokens IS NULL OR input_tokens >= 0), + CONSTRAINT provider_invocation_telemetry_output_tokens_non_negative_check + CHECK (output_tokens IS NULL OR output_tokens >= 0), + CONSTRAINT provider_invocation_telemetry_total_tokens_non_negative_check + CHECK (total_tokens IS NULL OR total_tokens >= 0) + ) + """, + ( + "CREATE INDEX provider_invocation_telemetry_provider_created_idx " + "ON provider_invocation_telemetry (provider_id, created_at DESC, id DESC)" + ), + ( + "CREATE INDEX provider_invocation_telemetry_workspace_created_idx " + "ON provider_invocation_telemetry (workspace_id, created_at DESC, id DESC)" + ), +) + +_UPGRADE_GRANT_STATEMENTS = ( + "GRANT SELECT, INSERT, UPDATE, DELETE ON provider_invocation_telemetry TO alicebot_app", +) + +_DOWNGRADE_STATEMENTS = ( + "DROP TABLE IF EXISTS provider_invocation_telemetry", + "ALTER TABLE model_providers DROP CONSTRAINT IF EXISTS model_providers_adapter_options_object_check", + "ALTER TABLE model_providers DROP COLUMN IF EXISTS adapter_options", +) + + +def _execute_statements(statements: tuple[str, ...]) -> None: + for statement in statements: + op.execute(statement) + + +def 
upgrade() -> None: + _execute_statements(_UPGRADE_STATEMENTS) + _execute_statements(_UPGRADE_GRANT_STATEMENTS) + + +def downgrade() -> None: + _execute_statements(_DOWNGRADE_STATEMENTS) diff --git a/apps/api/src/alicebot_api/contracts.py b/apps/api/src/alicebot_api/contracts.py index fb6771b..5453073 100644 --- a/apps/api/src/alicebot_api/contracts.py +++ b/apps/api/src/alicebot_api/contracts.py @@ -189,9 +189,11 @@ ToolRoutingDecision = Literal["ready", "denied", "approval_required"] PromptSectionName = Literal["system", "developer", "context", "conversation"] ModelProvider = Literal["openai_responses"] -ProviderAdapterKey = Literal["openai_compatible", "ollama", "llamacpp"] +ProviderAdapterKey = Literal["openai_compatible", "ollama", "llamacpp", "vllm"] ModelProviderStatus = Literal["active"] ProviderCapabilityDiscoveryStatus = Literal["ready", "failed"] +ProviderInvocationFlowKind = Literal["provider_test", "runtime_invoke"] +ProviderInvocationStatus = Literal["completed", "failed"] ModelFinishReason = Literal["completed", "incomplete"] ExplicitPreferencePattern = Literal[ "i_like", @@ -1553,6 +1555,7 @@ class ModelProviderRecord(TypedDict): model_list_path: str healthcheck_path: str invoke_path: str + adapter_options: JsonObject metadata: JsonObject created_at: str updated_at: str @@ -1611,6 +1614,39 @@ class RuntimeInvokeResponse(TypedDict): trace: ResponseTraceSummary +class ProviderInvocationTelemetryRecord(TypedDict): + id: str + workspace_id: str + provider_id: str + invoked_by_user_account_id: str + flow_kind: ProviderInvocationFlowKind + adapter_key: ProviderAdapterKey + runtime_provider: ModelProvider + provider_model: str + status: ProviderInvocationStatus + error_message: str | None + latency_ms: int + usage: ModelUsagePayload + metadata: JsonObject + created_at: str + + +class ProviderTelemetrySummary(TypedDict): + total_count: int + completed_count: int + failed_count: int + average_latency_ms: float + latest_created_at: str | None + usage_totals: ModelUsagePayload + + +class ProviderTelemetryResponse(TypedDict): + provider_id: str + summary: ProviderTelemetrySummary + items: list[ProviderInvocationTelemetryRecord] + order: list[str] + + @dataclass(frozen=True, slots=True) class OpenLoopCandidateInput: title: str diff --git a/apps/api/src/alicebot_api/main.py b/apps/api/src/alicebot_api/main.py index 4f08635..f7af3c1 100644 --- a/apps/api/src/alicebot_api/main.py +++ b/apps/api/src/alicebot_api/main.py @@ -603,6 +603,7 @@ class RedisError(Exception): OLLAMA_ADAPTER_KEY, OPENAI_COMPATIBLE_ADAPTER_KEY, OPENAI_RESPONSES_PROVIDER, + VLLM_ADAPTER_KEY, ProviderAdapterNotFoundError, RuntimeProviderConfig, build_provider_test_model_request, @@ -622,6 +623,7 @@ class RedisError(Exception): EventRow, ModelProviderRow, ProviderCapabilityRow, + ProviderInvocationTelemetryRow, SessionRow, ThreadRow, ) @@ -1438,6 +1440,7 @@ def _serialize_model_provider(provider: ModelProviderRow) -> dict[str, object]: "model_list_path": provider["model_list_path"], "healthcheck_path": provider["healthcheck_path"], "invoke_path": provider["invoke_path"], + "adapter_options": provider["adapter_options"], "metadata": provider["metadata"], "created_at": provider["created_at"].isoformat(), "updated_at": provider["updated_at"].isoformat(), @@ -1460,6 +1463,62 @@ def _serialize_provider_capability(capability: ProviderCapabilityRow) -> dict[st } +def _serialize_provider_invocation_telemetry( + row: ProviderInvocationTelemetryRow, +) -> dict[str, object]: + return { + "id": str(row["id"]), + "workspace_id": 
str(row["workspace_id"]), + "provider_id": str(row["provider_id"]), + "invoked_by_user_account_id": str(row["invoked_by_user_account_id"]), + "flow_kind": row["flow_kind"], + "adapter_key": row["adapter_key"], + "runtime_provider": row["runtime_provider"], + "provider_model": row["provider_model"], + "status": row["status"], + "error_message": row["error_message"], + "latency_ms": row["latency_ms"], + "usage": { + "input_tokens": row["input_tokens"], + "output_tokens": row["output_tokens"], + "total_tokens": row["total_tokens"], + }, + "metadata": row["metadata"], + "created_at": row["created_at"].isoformat(), + } + + +def _summarize_provider_telemetry( + rows: list[ProviderInvocationTelemetryRow], +) -> dict[str, object]: + total_count = len(rows) + completed_count = sum(1 for row in rows if row["status"] == "completed") + failed_count = sum(1 for row in rows if row["status"] == "failed") + total_latency = sum(row["latency_ms"] for row in rows) + average_latency_ms = (float(total_latency) / float(total_count)) if total_count > 0 else 0.0 + latest_created_at = rows[0]["created_at"].isoformat() if total_count > 0 else None + return { + "total_count": total_count, + "completed_count": completed_count, + "failed_count": failed_count, + "average_latency_ms": round(average_latency_ms, 2), + "latest_created_at": latest_created_at, + "usage_totals": { + "input_tokens": sum(row["input_tokens"] or 0 for row in rows), + "output_tokens": sum(row["output_tokens"] or 0 for row in rows), + "total_tokens": sum(row["total_tokens"] or 0 for row in rows), + }, + } + + +def _usage_token_or_none(value: object) -> int | None: + if isinstance(value, bool): + return None + if isinstance(value, int): + return value + return None + + def _runtime_provider_config_or_none( *, store: ContinuityStore, @@ -1493,6 +1552,9 @@ def _fallback_provider_capability_snapshot( model_list_path: str, healthcheck_path: str, invoke_path: str, + supports_normalized_usage_telemetry: bool = False, + supports_normalized_latency_telemetry: bool = False, + telemetry_flow_scope: list[str] | None = None, ) -> dict[str, object]: snapshot = normalized_capability_snapshot( adapter_key=adapter_key, @@ -1502,6 +1564,8 @@ def _fallback_provider_capability_snapshot( supports_store=False, supports_vision_input=False, supports_audio_input=False, + supports_normalized_usage_telemetry=supports_normalized_usage_telemetry, + supports_normalized_latency_telemetry=supports_normalized_latency_telemetry, ) snapshot.update( { @@ -1513,6 +1577,8 @@ def _fallback_provider_capability_snapshot( "models": [], } ) + if telemetry_flow_scope is not None: + snapshot["telemetry_flow_scope"] = telemetry_flow_scope return snapshot @@ -1531,6 +1597,7 @@ def _register_workspace_provider( model_list_path: str, healthcheck_path: str, invoke_path: str, + adapter_options: dict[str, object], metadata: dict[str, object], ) -> tuple[ModelProviderRow, ProviderCapabilityRow]: normalized_display_name = display_name.strip() @@ -1589,6 +1656,7 @@ def _register_workspace_provider( model_list_path=normalized_model_list_path, healthcheck_path=normalized_healthcheck_path, invoke_path=normalized_invoke_path, + adapter_options=adapter_options, ) runtime_provider = resolve_runtime_provider_config_secrets( @@ -1611,6 +1679,11 @@ def _register_workspace_provider( model_list_path=normalized_model_list_path, healthcheck_path=normalized_healthcheck_path, invoke_path=normalized_invoke_path, + supports_normalized_usage_telemetry=adapter.adapter_key == VLLM_ADAPTER_KEY, + 
supports_normalized_latency_telemetry=adapter.adapter_key == VLLM_ADAPTER_KEY, + telemetry_flow_scope=["provider_test", "runtime_invoke"] + if adapter.adapter_key == VLLM_ADAPTER_KEY + else None, ) discovery_status = "failed" discovery_error = str(exc) @@ -2079,6 +2152,57 @@ class RegisterLlamaCppProviderRequest(BaseModel): metadata: dict[str, object] = Field(default_factory=dict) +class VllmInvokePassthroughOptions(BaseModel): + model_config = ConfigDict(extra="forbid") + + temperature: float | None = Field(default=None, ge=0.0, le=2.0) + top_p: float | None = Field(default=None, ge=0.0, le=1.0) + max_tokens: int | None = Field(default=None, ge=1, le=32768) + frequency_penalty: float | None = Field(default=None, ge=-2.0, le=2.0) + presence_penalty: float | None = Field(default=None, ge=-2.0, le=2.0) + n: int | None = Field(default=None, ge=1, le=8) + seed: int | None = Field(default=None, ge=0) + stop: str | list[str] | None = None + + @model_validator(mode="after") + def _validate_stop(self) -> "VllmInvokePassthroughOptions": + if self.stop is None: + return self + if isinstance(self.stop, str): + if self.stop.strip() == "": + raise ValueError("adapter_options.invoke_passthrough.stop must not be empty") + return self + if len(self.stop) == 0 or len(self.stop) > 8: + raise ValueError("adapter_options.invoke_passthrough.stop list length must be between 1 and 8") + for item in self.stop: + if item.strip() == "": + raise ValueError("adapter_options.invoke_passthrough.stop list items must not be empty") + if len(item) > 200: + raise ValueError("adapter_options.invoke_passthrough.stop list items must be 200 chars or shorter") + return self + + +class VllmAdapterOptions(BaseModel): + model_config = ConfigDict(extra="forbid") + + invoke_passthrough: VllmInvokePassthroughOptions = Field(default_factory=VllmInvokePassthroughOptions) + + +class RegisterVllmProviderRequest(BaseModel): + model_config = ConfigDict(extra="forbid") + + display_name: str = Field(min_length=1, max_length=120) + base_url: str = Field(default="http://127.0.0.1:8000", min_length=1, max_length=500) + api_key: str | None = Field(default=None, max_length=8000) + auth_mode: Literal["bearer", "none"] = "none" + default_model: str = Field(min_length=1, max_length=200) + model_list_path: str = Field(default="/v1/models", min_length=1, max_length=200) + healthcheck_path: str = Field(default="/health", min_length=1, max_length=200) + invoke_path: str = Field(default="/v1/chat/completions", min_length=1, max_length=200) + adapter_options: VllmAdapterOptions = Field(default_factory=VllmAdapterOptions) + metadata: dict[str, object] = Field(default_factory=dict) + + class TestProviderRequest(BaseModel): model_config = ConfigDict(extra="forbid") @@ -6185,6 +6309,7 @@ def register_v1_provider(request: Request, body: RegisterProviderRequest) -> JSO model_list_path=body.model_list_path, healthcheck_path=body.healthcheck_path, invoke_path=body.invoke_path, + adapter_options={}, metadata=body.metadata, ) except (AuthSessionInvalidError, AuthSessionExpiredError, AuthSessionRevokedDeviceError) as exc: @@ -6247,6 +6372,7 @@ def register_v1_ollama_provider( model_list_path=body.model_list_path, healthcheck_path=body.healthcheck_path, invoke_path=body.invoke_path, + adapter_options={}, metadata=body.metadata, ) except (AuthSessionInvalidError, AuthSessionExpiredError, AuthSessionRevokedDeviceError) as exc: @@ -6309,6 +6435,70 @@ def register_v1_llamacpp_provider( model_list_path=body.model_list_path, healthcheck_path=body.healthcheck_path, 
invoke_path=body.invoke_path, + adapter_options={}, + metadata=body.metadata, + ) + except (AuthSessionInvalidError, AuthSessionExpiredError, AuthSessionRevokedDeviceError) as exc: + return JSONResponse(status_code=401, content={"detail": str(exc)}) + except psycopg.errors.UniqueViolation: + return JSONResponse( + status_code=409, + content={"detail": "provider display_name must be unique within the workspace"}, + ) + except ProviderAdapterNotFoundError as exc: + return JSONResponse(status_code=422, content={"detail": str(exc)}) + except ProviderSecretManagerError as exc: + return JSONResponse(status_code=500, content={"detail": str(exc)}) + except ValueError as exc: + return JSONResponse(status_code=400, content={"detail": str(exc)}) + + return JSONResponse( + status_code=201, + content=jsonable_encoder( + { + "provider": _serialize_model_provider(provider), + "capabilities": _serialize_provider_capability(capability), + } + ), + ) + + +@app.post("/v1/providers/vllm/register") +def register_v1_vllm_provider( + request: Request, + body: RegisterVllmProviderRequest, +) -> JSONResponse: + settings = get_settings() + + try: + session_token = _extract_bearer_token(request) + with psycopg.connect(settings.database_url, row_factory=dict_row) as conn: + with conn.transaction(): + resolution = resolve_auth_session(conn, session_token=session_token) + workspace = get_current_workspace( + conn, + user_account_id=resolution["user_account"]["id"], + preferred_workspace_id=resolution["session"]["workspace_id"], + ) + if workspace is None: + return JSONResponse(status_code=404, content={"detail": "no workspace is currently selected"}) + + store = ContinuityStore(conn) + provider, capability = _register_workspace_provider( + settings=settings, + store=store, + workspace_id=workspace["id"], + created_by_user_account_id=resolution["user_account"]["id"], + provider_key=VLLM_ADAPTER_KEY, + display_name=body.display_name, + base_url=body.base_url, + api_key=body.api_key or "", + auth_mode=body.auth_mode, + default_model=body.default_model, + model_list_path=body.model_list_path, + healthcheck_path=body.healthcheck_path, + invoke_path=body.invoke_path, + adapter_options=body.adapter_options.model_dump(exclude_none=True), metadata=body.metadata, ) except (AuthSessionInvalidError, AuthSessionExpiredError, AuthSessionRevokedDeviceError) as exc: @@ -6417,6 +6607,55 @@ def get_v1_provider(provider_id: UUID, request: Request) -> JSONResponse: ) +@app.get("/v1/providers/{provider_id}/telemetry") +def get_v1_provider_telemetry( + provider_id: UUID, + request: Request, + limit: int = Query(default=50, ge=1, le=200), +) -> JSONResponse: + settings = get_settings() + + try: + session_token = _extract_bearer_token(request) + with psycopg.connect(settings.database_url, row_factory=dict_row) as conn: + with conn.transaction(): + resolution = resolve_auth_session(conn, session_token=session_token) + workspace = get_current_workspace( + conn, + user_account_id=resolution["user_account"]["id"], + preferred_workspace_id=resolution["session"]["workspace_id"], + ) + if workspace is None: + return JSONResponse(status_code=404, content={"detail": "no workspace is currently selected"}) + + store = ContinuityStore(conn) + provider = store.get_model_provider_for_workspace_optional( + provider_id=provider_id, + workspace_id=workspace["id"], + ) + if provider is None: + return JSONResponse(status_code=404, content={"detail": f"provider {provider_id} was not found"}) + telemetry_rows = 
store.list_provider_invocation_telemetry_for_provider( + provider_id=provider_id, + workspace_id=workspace["id"], + limit=limit, + ) + except (AuthSessionInvalidError, AuthSessionExpiredError, AuthSessionRevokedDeviceError) as exc: + return JSONResponse(status_code=401, content={"detail": str(exc)}) + + return JSONResponse( + status_code=200, + content=jsonable_encoder( + { + "provider_id": str(provider_id), + "summary": _summarize_provider_telemetry(telemetry_rows), + "items": [_serialize_provider_invocation_telemetry(row) for row in telemetry_rows], + "order": ["created_at_desc", "id_desc"], + } + ), + ) + + @app.post("/v1/providers/test") def test_v1_provider(request: Request, body: TestProviderRequest) -> JSONResponse: settings = get_settings() @@ -6454,27 +6693,54 @@ def test_v1_provider(request: Request, body: TestProviderRequest) -> JSONRespons if model_name == "": raise ValueError("model is required") + discovery_started_at = time.perf_counter() try: capability_snapshot = adapter.discover_capabilities( config=runtime_provider, settings=settings, ) except ModelInvocationError as exc: + discovery_latency_ms = max( + 0, + int((time.perf_counter() - discovery_started_at) * 1000), + ) + fallback_snapshot = _fallback_provider_capability_snapshot( + adapter_key=adapter.adapter_key, + runtime_provider=adapter.runtime_provider, + model_list_path=runtime_provider.model_list_path, + healthcheck_path=runtime_provider.healthcheck_path, + invoke_path=runtime_provider.invoke_path, + supports_normalized_usage_telemetry=adapter.adapter_key == VLLM_ADAPTER_KEY, + supports_normalized_latency_telemetry=adapter.adapter_key == VLLM_ADAPTER_KEY, + telemetry_flow_scope=["provider_test", "runtime_invoke"] + if adapter.adapter_key == VLLM_ADAPTER_KEY + else None, + ) capability = store.upsert_provider_capability( workspace_id=workspace["id"], provider_id=runtime_provider.provider_id, discovered_by_user_account_id=resolution["user_account"]["id"], adapter_key=adapter.adapter_key, discovery_status="failed", - capability_snapshot=_fallback_provider_capability_snapshot( - adapter_key=adapter.adapter_key, - runtime_provider=adapter.runtime_provider, - model_list_path=runtime_provider.model_list_path, - healthcheck_path=runtime_provider.healthcheck_path, - invoke_path=runtime_provider.invoke_path, - ), + capability_snapshot=fallback_snapshot, discovery_error=str(exc), ) + store.create_provider_invocation_telemetry( + workspace_id=workspace["id"], + provider_id=runtime_provider.provider_id, + invoked_by_user_account_id=resolution["user_account"]["id"], + flow_kind="provider_test", + adapter_key=adapter.adapter_key, + runtime_provider=adapter.runtime_provider, + provider_model=model_name, + status="failed", + error_message=str(exc), + latency_ms=discovery_latency_ms, + input_tokens=None, + output_tokens=None, + total_tokens=None, + metadata={"stage": "capability_discovery"}, + ) return JSONResponse( status_code=502, content=jsonable_encoder( @@ -6491,6 +6757,7 @@ def test_v1_provider(request: Request, body: TestProviderRequest) -> JSONRespons prompt_text=body.prompt.strip(), ) + invoke_started_at = time.perf_counter() try: model_response = adapter.invoke( config=runtime_provider, @@ -6498,6 +6765,7 @@ def test_v1_provider(request: Request, body: TestProviderRequest) -> JSONRespons request=model_request, ) except ModelInvocationError as exc: + invoke_latency_ms = max(0, int((time.perf_counter() - invoke_started_at) * 1000)) capability = store.upsert_provider_capability( workspace_id=workspace["id"], 
provider_id=runtime_provider.provider_id, @@ -6507,6 +6775,22 @@ def test_v1_provider(request: Request, body: TestProviderRequest) -> JSONRespons capability_snapshot=capability_snapshot, discovery_error=str(exc), ) + store.create_provider_invocation_telemetry( + workspace_id=workspace["id"], + provider_id=runtime_provider.provider_id, + invoked_by_user_account_id=resolution["user_account"]["id"], + flow_kind="provider_test", + adapter_key=adapter.adapter_key, + runtime_provider=adapter.runtime_provider, + provider_model=model_name, + status="failed", + error_message=str(exc), + latency_ms=invoke_latency_ms, + input_tokens=None, + output_tokens=None, + total_tokens=None, + metadata={"stage": "invoke"}, + ) return JSONResponse( status_code=502, content=jsonable_encoder( @@ -6527,6 +6811,23 @@ def test_v1_provider(request: Request, body: TestProviderRequest) -> JSONRespons capability_snapshot=capability_snapshot, discovery_error=None, ) + invoke_latency_ms = max(0, int((time.perf_counter() - invoke_started_at) * 1000)) + store.create_provider_invocation_telemetry( + workspace_id=workspace["id"], + provider_id=runtime_provider.provider_id, + invoked_by_user_account_id=resolution["user_account"]["id"], + flow_kind="provider_test", + adapter_key=adapter.adapter_key, + runtime_provider=adapter.runtime_provider, + provider_model=model_name, + status="completed", + error_message=None, + latency_ms=invoke_latency_ms, + input_tokens=_usage_token_or_none(model_response.usage.get("input_tokens")), + output_tokens=_usage_token_or_none(model_response.usage.get("output_tokens")), + total_tokens=_usage_token_or_none(model_response.usage.get("total_tokens")), + metadata={"stage": "invoke"}, + ) except (AuthSessionInvalidError, AuthSessionExpiredError, AuthSessionRevokedDeviceError) as exc: return JSONResponse(status_code=401, content={"detail": str(exc)}) except ProviderAdapterNotFoundError as exc: @@ -6612,9 +6913,31 @@ def invoke_v1_runtime(request: Request, body: RuntimeInvokeRequest) -> JSONRespo except ProviderAdapterNotFoundError as exc: return JSONResponse(status_code=422, content={"detail": str(exc)}) + invoke_latency_ms: int | None = None + invoke_usage_payload: dict[str, object] = { + "input_tokens": None, + "output_tokens": None, + "total_tokens": None, + } + try: with user_connection(settings.database_url, user_account_id) as conn: store = ContinuityStore(conn) + + def _timed_model_invoker(model_request): + nonlocal invoke_latency_ms, invoke_usage_payload + invoke_started_at = time.perf_counter() + try: + response = adapter.invoke( + config=runtime_provider, + settings=settings, + request=model_request, + ) + finally: + invoke_latency_ms = max(0, int((time.perf_counter() - invoke_started_at) * 1000)) + invoke_usage_payload = response.usage + return response + result = generate_response( store=store, settings=settings, @@ -6629,13 +6952,30 @@ def invoke_v1_runtime(request: Request, body: RuntimeInvokeRequest) -> JSONRespo max_entity_edges=body.max_entity_edges, ), runtime_override=(runtime_provider.model_provider, selected_model), - model_invoker=lambda model_request: adapter.invoke( - config=runtime_provider, - settings=settings, - request=model_request, - ), + model_invoker=_timed_model_invoker, ) if isinstance(result, ResponseFailure): + if invoke_latency_ms is not None: + store.create_provider_invocation_telemetry( + workspace_id=workspace_id, + provider_id=runtime_provider.provider_id, + invoked_by_user_account_id=user_account_id, + flow_kind="runtime_invoke", + 
adapter_key=adapter.adapter_key, + runtime_provider=adapter.runtime_provider, + provider_model=selected_model, + status="failed", + error_message=result.detail, + latency_ms=invoke_latency_ms, + input_tokens=_usage_token_or_none(invoke_usage_payload.get("input_tokens")), + output_tokens=_usage_token_or_none(invoke_usage_payload.get("output_tokens")), + total_tokens=_usage_token_or_none(invoke_usage_payload.get("total_tokens")), + metadata={ + "thread_id": str(body.thread_id), + "response_trace_id": result["trace"]["response_trace_id"], + "compile_trace_id": result["trace"]["compile_trace_id"], + }, + ) return JSONResponse( status_code=502, content=jsonable_encoder( @@ -6674,6 +7014,28 @@ def invoke_v1_runtime(request: Request, body: RuntimeInvokeRequest) -> JSONRespo if isinstance(model_payload, dict) and isinstance(model_payload.get("finish_reason"), str) else "incomplete" ) + if invoke_latency_ms is not None: + store.create_provider_invocation_telemetry( + workspace_id=workspace_id, + provider_id=runtime_provider.provider_id, + invoked_by_user_account_id=user_account_id, + flow_kind="runtime_invoke", + adapter_key=adapter.adapter_key, + runtime_provider=adapter.runtime_provider, + provider_model=selected_model, + status="completed", + error_message=None, + latency_ms=invoke_latency_ms, + input_tokens=_usage_token_or_none(usage_payload.get("input_tokens")), + output_tokens=_usage_token_or_none(usage_payload.get("output_tokens")), + total_tokens=_usage_token_or_none(usage_payload.get("total_tokens")), + metadata={ + "thread_id": str(body.thread_id), + "assistant_event_id": str(assistant_event_id), + "response_trace_id": result["trace"]["response_trace_id"], + "compile_trace_id": result["trace"]["compile_trace_id"], + }, + ) except ContinuityStoreInvariantError as exc: return JSONResponse(status_code=404, content={"detail": str(exc)}) diff --git a/apps/api/src/alicebot_api/provider_runtime.py b/apps/api/src/alicebot_api/provider_runtime.py index 441be16..a5a1d2c 100644 --- a/apps/api/src/alicebot_api/provider_runtime.py +++ b/apps/api/src/alicebot_api/provider_runtime.py @@ -28,10 +28,12 @@ ) from alicebot_api.provider_secrets import resolve_provider_api_key from alicebot_api.store import JsonObject +from alicebot_api.vllm_provider_helpers import extract_vllm_invoke_passthrough_options OPENAI_COMPATIBLE_ADAPTER_KEY = "openai_compatible" OLLAMA_ADAPTER_KEY = "ollama" LLAMACPP_ADAPTER_KEY = "llamacpp" +VLLM_ADAPTER_KEY = "vllm" OPENAI_RESPONSES_PROVIDER = "openai_responses" PROVIDER_CAPABILITY_VERSION_V1 = "provider_capability_v1" @@ -55,12 +57,15 @@ class ProviderCapabilitySnapshot(TypedDict): supports_store: bool supports_vision_input: bool supports_audio_input: bool + supports_normalized_usage_telemetry: bool + supports_normalized_latency_telemetry: bool health_status: NotRequired[str] health_endpoint: NotRequired[str] models_endpoint: NotRequired[str] invoke_endpoint: NotRequired[str] model_count: NotRequired[int] models: NotRequired[list[str]] + telemetry_flow_scope: NotRequired[list[str]] @dataclass(frozen=True, slots=True) @@ -79,6 +84,7 @@ class RuntimeProviderConfig: model_list_path: str healthcheck_path: str invoke_path: str + adapter_options: JsonObject metadata: JsonObject @classmethod @@ -98,6 +104,7 @@ def from_row(cls, row: dict[str, object]) -> RuntimeProviderConfig: model_list_path=str(row.get("model_list_path", "")), healthcheck_path=str(row.get("healthcheck_path", "")), invoke_path=str(row.get("invoke_path", "")), + adapter_options=row.get("adapter_options", {}), # type: 
ignore[assignment] metadata=row["metadata"], # type: ignore[assignment] ) @@ -154,6 +161,8 @@ def normalized_capability_snapshot( supports_store: bool, supports_vision_input: bool, supports_audio_input: bool, + supports_normalized_usage_telemetry: bool = False, + supports_normalized_latency_telemetry: bool = False, ) -> ProviderCapabilitySnapshot: return { "capability_version": PROVIDER_CAPABILITY_VERSION_V1, @@ -166,6 +175,8 @@ def normalized_capability_snapshot( "supports_store": supports_store, "supports_vision_input": supports_vision_input, "supports_audio_input": supports_audio_input, + "supports_normalized_usage_telemetry": supports_normalized_usage_telemetry, + "supports_normalized_latency_telemetry": supports_normalized_latency_telemetry, } @@ -364,11 +375,102 @@ def invoke( return parse_llamacpp_invoke_response(request=request, payload=payload) +class VllmAdapter: + adapter_key = VLLM_ADAPTER_KEY + runtime_provider = OPENAI_RESPONSES_PROVIDER + default_healthcheck_path = "/health" + default_model_list_path = "/v1/models" + default_invoke_path = "/v1/chat/completions" + + def discover_capabilities( + self, + *, + config: RuntimeProviderConfig, + settings: Settings, + ) -> ProviderCapabilitySnapshot: + headers = build_auth_headers(auth_mode=config.auth_mode, api_key=config.api_key) + healthcheck_path = config.healthcheck_path or self.default_healthcheck_path + model_list_path = config.model_list_path or self.default_model_list_path + request_json( + method="GET", + base_url=config.base_url, + path=healthcheck_path, + timeout_seconds=settings.healthcheck_timeout_seconds, + headers=headers, + ) + model_payload = request_json( + method="GET", + base_url=config.base_url, + path=model_list_path, + timeout_seconds=settings.healthcheck_timeout_seconds, + headers=headers, + ) + models = parse_llamacpp_models(model_payload) + snapshot = normalized_capability_snapshot( + adapter_key=self.adapter_key, + runtime_provider=self.runtime_provider, + supports_tool_calls=False, + supports_streaming=False, + supports_store=False, + supports_vision_input=False, + supports_audio_input=False, + supports_normalized_usage_telemetry=True, + supports_normalized_latency_telemetry=True, + ) + snapshot.update( + { + "health_status": "ok", + "health_endpoint": healthcheck_path, + "models_endpoint": model_list_path, + "invoke_endpoint": config.invoke_path or self.default_invoke_path, + "model_count": len(models), + "models": models, + "telemetry_flow_scope": ["provider_test", "runtime_invoke"], + } + ) + return snapshot + + def invoke( + self, + *, + config: RuntimeProviderConfig, + settings: Settings, + request: ModelInvocationRequest, + ) -> ModelInvocationResponse: + if request.provider != self.runtime_provider: + raise ModelInvocationError(f"unsupported model provider: {request.provider}") + + headers = build_auth_headers(auth_mode=config.auth_mode, api_key=config.api_key) + payload: dict[str, object] = { + "model": request.model, + "stream": False, + "messages": prompt_sections_to_messages(request), + } + payload.update( + extract_vllm_invoke_passthrough_options( + adapter_options=config.adapter_options, + ) + ) + response_payload = request_json( + method="POST", + base_url=config.base_url, + path=config.invoke_path or self.default_invoke_path, + timeout_seconds=settings.model_timeout_seconds, + headers=headers, + payload=payload, + ) + return parse_llamacpp_invoke_response( + request=request, + payload=response_payload, + ) + + def make_provider_adapter_registry() -> ProviderAdapterRegistry: registry = 
ProviderAdapterRegistry() registry.register(OpenAICompatibleAdapter()) registry.register(OllamaAdapter()) registry.register(LlamaCppAdapter()) + registry.register(VllmAdapter()) return registry diff --git a/apps/api/src/alicebot_api/store.py b/apps/api/src/alicebot_api/store.py index 597d33a..01094e1 100644 --- a/apps/api/src/alicebot_api/store.py +++ b/apps/api/src/alicebot_api/store.py @@ -354,6 +354,7 @@ class ModelProviderRow(TypedDict): model_list_path: str healthcheck_path: str invoke_path: str + adapter_options: JsonObject metadata: JsonObject created_at: datetime updated_at: datetime @@ -373,6 +374,25 @@ class ProviderCapabilityRow(TypedDict): updated_at: datetime +class ProviderInvocationTelemetryRow(TypedDict): + id: UUID + workspace_id: UUID + provider_id: UUID + invoked_by_user_account_id: UUID + flow_kind: str + adapter_key: str + runtime_provider: str + provider_model: str + status: str + error_message: str | None + latency_ms: int + input_tokens: int | None + output_tokens: int | None + total_tokens: int | None + metadata: JsonObject + created_at: datetime + + class MemoryEmbeddingRow(TypedDict): id: UUID user_id: UUID @@ -2066,6 +2086,7 @@ class LabelCountRow(TypedDict): model_list_path, healthcheck_path, invoke_path, + adapter_options, metadata, created_at, updated_at @@ -2085,6 +2106,7 @@ class LabelCountRow(TypedDict): %s, %s, %s, + %s, clock_timestamp(), clock_timestamp() ) @@ -2103,6 +2125,7 @@ class LabelCountRow(TypedDict): model_list_path, healthcheck_path, invoke_path, + adapter_options, metadata, created_at, updated_at @@ -2124,6 +2147,7 @@ class LabelCountRow(TypedDict): model_list_path, healthcheck_path, invoke_path, + adapter_options, metadata, created_at, updated_at @@ -2148,6 +2172,7 @@ class LabelCountRow(TypedDict): model_list_path, healthcheck_path, invoke_path, + adapter_options, metadata, created_at, updated_at @@ -2211,6 +2236,85 @@ class LabelCountRow(TypedDict): AND workspace_id = %s """ +INSERT_PROVIDER_INVOCATION_TELEMETRY_SQL = """ + INSERT INTO provider_invocation_telemetry ( + workspace_id, + provider_id, + invoked_by_user_account_id, + flow_kind, + adapter_key, + runtime_provider, + provider_model, + status, + error_message, + latency_ms, + input_tokens, + output_tokens, + total_tokens, + metadata, + created_at + ) + VALUES ( + %s, + %s, + %s, + %s, + %s, + %s, + %s, + %s, + %s, + %s, + %s, + %s, + %s, + %s, + clock_timestamp() + ) + RETURNING + id, + workspace_id, + provider_id, + invoked_by_user_account_id, + flow_kind, + adapter_key, + runtime_provider, + provider_model, + status, + error_message, + latency_ms, + input_tokens, + output_tokens, + total_tokens, + metadata, + created_at + """ + +LIST_PROVIDER_INVOCATION_TELEMETRY_FOR_PROVIDER_SQL = """ + SELECT + id, + workspace_id, + provider_id, + invoked_by_user_account_id, + flow_kind, + adapter_key, + runtime_provider, + provider_model, + status, + error_message, + latency_ms, + input_tokens, + output_tokens, + total_tokens, + metadata, + created_at + FROM provider_invocation_telemetry + WHERE provider_id = %s + AND workspace_id = %s + ORDER BY created_at DESC, id DESC + LIMIT %s + """ + INSERT_EMBEDDING_CONFIG_SQL = """ INSERT INTO embedding_configs ( user_id, @@ -5999,7 +6103,10 @@ def create_model_provider( model_list_path: str = "", healthcheck_path: str = "", invoke_path: str = "", + adapter_options: JsonObject | None = None, ) -> ModelProviderRow: + if adapter_options is None: + adapter_options = {} return self._fetch_one( "create_model_provider", INSERT_MODEL_PROVIDER_SQL, @@ -6017,6 
+6124,7 @@ def create_model_provider( model_list_path, healthcheck_path, invoke_path, + Jsonb(adapter_options), Jsonb(metadata), ), ) @@ -6074,6 +6182,61 @@ def get_provider_capability_for_provider_optional( (provider_id, workspace_id), ) + def create_provider_invocation_telemetry( + self, + *, + workspace_id: UUID, + provider_id: UUID, + invoked_by_user_account_id: UUID, + flow_kind: str, + adapter_key: str, + runtime_provider: str, + provider_model: str, + status: str, + error_message: str | None, + latency_ms: int, + input_tokens: int | None, + output_tokens: int | None, + total_tokens: int | None, + metadata: JsonObject, + ) -> ProviderInvocationTelemetryRow: + return self._fetch_one( + "create_provider_invocation_telemetry", + INSERT_PROVIDER_INVOCATION_TELEMETRY_SQL, + ( + workspace_id, + provider_id, + invoked_by_user_account_id, + flow_kind, + adapter_key, + runtime_provider, + provider_model, + status, + error_message, + latency_ms, + input_tokens, + output_tokens, + total_tokens, + Jsonb(metadata), + ), + ) + + def list_provider_invocation_telemetry_for_provider( + self, + *, + provider_id: UUID, + workspace_id: UUID, + limit: int, + ) -> list[ProviderInvocationTelemetryRow]: + return self._fetch_all( + LIST_PROVIDER_INVOCATION_TELEMETRY_FOR_PROVIDER_SQL, + ( + provider_id, + workspace_id, + limit, + ), + ) + def create_embedding_config( self, *, diff --git a/apps/api/src/alicebot_api/vllm_provider_helpers.py b/apps/api/src/alicebot_api/vllm_provider_helpers.py new file mode 100644 index 0000000..3ec63b2 --- /dev/null +++ b/apps/api/src/alicebot_api/vllm_provider_helpers.py @@ -0,0 +1,64 @@ +from __future__ import annotations + +from typing import Any + +from alicebot_api.store import JsonObject + + +_VLLM_NUMERIC_PASSTHROUGH_FIELDS = { + "temperature", + "top_p", + "frequency_penalty", + "presence_penalty", +} +_VLLM_INT_PASSTHROUGH_FIELDS = { + "max_tokens", + "n", + "seed", +} + + +def extract_vllm_invoke_passthrough_options(*, adapter_options: JsonObject) -> dict[str, Any]: + """Return bounded vLLM invoke passthrough options from persisted adapter options.""" + + passthrough_payload = adapter_options.get("invoke_passthrough") + if not isinstance(passthrough_payload, dict): + return {} + + bounded: dict[str, Any] = {} + + for key in _VLLM_NUMERIC_PASSTHROUGH_FIELDS: + coerced = _coerce_float(passthrough_payload.get(key)) + if coerced is not None: + bounded[key] = coerced + + for key in _VLLM_INT_PASSTHROUGH_FIELDS: + coerced = _coerce_int(passthrough_payload.get(key)) + if coerced is not None: + bounded[key] = coerced + + stop_payload = passthrough_payload.get("stop") + if isinstance(stop_payload, str) and stop_payload.strip() != "": + bounded["stop"] = stop_payload + elif isinstance(stop_payload, list): + stops = [item for item in stop_payload if isinstance(item, str) and item.strip() != ""] + if len(stops) > 0: + bounded["stop"] = stops + + return bounded + + +def _coerce_float(value: object) -> float | None: + if isinstance(value, bool): + return None + if isinstance(value, (int, float)): + return float(value) + return None + + +def _coerce_int(value: object) -> int | None: + if isinstance(value, bool): + return None + if isinstance(value, int): + return value + return None diff --git a/docs/integrations/phase11-vllm-self-hosted.md b/docs/integrations/phase11-vllm-self-hosted.md new file mode 100644 index 0000000..1e50d68 --- /dev/null +++ b/docs/integrations/phase11-vllm-self-hosted.md @@ -0,0 +1,101 @@ +# Phase 11 vLLM Self-Hosted Path (P11-S3) + +This guide covers the 
sprint-owned vLLM self-hosted provider path: + +- `POST /v1/providers/vllm/register` +- `POST /v1/providers/test` +- `POST /v1/runtime/invoke` +- `GET /v1/providers` +- `GET /v1/providers/{provider_id}` +- `GET /v1/providers/{provider_id}/telemetry` + +Scope note: this page documents vLLM only. + +## Prerequisites + +1. Start Alice API and data services. +2. Authenticate and obtain a hosted session bearer token. +3. Have a thread ID available for runtime invoke. +4. Run a self-hosted vLLM server exposing OpenAI-compatible endpoints. + - Example split: Alice API at `http://127.0.0.1:8000`, vLLM provider at `http://127.0.0.1:8001`. + +## Register vLLM Provider + +```bash +curl -sS -X POST "http://127.0.0.1:8000/v1/providers/vllm/register" \ + -H "Authorization: Bearer $SESSION_TOKEN" \ + -H "Content-Type: application/json" \ + -d '{ + "display_name": "vLLM Self-Hosted", + "base_url": "http://127.0.0.1:8001", + "default_model": "meta-llama/Meta-Llama-3.1-8B-Instruct", + "adapter_options": { + "invoke_passthrough": { + "temperature": 0.2, + "top_p": 0.9, + "max_tokens": 256, + "stop": ["###"] + } + } + }' +``` + +`adapter_options.invoke_passthrough` is bounded to explicit allowlisted fields: + +- `temperature` +- `top_p` +- `max_tokens` +- `frequency_penalty` +- `presence_penalty` +- `n` +- `seed` +- `stop` + +## Test Provider Connectivity + +```bash +curl -sS -X POST "http://127.0.0.1:8000/v1/providers/test" \ + -H "Authorization: Bearer $SESSION_TOKEN" \ + -H "Content-Type: application/json" \ + -d '{ + "provider_id": "'$PROVIDER_ID'", + "prompt": "Reply with one sentence confirming self-hosted connectivity." + }' +``` + +## Invoke Through Normalized Runtime Seam + +```bash +curl -sS -X POST "http://127.0.0.1:8000/v1/runtime/invoke" \ + -H "Authorization: Bearer $SESSION_TOKEN" \ + -H "Content-Type: application/json" \ + -d '{ + "provider_id": "'$PROVIDER_ID'", + "thread_id": "'$THREAD_ID'", + "message": "Summarize current runtime status in one sentence." + }' +``` + +## View Provider Telemetry + +```bash +curl -sS "http://127.0.0.1:8000/v1/providers/$PROVIDER_ID/telemetry?limit=20" \ + -H "Authorization: Bearer $SESSION_TOKEN" +``` + +Telemetry includes normalized latency and usage evidence for `provider_test` and `runtime_invoke` flows. 
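+
+An illustrative response shape (placeholder values; per-item fields such as `id`, `workspace_id`, `invoked_by_user_account_id`, `metadata`, and `created_at` are omitted here for brevity):
+
+```json
+{
+  "provider_id": "00000000-0000-0000-0000-000000000000",
+  "summary": {
+    "total_count": 2,
+    "completed_count": 2,
+    "failed_count": 0,
+    "average_latency_ms": 412.5,
+    "latest_created_at": "2026-04-11T12:00:00+00:00",
+    "usage_totals": {"input_tokens": 38, "output_tokens": 12, "total_tokens": 50}
+  },
+  "items": [
+    {
+      "flow_kind": "runtime_invoke",
+      "adapter_key": "vllm",
+      "runtime_provider": "openai_responses",
+      "provider_model": "meta-llama/Meta-Llama-3.1-8B-Instruct",
+      "status": "completed",
+      "error_message": null,
+      "latency_ms": 402,
+      "usage": {"input_tokens": 19, "output_tokens": 6, "total_tokens": 25}
+    }
+  ],
+  "order": ["created_at_desc", "id_desc"]
+}
+```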
+ +## Runnable End-to-End Script + +Use the sprint helper script for a full register/test/invoke/telemetry flow: + +```bash +./scripts/run_phase11_vllm_e2e.py \ + --session-token "$SESSION_TOKEN" \ + --thread-id "$THREAD_ID" \ + --provider-base-url "http://127.0.0.1:8001" \ + --model "meta-llama/Meta-Llama-3.1-8B-Instruct" \ + --temperature 0.2 \ + --top-p 0.9 \ + --max-tokens 256 +``` diff --git a/scripts/check_control_doc_truth.py b/scripts/check_control_doc_truth.py index 21481ec..4a956c9 100644 --- a/scripts/check_control_doc_truth.py +++ b/scripts/check_control_doc_truth.py @@ -19,7 +19,7 @@ class ControlDocTruthRule: relative_path="README.md", required_markers=( "Phase 10 is complete and shipped.", - "`P11-S2` Ollama + llama.cpp Adapters is the active sprint", + "`P11-S3` vLLM Adapter + Self-Hosted Performance Path is the active sprint", "Historical planning and control docs: [docs/archive/planning/2026-04-08-context-compaction/README.md]", ), ), @@ -27,14 +27,14 @@ class ControlDocTruthRule: relative_path="ROADMAP.md", required_markers=( "Phase 10 is complete and shipped baseline truth.", - "P11-S2: Ollama + llama.cpp Adapters", + "P11-S3: vLLM Adapter + Self-Hosted Performance Path", ), ), ControlDocTruthRule( relative_path=".ai/active/SPRINT_PACKET.md", required_markers=( - "Phase 11 Sprint 2 (P11-S2): Ollama + llama.cpp Adapters", - "Phase 10 and `P11-S1` shipped scope remain baseline truth and are not reopened as sprint work", + "Phase 11 Sprint 3 (P11-S3): vLLM Adapter + Self-Hosted Performance Path", + "Phase 10, `P11-S1`, and `P11-S2` shipped scope remain baseline truth and are not reopened as sprint work", ), ), ControlDocTruthRule( @@ -49,7 +49,7 @@ class ControlDocTruthRule: required_markers=( "Phase 9 is complete and shipped.", "Phase 10 is complete and shipped.", - "`P11-S2` (Ollama + llama.cpp Adapters) is the active execution sprint packet.", + "`P11-S3` (vLLM Adapter + Self-Hosted Performance Path) is the active execution sprint packet.", ), ), ControlDocTruthRule( diff --git a/scripts/run_phase11_vllm_e2e.py b/scripts/run_phase11_vllm_e2e.py new file mode 100755 index 0000000..dbd912c --- /dev/null +++ b/scripts/run_phase11_vllm_e2e.py @@ -0,0 +1,200 @@ +#!/usr/bin/env python3 +"""Run a vLLM self-hosted e2e flow for P11-S3. 
+ +Flow: +1) Register vLLM provider +2) Run provider test +3) Run runtime invoke +4) Fetch provider telemetry +""" + +from __future__ import annotations + +import argparse +import json +from typing import Any +from urllib.error import HTTPError, URLError +from urllib.parse import urlencode +from urllib.request import Request, urlopen + + +def _request_json( + *, + method: str, + url: str, + bearer_token: str, + payload: dict[str, Any] | None = None, +) -> dict[str, Any]: + body = None if payload is None else json.dumps(payload).encode("utf-8") + request = Request( + url=url, + method=method, + data=body, + headers={ + "Authorization": f"Bearer {bearer_token}", + "Content-Type": "application/json", + "Accept": "application/json", + }, + ) + try: + with urlopen(request, timeout=30) as response: + raw_payload = response.read() + except HTTPError as exc: + error_body = exc.read().decode("utf-8", errors="replace") + raise RuntimeError(f"{method} {url} failed with HTTP {exc.code}: {error_body}") from exc + except URLError as exc: + raise RuntimeError(f"{method} {url} failed: {exc.reason}") from exc + + try: + parsed = json.loads(raw_payload) + except json.JSONDecodeError as exc: + raise RuntimeError(f"{method} {url} returned invalid JSON") from exc + if not isinstance(parsed, dict): + raise RuntimeError(f"{method} {url} returned invalid payload shape") + return parsed + + +def main() -> int: + parser = argparse.ArgumentParser( + prog="run_phase11_vllm_e2e.py", + description="Register and invoke a self-hosted vLLM provider through the P11-S3 runtime paths.", + ) + parser.add_argument( + "--api-base-url", + default="http://127.0.0.1:8000", + help="Alice API base URL (default: http://127.0.0.1:8000).", + ) + parser.add_argument( + "--session-token", + required=True, + help="Hosted session bearer token.", + ) + parser.add_argument( + "--thread-id", + required=True, + help="Thread ID to use for /v1/runtime/invoke.", + ) + parser.add_argument( + "--display-name", + default="vLLM Self-Hosted E2E", + help="Provider display name.", + ) + parser.add_argument( + "--provider-base-url", + default="http://127.0.0.1:8001", + help="vLLM base URL for OpenAI-compatible endpoints (default: http://127.0.0.1:8001).", + ) + parser.add_argument( + "--model", + required=True, + help="Model name to register/test/invoke.", + ) + parser.add_argument( + "--temperature", + type=float, + default=None, + help="Optional vLLM passthrough: temperature.", + ) + parser.add_argument( + "--top-p", + type=float, + default=None, + help="Optional vLLM passthrough: top_p.", + ) + parser.add_argument( + "--max-tokens", + type=int, + default=None, + help="Optional vLLM passthrough: max_tokens.", + ) + parser.add_argument( + "--stop", + action="append", + default=None, + help="Optional vLLM passthrough: stop sequence (repeat flag for multiple values).", + ) + parser.add_argument( + "--test-prompt", + default="Reply with one sentence confirming vLLM connectivity.", + help="Prompt used for /v1/providers/test.", + ) + parser.add_argument( + "--message", + default="Give a concise vLLM runtime confirmation.", + help="Message used for /v1/runtime/invoke.", + ) + args = parser.parse_args() + + invoke_passthrough: dict[str, object] = {} + if args.temperature is not None: + invoke_passthrough["temperature"] = args.temperature + if args.top_p is not None: + invoke_passthrough["top_p"] = args.top_p + if args.max_tokens is not None: + invoke_passthrough["max_tokens"] = args.max_tokens + if args.stop: + invoke_passthrough["stop"] = args.stop + + 
register_payload: dict[str, object] = { + "display_name": args.display_name, + "base_url": args.provider_base_url.strip(), + "default_model": args.model, + "metadata": {"source": "phase11_vllm_e2e"}, + } + if len(invoke_passthrough) > 0: + register_payload["adapter_options"] = {"invoke_passthrough": invoke_passthrough} + + register_response = _request_json( + method="POST", + url=f"{args.api_base_url.rstrip('/')}/v1/providers/vllm/register", + bearer_token=args.session_token, + payload=register_payload, + ) + provider_id = register_response["provider"]["id"] + + test_response = _request_json( + method="POST", + url=f"{args.api_base_url.rstrip('/')}/v1/providers/test", + bearer_token=args.session_token, + payload={ + "provider_id": provider_id, + "model": args.model, + "prompt": args.test_prompt, + }, + ) + + invoke_response = _request_json( + method="POST", + url=f"{args.api_base_url.rstrip('/')}/v1/runtime/invoke", + bearer_token=args.session_token, + payload={ + "provider_id": provider_id, + "thread_id": args.thread_id, + "message": args.message, + "model": args.model, + }, + ) + + telemetry_url = ( + f"{args.api_base_url.rstrip('/')}/v1/providers/{provider_id}/telemetry?" + + urlencode({"limit": 10}) + ) + telemetry_response = _request_json( + method="GET", + url=telemetry_url, + bearer_token=args.session_token, + ) + + result = { + "provider": register_response["provider"], + "capabilities": register_response["capabilities"], + "test_result": test_response["result"], + "runtime_assistant": invoke_response["assistant"], + "telemetry": telemetry_response, + } + print(json.dumps(result, indent=2, sort_keys=True)) + return 0 + + +if __name__ == "__main__": + raise SystemExit(main()) diff --git a/tests/integration/test_phase11_provider_runtime_api.py b/tests/integration/test_phase11_provider_runtime_api.py index 24b909b..3a334d0 100644 --- a/tests/integration/test_phase11_provider_runtime_api.py +++ b/tests/integration/test_phase11_provider_runtime_api.py @@ -449,6 +449,157 @@ def fake_urlopen(request, timeout): assert "http://127.0.0.1:8080/v1/chat/completions" in captured_urls +def test_phase11_vllm_registration_runtime_and_telemetry( + migrated_database_urls, + monkeypatch, +) -> None: + _configure_settings(migrated_database_urls, monkeypatch) + session_token, _, user_account_id = _bootstrap_workspace_session("provider-vllm-a@example.com") + session_token_other, _, _ = _bootstrap_workspace_session("provider-vllm-b@example.com") + + captured_requests: list[dict[str, object]] = [] + + def fake_urlopen(request, timeout): + body = None if request.data is None else json.loads(request.data.decode("utf-8")) + captured_requests.append( + { + "url": request.full_url, + "timeout": timeout, + "headers": dict(request.header_items()), + "body": body, + } + ) + url = request.full_url + if url.endswith("/health"): + return FakeHTTPResponse(json.dumps({"status": "ok"}).encode("utf-8")) + if url.endswith("/v1/models"): + return FakeHTTPResponse(json.dumps({"data": [{"id": "meta-llama/Meta-Llama-3.1-8B-Instruct"}]}).encode("utf-8")) + if url.endswith("/v1/chat/completions"): + return FakeHTTPResponse( + json.dumps( + { + "id": "chatcmpl-vllm-2", + "choices": [ + { + "index": 0, + "message": {"role": "assistant", "content": "vLLM runtime response"}, + "finish_reason": "stop", + } + ], + "usage": { + "prompt_tokens": 19, + "completion_tokens": 6, + "total_tokens": 25, + }, + } + ).encode("utf-8") + ) + raise AssertionError(f"unexpected local provider URL: {url}") + + 
monkeypatch.setattr("alicebot_api.local_provider_helpers.urlopen", fake_urlopen) + + create_status, create_payload = invoke_request( + "POST", + "/v1/providers/vllm/register", + payload={ + "display_name": "vLLM Self-Hosted", + "base_url": "http://127.0.0.1:8001", + "default_model": "meta-llama/Meta-Llama-3.1-8B-Instruct", + "adapter_options": { + "invoke_passthrough": { + "temperature": 0.2, + "top_p": 0.9, + "max_tokens": 256, + "stop": ["###"], + } + }, + "metadata": {"kind": "self_hosted"}, + }, + headers=auth_header(session_token), + ) + assert create_status == 201 + provider_id = create_payload["provider"]["id"] + assert create_payload["provider"]["provider_key"] == "vllm" + assert create_payload["provider"]["adapter_options"]["invoke_passthrough"]["temperature"] == 0.2 + assert create_payload["capabilities"]["snapshot"]["supports_normalized_latency_telemetry"] is True + assert create_payload["capabilities"]["snapshot"]["supports_normalized_usage_telemetry"] is True + assert create_payload["capabilities"]["snapshot"]["telemetry_flow_scope"] == [ + "provider_test", + "runtime_invoke", + ] + + thread_id = _seed_thread_for_user( + admin_db_url=migrated_database_urls["admin"], + user_id=user_account_id, + email="provider-vllm-a@example.com", + ) + + test_status, test_payload = invoke_request( + "POST", + "/v1/providers/test", + payload={ + "provider_id": provider_id, + "prompt": "Validate vllm path.", + }, + headers=auth_header(session_token), + ) + assert test_status == 200 + assert test_payload["result"]["text"] == "vLLM runtime response" + assert test_payload["result"]["usage"]["total_tokens"] == 25 + + invoke_status, invoke_payload = invoke_request( + "POST", + "/v1/runtime/invoke", + payload={ + "provider_id": provider_id, + "thread_id": thread_id, + "message": "How is vllm runtime?", + }, + headers=auth_header(session_token), + ) + assert invoke_status == 200 + assert invoke_payload["assistant"]["provider_key"] == "vllm" + assert invoke_payload["assistant"]["usage"]["total_tokens"] == 25 + + telemetry_status, telemetry_payload = invoke_request( + "GET", + f"/v1/providers/{provider_id}/telemetry", + query_params={"limit": "10"}, + headers=auth_header(session_token), + ) + assert telemetry_status == 200 + assert telemetry_payload["provider_id"] == provider_id + assert telemetry_payload["summary"]["total_count"] == 2 + assert telemetry_payload["summary"]["completed_count"] == 2 + assert telemetry_payload["summary"]["failed_count"] == 0 + assert telemetry_payload["summary"]["usage_totals"]["total_tokens"] == 50 + telemetry_flow_kinds = [item["flow_kind"] for item in telemetry_payload["items"]] + assert "provider_test" in telemetry_flow_kinds + assert "runtime_invoke" in telemetry_flow_kinds + + other_workspace_status, other_workspace_payload = invoke_request( + "GET", + f"/v1/providers/{provider_id}/telemetry", + headers=auth_header(session_token_other), + ) + assert other_workspace_status == 404 + assert "was not found" in other_workspace_payload["detail"] + + invoke_bodies = [ + record["body"] + for record in captured_requests + if record["url"] == "http://127.0.0.1:8001/v1/chat/completions" + ] + assert len(invoke_bodies) == 2 + for invoke_body in invoke_bodies: + assert isinstance(invoke_body, dict) + assert invoke_body["temperature"] == 0.2 + assert invoke_body["top_p"] == 0.9 + assert invoke_body["max_tokens"] == 256 + assert invoke_body["stop"] == ["###"] + assert "unexpected_option" not in invoke_body + + def 
test_phase11_openai_compatible_registration_still_works(migrated_database_urls, monkeypatch) -> None: _configure_settings(migrated_database_urls, monkeypatch) session_token, workspace_id, _ = _bootstrap_workspace_session("provider-openai-reg@example.com") diff --git a/tests/unit/test_20260411_0054_phase11_vllm_telemetry.py b/tests/unit/test_20260411_0054_phase11_vllm_telemetry.py new file mode 100644 index 0000000..dcfb051 --- /dev/null +++ b/tests/unit/test_20260411_0054_phase11_vllm_telemetry.py @@ -0,0 +1,32 @@ +from __future__ import annotations + +import importlib + + +MODULE_NAME = "apps.api.alembic.versions.20260411_0054_phase11_vllm_telemetry" + + +def load_migration_module(): + return importlib.import_module(MODULE_NAME) + + +def test_upgrade_executes_expected_statements_in_order(monkeypatch) -> None: + module = load_migration_module() + executed: list[str] = [] + + monkeypatch.setattr(module.op, "execute", executed.append) + + module.upgrade() + + assert executed == list(module._UPGRADE_STATEMENTS) + list(module._UPGRADE_GRANT_STATEMENTS) + + +def test_downgrade_executes_expected_statements_in_order(monkeypatch) -> None: + module = load_migration_module() + executed: list[str] = [] + + monkeypatch.setattr(module.op, "execute", executed.append) + + module.downgrade() + + assert executed == list(module._DOWNGRADE_STATEMENTS) diff --git a/tests/unit/test_provider_runtime.py b/tests/unit/test_provider_runtime.py index a0db0f3..faaeabc 100644 --- a/tests/unit/test_provider_runtime.py +++ b/tests/unit/test_provider_runtime.py @@ -9,6 +9,7 @@ OLLAMA_ADAPTER_KEY, OPENAI_COMPATIBLE_ADAPTER_KEY, OPENAI_RESPONSES_PROVIDER, + VLLM_ADAPTER_KEY, ProviderAdapterNotFoundError, RuntimeProviderConfig, build_provider_test_model_request, @@ -39,7 +40,10 @@ def make_runtime_provider_config( model_list_path: str = "/models", healthcheck_path: str = "/models", invoke_path: str = "/responses", + adapter_options: dict[str, object] | None = None, ) -> RuntimeProviderConfig: + if adapter_options is None: + adapter_options = {} return RuntimeProviderConfig( provider_id=uuid4(), workspace_id=uuid4(), @@ -55,6 +59,7 @@ def make_runtime_provider_config( model_list_path=model_list_path, healthcheck_path=healthcheck_path, invoke_path=invoke_path, + adapter_options=adapter_options, metadata={}, ) @@ -65,15 +70,18 @@ def test_provider_registry_resolves_registered_adapter() -> None: adapter = registry.resolve(OPENAI_COMPATIBLE_ADAPTER_KEY) ollama_adapter = registry.resolve(OLLAMA_ADAPTER_KEY) llamacpp_adapter = registry.resolve(LLAMACPP_ADAPTER_KEY) + vllm_adapter = registry.resolve(VLLM_ADAPTER_KEY) assert adapter.adapter_key == OPENAI_COMPATIBLE_ADAPTER_KEY assert adapter.runtime_provider == OPENAI_RESPONSES_PROVIDER assert ollama_adapter.adapter_key == OLLAMA_ADAPTER_KEY assert llamacpp_adapter.adapter_key == LLAMACPP_ADAPTER_KEY + assert vllm_adapter.adapter_key == VLLM_ADAPTER_KEY assert registry.keys() == [ LLAMACPP_ADAPTER_KEY, OLLAMA_ADAPTER_KEY, OPENAI_COMPATIBLE_ADAPTER_KEY, + VLLM_ADAPTER_KEY, ] @@ -311,3 +319,97 @@ def fake_urlopen(request, timeout): assert captured[0]["url"] == "http://127.0.0.1:8080/health" assert captured[1]["url"] == "http://127.0.0.1:8080/v1/models" assert captured[2]["url"] == "http://127.0.0.1:8080/v1/chat/completions" + + +def test_vllm_adapter_discovers_capabilities_and_applies_bounded_passthrough(monkeypatch) -> None: + captured: list[dict[str, object]] = [] + registry = make_provider_adapter_registry() + adapter = registry.resolve(VLLM_ADAPTER_KEY) + runtime_provider = 
make_runtime_provider_config( + provider_key=VLLM_ADAPTER_KEY, + base_url="http://127.0.0.1:8001", + api_key="", + auth_mode="none", + model_list_path="/v1/models", + healthcheck_path="/health", + invoke_path="/v1/chat/completions", + adapter_options={ + "invoke_passthrough": { + "temperature": 0.1, + "top_p": 0.85, + "max_tokens": 256, + "seed": 42, + "stop": ["###"], + "unexpected_option": "ignored", + } + }, + ) + + def fake_urlopen(request, timeout): + body = None if request.data is None else json.loads(request.data.decode("utf-8")) + captured.append( + { + "url": request.full_url, + "timeout": timeout, + "headers": dict(request.header_items()), + "body": body, + } + ) + if request.full_url.endswith("/health"): + return FakeHTTPResponse(json.dumps({"status": "ok"}).encode("utf-8")) + if request.full_url.endswith("/v1/models"): + return FakeHTTPResponse( + json.dumps({"data": [{"id": "meta-llama/Meta-Llama-3.1-8B-Instruct"}]}).encode("utf-8") + ) + return FakeHTTPResponse( + json.dumps( + { + "id": "chatcmpl-vllm-1", + "choices": [ + { + "index": 0, + "message": {"role": "assistant", "content": "vLLM says hi"}, + "finish_reason": "stop", + } + ], + "usage": { + "prompt_tokens": 21, + "completion_tokens": 7, + "total_tokens": 28, + }, + } + ).encode("utf-8") + ) + + monkeypatch.setattr("alicebot_api.local_provider_helpers.urlopen", fake_urlopen) + + capabilities = adapter.discover_capabilities( + config=runtime_provider, + settings=Settings(healthcheck_timeout_seconds=5), + ) + response = adapter.invoke( + config=runtime_provider, + settings=Settings(model_timeout_seconds=11), + request=build_provider_test_model_request( + runtime_provider=OPENAI_RESPONSES_PROVIDER, + model="meta-llama/Meta-Llama-3.1-8B-Instruct", + prompt_text="Reply from vLLM", + ), + ) + + assert capabilities["adapter_key"] == VLLM_ADAPTER_KEY + assert capabilities["health_status"] == "ok" + assert capabilities["supports_normalized_latency_telemetry"] is True + assert capabilities["supports_normalized_usage_telemetry"] is True + assert capabilities["telemetry_flow_scope"] == ["provider_test", "runtime_invoke"] + assert response.output_text == "vLLM says hi" + assert response.response_id == "chatcmpl-vllm-1" + + invoke_body = captured[2]["body"] + assert isinstance(invoke_body, dict) + assert invoke_body["temperature"] == 0.1 + assert invoke_body["top_p"] == 0.85 + assert invoke_body["max_tokens"] == 256 + assert invoke_body["seed"] == 42 + assert invoke_body["stop"] == ["###"] + assert "unexpected_option" not in invoke_body
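+
+    # Documenting the contract exercised above as a minimal sketch of the
+    # bounded extraction the adapter is assumed to perform (the real helper
+    # lives in vllm_provider_helpers.py; the names below are illustrative,
+    # and the allowlist reflects only the options this test covers):
+    #
+    #     _ALLOWED_PASSTHROUGH = {"temperature", "top_p", "max_tokens", "seed", "stop"}
+    #
+    #     def extract_bounded_passthrough(adapter_options: dict[str, object]) -> dict[str, object]:
+    #         raw = adapter_options.get("invoke_passthrough")
+    #         if not isinstance(raw, dict):
+    #             return {}
+    #         return {key: value for key, value in raw.items() if key in _ALLOWED_PASSTHROUGH}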