diff --git a/CLAUDE.md b/CLAUDE.md index 47be431..181c737 100644 --- a/CLAUDE.md +++ b/CLAUDE.md @@ -181,6 +181,8 @@ sluice pool rotate # operator override: advance active member sluice pool remove ``` +Pools are reachable from all channels — CLI `sluice pool`, REST `/api/pools` (`GET`/`POST`, `GET`/`DELETE /api/pools/{name}`, `POST /api/pools/{name}/rotate`), and Telegram `/pool` — all via the channel-agnostic `internal/poolops`. + Auto-failover on 429/401 is primary; `pool rotate` is an override. **Data model (migration `000006_credential_pools`):** `credential_pools` (name, strategy reserved `failover`), `credential_pool_members` (ordered, pool->credential FK), `credential_health` (`healthy|cooldown`, `cooldown_until`, `last_failure_reason`), all CHECK-constrained. Store API in `internal/store/pools.go`. `reloadAll` loads pool+health into an atomic-pointer-swapped `PoolResolver` (`internal/vault/pool.go`), rewired via `srv.StorePool`/`SetPoolResolver` on SIGHUP and the 2s data-version watcher. diff --git a/README.md b/README.md index df20df8..6752c33 100644 --- a/README.md +++ b/README.md @@ -329,6 +329,11 @@ Manage sluice from your phone. Approve connections and tool calls, add credentia | `/mcp list` | List registered MCP upstreams | | `/mcp add --command [flags]` | Register a new MCP upstream (stdio/http/websocket, see `/help`; chat message auto-deleted because `--env` may carry secrets) | | `/mcp remove ` | Remove an MCP upstream | +| `/pool create ` | Create a credential pool (ordered OAuth members, failover order) | +| `/pool list` | List credential pools | +| `/pool status ` | Active member and per-member health | +| `/pool rotate ` | Operator override: advance the active member | +| `/pool remove ` | Remove a credential pool | | `/status` | Proxy stats and pending approvals | | `/audit recent [N]` | Last N audit entries | @@ -338,6 +343,11 @@ Manage sluice from your phone. Approve connections and tool calls, add credentia REST API on port 3000 for programmatic approval integration. `GET /api/approvals` lists pending requests, `POST /api/approvals/{id}/resolve` resolves them. Use this to build custom approval UIs or integrate with existing workflows. +All `/api/*` endpoints below are protected by bearer auth. Every request must +send `Authorization: Bearer $SLUICE_API_TOKEN` (the token sluice prints at +startup). The curl examples omit the header for brevity, but it is required +for the credential, pool, and rule calls shown here. + Credential management endpoints support both static and OAuth types: ```bash @@ -350,6 +360,26 @@ curl -X POST http://localhost:3000/api/credentials \ -d '{"name":"openai_oauth","type":"oauth","token_url":"https://auth.example.com/token","access_token":"at-xxx","refresh_token":"rt-xxx","destination":"api.openai.com","env_var":"OPENAI_API_KEY"}' ``` +Credential pools are managed over the same REST surface as the CLI `sluice pool` and Telegram `/pool` commands: + +```bash +# List pools +curl http://localhost:3000/api/pools + +# Create a pool (members are ordered OAuth credential names; strategy defaults to "failover") +curl -X POST http://localhost:3000/api/pools \ + -d '{"name":"openai","members":["codex_a","codex_b"]}' + +# Pool status (active member + per-member health) +curl http://localhost:3000/api/pools/openai + +# Operator override: advance the active member +curl -X POST http://localhost:3000/api/pools/openai/rotate + +# Remove a pool +curl -X DELETE http://localhost:3000/api/pools/openai +``` + ## Data Loss Prevention Two complementary inspection layers protect against credential leakage and dangerous tool use: diff --git a/api/openapi.yaml b/api/openapi.yaml index 0f53cbb..6ea9098 100644 --- a/api/openapi.yaml +++ b/api/openapi.yaml @@ -519,6 +519,139 @@ paths: schema: $ref: "#/components/schemas/ErrorResponse" + /api/pools: + get: + operationId: getApiPools + summary: List credential pools + tags: [pools] + responses: + "200": + description: Credential pools + content: + application/json: + schema: + type: array + items: + $ref: "#/components/schemas/Pool" + post: + operationId: postApiPools + summary: Create a credential pool + tags: [pools] + requestBody: + required: true + content: + application/json: + schema: + $ref: "#/components/schemas/CreatePoolRequest" + responses: + "201": + description: Pool created + content: + application/json: + schema: + $ref: "#/components/schemas/Pool" + "400": + description: Invalid request + content: + application/json: + schema: + $ref: "#/components/schemas/ErrorResponse" + "409": + description: Pool name collides or a member is already pooled + content: + application/json: + schema: + $ref: "#/components/schemas/ErrorResponse" + "500": + description: Internal error creating the pool (transaction/DB failure) + content: + application/json: + schema: + $ref: "#/components/schemas/ErrorResponse" + + /api/pools/{name}: + get: + operationId: getApiPoolsName + summary: Pool status (active member + per-member health) + tags: [pools] + parameters: + - name: name + in: path + required: true + schema: + type: string + responses: + "200": + description: Pool status + content: + application/json: + schema: + $ref: "#/components/schemas/PoolStatus" + "404": + description: Pool not found + content: + application/json: + schema: + $ref: "#/components/schemas/ErrorResponse" + delete: + operationId: deleteApiPoolsName + summary: Remove a credential pool + tags: [pools] + parameters: + - name: name + in: path + required: true + schema: + type: string + responses: + "204": + description: Pool removed + "404": + description: Pool not found + content: + application/json: + schema: + $ref: "#/components/schemas/ErrorResponse" + "409": + description: >- + Pool still referenced by one or more bindings. The response body's + `bindings` field lists the blocking bindings (id + destination). + content: + application/json: + schema: + $ref: "#/components/schemas/PoolReferencedErrorResponse" + + /api/pools/{name}/rotate: + post: + operationId: postApiPoolsNameRotate + summary: Operator override — advance the active pool member + tags: [pools] + parameters: + - name: name + in: path + required: true + schema: + type: string + responses: + "200": + description: Rotation result + content: + application/json: + schema: + $ref: "#/components/schemas/PoolRotateResult" + "404": + description: Pool not found + content: + application/json: + schema: + $ref: "#/components/schemas/ErrorResponse" + "409": + description: Rotate raced a concurrent membership change + content: + application/json: + schema: + $ref: "#/components/schemas/ErrorResponse" + /api/audit/recent: get: operationId: getApiAuditRecent @@ -619,6 +752,36 @@ components: code: type: string + PoolReferencedErrorResponse: + description: >- + 409 body for DELETE /api/pools/{name} when the pool is still + referenced by one or more bindings. Carries the generic error/code + plus the structured list of blocking bindings (id + destination); the + CLI and Telegram surfaces render the same list (channel parity). This + is a dedicated schema so the generic ErrorResponse envelope is not + coupled to one endpoint. + type: object + required: [error, bindings] + properties: + error: + type: string + code: + type: string + bindings: + type: array + items: + $ref: "#/components/schemas/PoolReferencingBinding" + + PoolReferencingBinding: + type: object + required: [id, destination] + properties: + id: + type: integer + format: int64 + destination: + type: string + HealthResponse: type: object required: [status] @@ -1009,6 +1172,98 @@ components: timeout_sec: type: integer + Pool: + type: object + required: [name, strategy, members] + properties: + name: + type: string + strategy: + type: string + description: "Pool strategy (only 'failover' is supported)" + created_at: + type: string + format: date-time + members: + type: array + items: + $ref: "#/components/schemas/PoolMember" + + PoolMember: + type: object + required: [credential, position] + properties: + credential: + type: string + position: + type: integer + + CreatePoolRequest: + type: object + required: [name, members] + properties: + name: + type: string + strategy: + type: string + description: "Pool strategy; defaults to 'failover' when omitted" + members: + type: array + description: "Ordered member credential names (failover order)" + items: + type: string + + PoolStatus: + type: object + required: [name, strategy, active, members] + properties: + name: + type: string + strategy: + type: string + active: + type: string + description: "Currently active member credential name" + members: + type: array + items: + $ref: "#/components/schemas/PoolMemberStatus" + + PoolMemberStatus: + type: object + required: [credential, position, active, state] + properties: + credential: + type: string + position: + type: integer + active: + type: boolean + state: + type: string + description: "healthy, cooldown, or healthy (cooldown expired)" + cooldown_until: + type: string + format: date-time + last_failure_reason: + type: string + + PoolRotateResult: + type: object + required: [pool, from, to] + properties: + pool: + type: string + from: + type: string + description: "Member that was active and is now parked" + to: + type: string + description: "New active member after the rotation" + parked_until: + type: string + format: date-time + AuditEntry: type: object required: [timestamp, verdict] diff --git a/cmd/sluice/main.go b/cmd/sluice/main.go index 88c6548..ef6094a 100644 --- a/cmd/sluice/main.go +++ b/cmd/sluice/main.go @@ -516,12 +516,7 @@ func main() { // Exhausted: no distinct member to fail over to (every // member cooling) — report it as pool exhaustion, NOT a // self-referential "X -> X" transition. - msg := fmt.Sprintf("pool %s failed over %s -> %s (%s)", - ev.Pool, ev.From, ev.To, ev.Reason) - if ev.Exhausted { - msg = fmt.Sprintf("pool %s exhausted: all members cooling down (%s); no healthy account to fail over to", - ev.Pool, ev.Reason) - } + msg := proxy.FormatFailoverNotice(ev) ctx, cancel := context.WithTimeout(context.Background(), 10*time.Second) defer cancel() for _, ch := range failoverBroker.Channels() { diff --git a/cmd/sluice/pool.go b/cmd/sluice/pool.go index ba277a4..924856b 100644 --- a/cmd/sluice/pool.go +++ b/cmd/sluice/pool.go @@ -7,8 +7,8 @@ import ( "strings" "time" + "github.com/nemirovsky/sluice/internal/poolops" "github.com/nemirovsky/sluice/internal/store" - "github.com/nemirovsky/sluice/internal/vault" ) func handlePoolCommand(args []string) error { @@ -49,13 +49,9 @@ func handlePoolCreate(args []string) error { if *membersStr == "" { return fmt.Errorf("--members is required (comma-separated oauth credential names)") } - var members []string - for _, m := range strings.Split(*membersStr, ",") { - m = strings.TrimSpace(m) - if m == "" { - return fmt.Errorf("empty credential name in --members list") - } - members = append(members, m) + members, err := poolops.ParseMembers(*membersStr) + if err != nil { + return err } db, err := store.New(*dbPath) @@ -64,7 +60,7 @@ func handlePoolCreate(args []string) error { } defer func() { _ = db.Close() }() - if err := db.CreatePoolWithMembers(name, *strategy, members); err != nil { + if err := poolops.Create(db, name, *strategy, members); err != nil { return err } @@ -89,7 +85,7 @@ func handlePoolList(args []string) error { } defer func() { _ = db.Close() }() - pools, err := db.ListPools() + pools, err := poolops.List(db) if err != nil { return err } @@ -124,50 +120,37 @@ func handlePoolStatus(args []string) error { } defer func() { _ = db.Close() }() - p, err := db.GetPool(name) - if err != nil { - return err - } - if p == nil { - return fmt.Errorf("pool %q not found", name) - } - healthRows, err := db.ListCredentialHealth() + res, err := poolops.Status(db, name) if err != nil { + var nf *poolops.PoolNotFoundError + if errors.As(err, &nf) { + return fmt.Errorf("pool %q not found", name) + } return err } - // Compute the active member using the exact same selection logic the - // proxy uses at injection time so `pool status` never disagrees with - // what would actually be injected. - resolver := vault.NewPoolResolver([]store.Pool{*p}, healthRows) - active, _ := resolver.ResolveActive(name) - - healthByCred := make(map[string]store.CredentialHealth, len(healthRows)) - for _, h := range healthRows { - healthByCred[h.Credential] = h - } - - fmt.Printf("pool %q (strategy: %s)\n", p.Name, p.Strategy) - now := time.Now() - for _, m := range p.Members { + fmt.Printf("pool %q (strategy: %s)\n", res.Name, res.Strategy) + for _, m := range res.Members { marker := " " - if m.Credential == active { + if m.Active { marker = "* " } status := "healthy" - if h, ok := healthByCred[m.Credential]; ok && h.Status == "cooldown" && !h.CooldownUntil.IsZero() { - if h.CooldownUntil.After(now) { - status = fmt.Sprintf("cooldown until %s", h.CooldownUntil.Format(time.RFC3339)) - } else { - status = "healthy (cooldown expired)" + switch m.State { + case "cooldown": + status = fmt.Sprintf("cooldown until %s", m.CooldownUntil.Format(time.RFC3339)) + if m.LastFailureReason != "" { + status += " — " + m.LastFailureReason } - if h.LastFailureReason != "" { - status += " — " + h.LastFailureReason + case "healthy (cooldown expired)": + status = "healthy (cooldown expired)" + if m.LastFailureReason != "" { + status += " — " + m.LastFailureReason } } fmt.Printf("%s[%d] %s %s\n", marker, m.Position, m.Credential, status) } - fmt.Printf("active: %s\n", active) + fmt.Printf("active: %s\n", res.Active) return nil } @@ -188,70 +171,22 @@ func handlePoolRotate(args []string) error { } defer func() { _ = db.Close() }() - p, err := db.GetPool(name) - if err != nil { - return err - } - if p == nil { - return fmt.Errorf("pool %q not found", name) - } - healthRows, err := db.ListCredentialHealth() + res, err := poolops.Rotate(db, name) if err != nil { - return err - } - resolver := vault.NewPoolResolver([]store.Pool{*p}, healthRows) - active, ok := resolver.ResolveActive(name) - if !ok || active == "" { - return fmt.Errorf("pool %q has no resolvable member to rotate away from", name) - } - - // Manual override: park the current active member so the next member in - // position order becomes active. The cooldown lapses on its own (lazy - // recovery, same as auto-failover), so a rotated-away member rejoins the - // rotation once its cooldown expires. - // - // Finding 1 (round-15) + Cluster A #3 (round-18): use the pool+epoch - // scoped guarded write, NOT the unconditional SetCredentialHealth and - // NOT the name-only guard. `active` was resolved from the snapshot `p` - // taken above; another process could remove this pool (or this member - // from it) AND re-add the same name into a DIFFERENT pool between that - // snapshot and this write. The name-only guard only checked that - // `active` was a member of SOME pool — the re-added successor satisfies - // that, so the rotate would park the OTHER pool's member while - // reporting a successful rotate of THIS pool. Capture `active`'s - // pool+epoch identity from the snapshot and gate the write on exactly - // (active, this pool, that epoch): a raced removal/re-add makes the - // write a no-op (wrote=false) because the snapshot's epoch no longer - // matches the live membership row, so we surface a failed/stale rotate - // instead of silently parking an unrelated pool's member. - var rotateEpoch int64 = -1 - for _, m := range p.Members { - if m.Credential == active { - rotateEpoch = m.Epoch - break + var nf *poolops.PoolNotFoundError + if errors.As(err, &nf) { + return fmt.Errorf("pool %q not found", name) + } + // poolops keeps RotateRaceError channel-neutral; the CLI adds its + // own remediation hint here so CLI UX is unchanged. + var race *poolops.RotateRaceError + if errors.As(err, &race) { + return fmt.Errorf("%w; re-check the pool with \"sluice pool status %s\" and retry", err, name) } - } - if rotateEpoch < 0 { - return fmt.Errorf("pool %q rotate: resolved active member %q is not in the pool snapshot (membership changed under the rotate); re-check with \"sluice pool list %s\"", name, active, name) - } - until := time.Now().Add(vault.AuthFailCooldown) - wrote, err := db.SetCredentialHealthIfPoolMemberEpoch(active, name, rotateEpoch, "cooldown", until, vault.ManualRotateReason) - if err != nil { - return err - } - if !wrote { - return fmt.Errorf("pool %q rotate raced a concurrent pool/member removal or re-add: %q is no longer a live member of pool %q at the snapshotted epoch %d, so nothing was persisted; re-check the pool with \"sluice pool list %s\"", name, active, name, rotateEpoch, name) - } - - // Recompute the new active member for operator feedback. - healthRows, err = db.ListCredentialHealth() - if err != nil { return err } - resolver = vault.NewPoolResolver([]store.Pool{*p}, healthRows) - next, _ := resolver.ResolveActive(name) fmt.Printf("pool %q rotated: %s -> %s (parked %s until %s)\n", - name, active, next, active, until.Format(time.RFC3339)) + name, res.From, res.To, res.From, res.ParkedUntil.Format(time.RFC3339)) return nil } @@ -272,24 +207,7 @@ func handlePoolRemove(args []string) error { } defer func() { _ = db.Close() }() - // Reject the removal while any binding still references this pool by - // name. A pool shares the credential namespace, so a binding's - // "credential" column may hold the pool name (e.g. created via - // "sluice binding add --destination ..."). Deleting the pool - // out from under such bindings would leave them pointing at a - // non-existent credential (injection silently fails for those - // destinations) and, worse, a later credential created with the same - // name would silently inherit the stale bindings. This mirrors the - // fail-closed pool-membership guard in "sluice cred remove": refuse - // with an actionable error instead of cascading or orphaning. - // - // Finding 3: the reference check and the pool delete MUST be atomic. - // RemovePoolIfUnreferenced folds both into ONE store transaction so a - // concurrent "sluice binding add " cannot commit in a window - // between a separate pre-check and the delete and leave a binding - // pointing at a now-deleted pool. The store method is the authoritative - // atomic gate; this CLI layer only formats its typed error. - removed, err := db.RemovePoolIfUnreferenced(name) + err = poolops.Remove(db, name) if err != nil { var refErr *store.PoolReferencedError if errors.As(err, &refErr) { @@ -300,11 +218,12 @@ func handlePoolRemove(args []string) error { return fmt.Errorf("pool %q is still referenced by %d binding(s): %s; rebind or remove these bindings first (sluice binding remove , which also clears the auto-created allow rule), then retry pool remove", name, len(refErr.Bindings), strings.Join(details, ", ")) } + var nf *poolops.PoolNotFoundError + if errors.As(err, &nf) { + return fmt.Errorf("pool %q not found", name) + } return err } - if !removed { - return fmt.Errorf("pool %q not found", name) - } fmt.Printf("pool %q removed\n", name) return nil } diff --git a/docs/plans/20260518-channel-feature-parity.md b/docs/plans/20260518-channel-feature-parity.md index 9427a63..fabc1ac 100644 --- a/docs/plans/20260518-channel-feature-parity.md +++ b/docs/plans/20260518-channel-feature-parity.md @@ -7,6 +7,8 @@ Two follow-ups surfaced while operating the credential-pool feature live: 1. **Pools are CLI-only.** `sluice pool create|list|status|rotate|remove` exists, but there are **no HTTP API endpoints** and **no Telegram `/pool` commands**. Every other store-backed management surface (policy, credentials, bindings, MCP upstreams) is reachable from CLI **and** REST **and** Telegram. Pools must reach parity. 2. **Token-host phantom expansion is grant-blind.** sluice rewrites *every* request to a pool's shared OAuth token host (e.g. `auth.openai.com`), including non-refresh grants. A fresh in-container `codex login --device-auth` (a `device_code` grant) is therefore corrupted into `400 token_exchange_user_error`. The expansion must act only on `grant_type=refresh_token`. +A third, smaller follow-up: the **pool failover Telegram notice is operator-unfriendly** — it emits the raw `pool failed over -> ()` with a bare HTTP/grant code, unlike sluice's other plain-text Telegram messages. Reword it into a concise, human-readable notice (raw code kept in parentheses for operators) without touching the `cred_failover` audit `Reason` format. + This plan also codifies a general **channel feature-parity principle** (CLI / HTTP API / Telegram / future channels expose the same management features unless there is a documented single-channel rationale) in `CLAUDE.md` and `CONTRIBUTING.md` so future store-backed features do not regress into a single channel again. ## Context @@ -52,11 +54,11 @@ This plan also codifies a general **channel feature-parity principle** (CLI / HT - Add: `internal/poolops/poolops_test.go` - Modify: `cmd/sluice/pool.go` (call the shared package instead of inline logic) -- [ ] Define `Create(store, name, strategy, members)`, `List(store)`, `Status(store, name)`, `Rotate(store, name)`, `Remove(store, name)` with typed results and sentinel errors (reuse `store.ErrCredentialInUseByPool`, `store.PoolReferencedError`) -- [ ] Move the epoch-guarded rotate write (`SetCredentialHealthIfPoolMemberEpoch` + `vault.ManualRotateReason`) and the `vault.NewPoolResolver(...).ResolveActive` status derivation into `Status`/`Rotate` -- [ ] Rewrite `cmd/sluice/pool.go` handlers to thin wrappers over `internal/poolops` (no behavior change; existing CLI tests must still pass) -- [ ] Tests: every op, success + each sentinel error; rotate epoch-race no-op -- [ ] Run tests +- [x] Define `Create(store, name, strategy, members)`, `List(store)`, `Status(store, name)`, `Rotate(store, name)`, `Remove(store, name)` with typed results and sentinel errors (reuse `store.ErrCredentialInUseByPool`, `store.PoolReferencedError`) +- [x] Move the epoch-guarded rotate write (`SetCredentialHealthIfPoolMemberEpoch` + `vault.ManualRotateReason`) and the `vault.NewPoolResolver(...).ResolveActive` status derivation into `Status`/`Rotate` +- [x] Rewrite `cmd/sluice/pool.go` handlers to thin wrappers over `internal/poolops` (no behavior change; existing CLI tests must still pass) +- [x] Tests: every op, success + each sentinel error; rotate epoch-race no-op +- [x] Run tests ### Task 2: REST endpoints for pools (spec-first) @@ -66,11 +68,11 @@ This plan also codifies a general **channel feature-parity principle** (CLI / HT - Modify: `internal/api/server.go` - Modify: `internal/api/server_test.go` -- [ ] Add to `api/openapi.yaml`: `/api/pools` (GET, POST), `/api/pools/{name}` (GET, DELETE), `/api/pools/{name}/rotate` (POST), with schemas mirroring the credentials/bindings style -- [ ] `make generate`; implement the new `ServerInterface` methods in `server.go` calling `internal/poolops` -- [ ] Map errors: validation → 400, unknown pool → 404, `ErrCredentialInUseByPool` / `PoolReferencedError` → 409, else 500 (mirror the existing cred-removal mapping at `server.go:~1287`) -- [ ] Tests: list/create/status/rotate/remove happy paths + 400/404/409 -- [ ] `make lint-api`; run `go test ./internal/api/` +- [x] Add to `api/openapi.yaml`: `/api/pools` (GET, POST), `/api/pools/{name}` (GET, DELETE), `/api/pools/{name}/rotate` (POST), with schemas mirroring the credentials/bindings style +- [x] `make generate`; implement the new `ServerInterface` methods in `server.go` calling `internal/poolops` +- [x] Map errors: validation → 400, unknown pool → 404, `ErrCredentialInUseByPool` / `PoolReferencedError` → 409, else 500 (mirror the existing cred-removal mapping at `server.go:~1287`) +- [x] Tests: list/create/status/rotate/remove happy paths + 400/404/409 +- [x] `make lint-api`; run `go test ./internal/api/` ### Task 3: Telegram `/pool` commands @@ -79,11 +81,11 @@ This plan also codifies a general **channel feature-parity principle** (CLI / HT - Modify: `internal/telegram/commands_test.go` - Modify: command help/menu registration (wherever `/policy`,`/cred`,`/mcp` are listed) -- [ ] Add `case "pool":` to the dispatch switch; subcommands `create|list|status|rotate|remove` calling `internal/poolops` -- [ ] Render `status` like the CLI (`* [i] member healthy|cooldown … reason`); render errors as plain text -- [ ] No message auto-delete (pool args carry no secrets — unlike `/mcp add`); add `/pool` to the grouped help/command menu -- [ ] Tests: each subcommand success + error -- [ ] Run tests +- [x] Add `case "pool":` to the dispatch switch; subcommands `create|list|status|rotate|remove` calling `internal/poolops` +- [x] Render `status` like the CLI (`* [i] member healthy|cooldown … reason`); render errors as plain text +- [x] No message auto-delete (pool args carry no secrets — unlike `/mcp add`); add `/pool` to the grouped help/command menu +- [x] Tests: each subcommand success + error +- [x] Run tests ### Task 4: Scope pool token-host phantom expansion to refresh grants @@ -91,18 +93,31 @@ This plan also codifies a general **channel feature-parity principle** (CLI / HT - Modify: `internal/proxy/addon.go` - Modify: `internal/proxy/addon_test.go` (or `pool_phantom_test.go`) -- [ ] At the pool token-host expansion site, parse the request body form and only expand when `grant_type == "refresh_token"`; pass `device_code`/`authorization_code`/absent-grant requests through unmodified (reuse the form-parse already used by `classifyFailover`) -- [ ] Tests (fail-before/pass-after): `refresh_token` grant still expanded (regression guard); `device_code` grant body + headers reach upstream byte-unchanged so a fresh in-container `codex login --device-auth` is not corrupted -- [ ] Run tests; `go vet ./...` and `go vet -tags=e2e ./e2e/` +- [x] At the pool token-host expansion site, parse the request body form and only expand when `grant_type == "refresh_token"`; pass `device_code`/`authorization_code`/absent-grant requests through unmodified (reuse the form-parse already used by `classifyFailover`) +- [x] Tests (fail-before/pass-after): `refresh_token` grant still expanded (regression guard); `device_code` grant body + headers reach upstream byte-unchanged so a fresh in-container `codex login --device-auth` is not corrupted +- [x] Run tests; `go vet ./...` and `go vet -tags=e2e ./e2e/` + +### Task 4b: Friendlier pool failover Telegram notification text + +**Files:** +- Modify: `cmd/sluice/main.go` (the `onFailover` goroutine, lines ~519–524 — `msg := fmt.Sprintf("pool %s failed over %s -> %s (%s)", ...)` and the `ev.Exhausted` branch `"pool %s exhausted: all members cooling down (%s); ..."`) +- Modify: the unit test(s) asserting the failover notice / `FailoverEvent` wording (search `internal/proxy/pool_failover_test.go`, `pool_failover_apihost_test.go`, `pool_splithost_test.go`; if the human-facing string is only built in `cmd/sluice/main.go` with no direct test, add a focused test for the message builder — extract a small pure helper if needed so it is testable, without changing the `onFailover` callback shape) + +Context (verified): the notice is sent via `channel.Notify` (plain text, no parse mode — `internal/telegram/approval.go:215` `TelegramChannel.Notify` uses `tgbotapi.NewMessage` with no `ParseMode`, so markdown/HTML renders literally). sluice's other Telegram messages are plain sentence-style, no emoji, no markdown (e.g. `internal/telegram/commands.go`: `"Added allow rule"`, `"Removed rule ID: %d"`, `"No rule found for: %s"`; `internal/telegram/approval.go:198`: `" — applied to %d requests"`). Reason-code mapping for human words: `429`/`403`/quota tags → rate limit / quota exhausted; `401`/`invalid_grant`/`invalid_token` → auth failure. + +- [x] Replace the bare `pool failed over -> ()` Telegram notice (and the `ev.Exhausted` variant) with a friendlier, concise message consistent with sluice's other plain-text Telegram notifications (sentence-style, no emoji/markdown, matching the `commands.go`/`approval.go` examples above). Keep it compact (one short line, not verbose). Translate the raw reason/HTTP code into human words (rate limit / quota exhausted for 429/403; auth failure for 401/invalid_grant/invalid_token) while still including the technical code in parentheses for operators. Show the pool name and the from->to members clearly (and, for the exhausted case, that no healthy member remains). +- [x] Keep it plain-text safe (this notice path is best-effort plain text — `TelegramChannel.Notify` sets no parse mode per CLAUDE.md and the code comment at `cmd/sluice/main.go:514`). Do not introduce Telegram markdown/HTML the notice path does not render. +- [x] Update/extend the existing unit test(s) that assert the failover notice / `FailoverEvent` wording (fail-before/pass-after on the new wording). Do NOT change the `cred_failover` audit `Reason` format (`:->:`) nor the `pool_exhausted` `Reason` (`:exhausted:`) asserted in `internal/proxy/pool_failover_test.go` — only the human-facing Telegram text. +- [x] Run tests (`go test ./internal/proxy/ ./internal/telegram/ -timeout 60s`, plus `go test ./cmd/sluice/ -timeout 60s` if a builder test lands there) and `gofumpt -w` changed files ### Task 5: Final validation **Files:** none (validation only) -- [ ] `gofumpt -l` clean; `golangci-lint run ./...` 0 issues -- [ ] Full `go test ./...` + `-race` on `internal/proxy`,`internal/api`,`internal/telegram`,`internal/poolops`,`internal/store` -- [ ] `go build ./...`; `go vet ./...`; `go vet -tags=e2e ./e2e/` -- [ ] Independently verify the committed HEAD (no conflict markers; `git diff --stat HEAD` empty) before pushing +- [x] `gofumpt -l` clean; `golangci-lint run ./...` 0 issues +- [x] Full `go test ./...` + `-race` on `internal/proxy`,`internal/api`,`internal/telegram`,`internal/poolops`,`internal/store` +- [x] `go build ./...`; `go vet ./...`; `go vet -tags=e2e ./e2e/` +- [x] Independently verify the committed HEAD (no conflict markers; `git diff --stat HEAD` empty) before pushing ## Out of Scope diff --git a/internal/api/api.gen.go b/internal/api/api.gen.go index 62b8309..5477fa9 100644 --- a/internal/api/api.gen.go +++ b/internal/api/api.gen.go @@ -430,6 +430,16 @@ type CreateMCPUpstreamRequest struct { Transport *string `json:"transport,omitempty"` } +// CreatePoolRequest defines model for CreatePoolRequest. +type CreatePoolRequest struct { + // Members Ordered member credential names (failover order) + Members []string `json:"members"` + Name string `json:"name"` + + // Strategy Pool strategy; defaults to 'failover' when omitted + Strategy *string `json:"strategy,omitempty"` +} + // CreateRuleRequest defines model for CreateRuleRequest. type CreateRuleRequest struct { Destination *string `json:"destination,omitempty"` @@ -517,6 +527,67 @@ type MCPUpstream struct { Transport *string `json:"transport,omitempty"` } +// Pool defines model for Pool. +type Pool struct { + CreatedAt *time.Time `json:"created_at,omitempty"` + Members []PoolMember `json:"members"` + Name string `json:"name"` + + // Strategy Pool strategy (only 'failover' is supported) + Strategy string `json:"strategy"` +} + +// PoolMember defines model for PoolMember. +type PoolMember struct { + Credential string `json:"credential"` + Position int `json:"position"` +} + +// PoolMemberStatus defines model for PoolMemberStatus. +type PoolMemberStatus struct { + Active bool `json:"active"` + CooldownUntil *time.Time `json:"cooldown_until,omitempty"` + Credential string `json:"credential"` + LastFailureReason *string `json:"last_failure_reason,omitempty"` + Position int `json:"position"` + + // State healthy, cooldown, or healthy (cooldown expired) + State string `json:"state"` +} + +// PoolReferencedErrorResponse 409 body for DELETE /api/pools/{name} when the pool is still referenced by one or more bindings. Carries the generic error/code plus the structured list of blocking bindings (id + destination); the CLI and Telegram surfaces render the same list (channel parity). This is a dedicated schema so the generic ErrorResponse envelope is not coupled to one endpoint. +type PoolReferencedErrorResponse struct { + Bindings []PoolReferencingBinding `json:"bindings"` + Code *string `json:"code,omitempty"` + Error string `json:"error"` +} + +// PoolReferencingBinding defines model for PoolReferencingBinding. +type PoolReferencingBinding struct { + Destination string `json:"destination"` + Id int64 `json:"id"` +} + +// PoolRotateResult defines model for PoolRotateResult. +type PoolRotateResult struct { + // From Member that was active and is now parked + From string `json:"from"` + ParkedUntil *time.Time `json:"parked_until,omitempty"` + Pool string `json:"pool"` + + // To New active member after the rotation + To string `json:"to"` +} + +// PoolStatus defines model for PoolStatus. +type PoolStatus struct { + // Active Currently active member credential name + Active string `json:"active"` + Members []PoolMemberStatus `json:"members"` + Name string `json:"name"` + Strategy string `json:"strategy"` +} + // ResolveRequest defines model for ResolveRequest. type ResolveRequest struct { Verdict ResolveRequestVerdict `json:"verdict"` @@ -611,6 +682,9 @@ type PatchApiCredentialsNameJSONRequestBody = CredentialUpdate // PostApiMcpUpstreamsJSONRequestBody defines body for PostApiMcpUpstreams for application/json ContentType. type PostApiMcpUpstreamsJSONRequestBody = CreateMCPUpstreamRequest +// PostApiPoolsJSONRequestBody defines body for PostApiPools for application/json ContentType. +type PostApiPoolsJSONRequestBody = CreatePoolRequest + // PostApiRulesJSONRequestBody defines body for PostApiRules for application/json ContentType. type PostApiRulesJSONRequestBody = CreateRuleRequest @@ -676,6 +750,21 @@ type ServerInterface interface { // Remove an MCP upstream // (DELETE /api/mcp/upstreams/{name}) DeleteApiMcpUpstreamsName(w http.ResponseWriter, r *http.Request, name string) + // List credential pools + // (GET /api/pools) + GetApiPools(w http.ResponseWriter, r *http.Request) + // Create a credential pool + // (POST /api/pools) + PostApiPools(w http.ResponseWriter, r *http.Request) + // Remove a credential pool + // (DELETE /api/pools/{name}) + DeleteApiPoolsName(w http.ResponseWriter, r *http.Request, name string) + // Pool status (active member + per-member health) + // (GET /api/pools/{name}) + GetApiPoolsName(w http.ResponseWriter, r *http.Request, name string) + // Operator override — advance the active pool member + // (POST /api/pools/{name}/rotate) + PostApiPoolsNameRotate(w http.ResponseWriter, r *http.Request, name string) // List policy rules // (GET /api/rules) GetApiRules(w http.ResponseWriter, r *http.Request, params GetApiRulesParams) @@ -817,6 +906,36 @@ func (_ Unimplemented) DeleteApiMcpUpstreamsName(w http.ResponseWriter, r *http. w.WriteHeader(http.StatusNotImplemented) } +// List credential pools +// (GET /api/pools) +func (_ Unimplemented) GetApiPools(w http.ResponseWriter, r *http.Request) { + w.WriteHeader(http.StatusNotImplemented) +} + +// Create a credential pool +// (POST /api/pools) +func (_ Unimplemented) PostApiPools(w http.ResponseWriter, r *http.Request) { + w.WriteHeader(http.StatusNotImplemented) +} + +// Remove a credential pool +// (DELETE /api/pools/{name}) +func (_ Unimplemented) DeleteApiPoolsName(w http.ResponseWriter, r *http.Request, name string) { + w.WriteHeader(http.StatusNotImplemented) +} + +// Pool status (active member + per-member health) +// (GET /api/pools/{name}) +func (_ Unimplemented) GetApiPoolsName(w http.ResponseWriter, r *http.Request, name string) { + w.WriteHeader(http.StatusNotImplemented) +} + +// Operator override — advance the active pool member +// (POST /api/pools/{name}/rotate) +func (_ Unimplemented) PostApiPoolsNameRotate(w http.ResponseWriter, r *http.Request, name string) { + w.WriteHeader(http.StatusNotImplemented) +} + // List policy rules // (GET /api/rules) func (_ Unimplemented) GetApiRules(w http.ResponseWriter, r *http.Request, params GetApiRulesParams) { @@ -1338,6 +1457,139 @@ func (siw *ServerInterfaceWrapper) DeleteApiMcpUpstreamsName(w http.ResponseWrit handler.ServeHTTP(w, r) } +// GetApiPools operation middleware +func (siw *ServerInterfaceWrapper) GetApiPools(w http.ResponseWriter, r *http.Request) { + + ctx := r.Context() + + ctx = context.WithValue(ctx, BearerAuthScopes, []string{}) + + r = r.WithContext(ctx) + + handler := http.Handler(http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) { + siw.Handler.GetApiPools(w, r) + })) + + for _, middleware := range siw.HandlerMiddlewares { + handler = middleware(handler) + } + + handler.ServeHTTP(w, r) +} + +// PostApiPools operation middleware +func (siw *ServerInterfaceWrapper) PostApiPools(w http.ResponseWriter, r *http.Request) { + + ctx := r.Context() + + ctx = context.WithValue(ctx, BearerAuthScopes, []string{}) + + r = r.WithContext(ctx) + + handler := http.Handler(http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) { + siw.Handler.PostApiPools(w, r) + })) + + for _, middleware := range siw.HandlerMiddlewares { + handler = middleware(handler) + } + + handler.ServeHTTP(w, r) +} + +// DeleteApiPoolsName operation middleware +func (siw *ServerInterfaceWrapper) DeleteApiPoolsName(w http.ResponseWriter, r *http.Request) { + + var err error + + // ------------- Path parameter "name" ------------- + var name string + + err = runtime.BindStyledParameterWithOptions("simple", "name", chi.URLParam(r, "name"), &name, runtime.BindStyledParameterOptions{ParamLocation: runtime.ParamLocationPath, Explode: false, Required: true, Type: "string", Format: ""}) + if err != nil { + siw.ErrorHandlerFunc(w, r, &InvalidParamFormatError{ParamName: "name", Err: err}) + return + } + + ctx := r.Context() + + ctx = context.WithValue(ctx, BearerAuthScopes, []string{}) + + r = r.WithContext(ctx) + + handler := http.Handler(http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) { + siw.Handler.DeleteApiPoolsName(w, r, name) + })) + + for _, middleware := range siw.HandlerMiddlewares { + handler = middleware(handler) + } + + handler.ServeHTTP(w, r) +} + +// GetApiPoolsName operation middleware +func (siw *ServerInterfaceWrapper) GetApiPoolsName(w http.ResponseWriter, r *http.Request) { + + var err error + + // ------------- Path parameter "name" ------------- + var name string + + err = runtime.BindStyledParameterWithOptions("simple", "name", chi.URLParam(r, "name"), &name, runtime.BindStyledParameterOptions{ParamLocation: runtime.ParamLocationPath, Explode: false, Required: true, Type: "string", Format: ""}) + if err != nil { + siw.ErrorHandlerFunc(w, r, &InvalidParamFormatError{ParamName: "name", Err: err}) + return + } + + ctx := r.Context() + + ctx = context.WithValue(ctx, BearerAuthScopes, []string{}) + + r = r.WithContext(ctx) + + handler := http.Handler(http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) { + siw.Handler.GetApiPoolsName(w, r, name) + })) + + for _, middleware := range siw.HandlerMiddlewares { + handler = middleware(handler) + } + + handler.ServeHTTP(w, r) +} + +// PostApiPoolsNameRotate operation middleware +func (siw *ServerInterfaceWrapper) PostApiPoolsNameRotate(w http.ResponseWriter, r *http.Request) { + + var err error + + // ------------- Path parameter "name" ------------- + var name string + + err = runtime.BindStyledParameterWithOptions("simple", "name", chi.URLParam(r, "name"), &name, runtime.BindStyledParameterOptions{ParamLocation: runtime.ParamLocationPath, Explode: false, Required: true, Type: "string", Format: ""}) + if err != nil { + siw.ErrorHandlerFunc(w, r, &InvalidParamFormatError{ParamName: "name", Err: err}) + return + } + + ctx := r.Context() + + ctx = context.WithValue(ctx, BearerAuthScopes, []string{}) + + r = r.WithContext(ctx) + + handler := http.Handler(http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) { + siw.Handler.PostApiPoolsNameRotate(w, r, name) + })) + + for _, middleware := range siw.HandlerMiddlewares { + handler = middleware(handler) + } + + handler.ServeHTTP(w, r) +} + // GetApiRules operation middleware func (siw *ServerInterfaceWrapper) GetApiRules(w http.ResponseWriter, r *http.Request) { @@ -1674,6 +1926,21 @@ func HandlerWithOptions(si ServerInterface, options ChiServerOptions) http.Handl r.Group(func(r chi.Router) { r.Delete(options.BaseURL+"/api/mcp/upstreams/{name}", wrapper.DeleteApiMcpUpstreamsName) }) + r.Group(func(r chi.Router) { + r.Get(options.BaseURL+"/api/pools", wrapper.GetApiPools) + }) + r.Group(func(r chi.Router) { + r.Post(options.BaseURL+"/api/pools", wrapper.PostApiPools) + }) + r.Group(func(r chi.Router) { + r.Delete(options.BaseURL+"/api/pools/{name}", wrapper.DeleteApiPoolsName) + }) + r.Group(func(r chi.Router) { + r.Get(options.BaseURL+"/api/pools/{name}", wrapper.GetApiPoolsName) + }) + r.Group(func(r chi.Router) { + r.Post(options.BaseURL+"/api/pools/{name}/rotate", wrapper.PostApiPoolsNameRotate) + }) r.Group(func(r chi.Router) { r.Get(options.BaseURL+"/api/rules", wrapper.GetApiRules) }) @@ -1702,68 +1969,83 @@ func HandlerWithOptions(si ServerInterface, options ChiServerOptions) http.Handl // Base64 encoded, gzipped, json marshaled Swagger object var swaggerSpec = []string{ - "H4sIAAAAAAAC/+Rc23LbOJN+FRR3L+wqRVImmd0a3Tme7MS1ObhsZ/ciSkkw2bIwBgEOAMrWn/K7/4UD", - "SZACD0psOVNzNbGIQx++7v4ANudbFPM04wyYktHsWyTjNaTY/PMkywTfYHoBf+Uglf4pEzwDoQiYAbEA", - "rCBZYPNsxUWq/xUlWMELRVKIRpHaZhDNIqkEYTfRwyhKQCrCsCKc6Uk7z0kS/DnjQnkPCFNwAyJ6eBhF", - "Av7KiYAkmn3Rs+tbuJkjX9avpVj8+k+IlV7/JE+IesuU2O6qieNWaWMBCTBFMF3kEsKi92ncptooygRs", - "Fmss18F5ArBsWVIbXyqcZsP9ojinwbU2IBISq8Czhu2rTatJIVO/ISzR8x8FTpUDvsv2wDaLDRb6WQIy", - "FiSzQ6O3bEMEZykwhTZYEHxNATGcAlpxgbI1ZoqnSPFbYIgwrZpF284Oa8AJiA6ol3oSpv7rdbWEjwMu", - "bHASBakMY8X9goXAW4sdrnjMaXCe5/bGNAVpRrGCfncHQs1zRofjP2eJ26Bu8nMs9FSUm+fGzhhd2zlj", - "dKIQBSwV4gzQigBNUJpLha4BSVCIMKTWMGfC5ip0zZPtCHG1BnFHJOiHSILYgEACVC6YRK+nU/QGJ8il", - "N3SUYhWvCbuZMz369P3Z8Rh9YnRrt5NIrbFCWADKBEgNDP3vlCdkRSAZo0tQysxeelZZIsURRoyzF5Bm", - "aos2mOaAMJXcKSqNbBnWdp0znCv+wgUCwpTyOyRyCuhI4ZsbSNAdUWu0dFZ5gZNkNs+n01dxZXrzNywR", - "F3O21D93jTpGkpv9GdwhT2xEJGJcIS401LV2ZytrFiuoFWqNJboGYEhAyjeQoBSzHFO6HZk1nZT+unNG", - "JJKK0MLNCcIsQRjdYcH0WCIR5UbTI8bRClN6jeNbux2Rc+ZMczxG51hKPQMzZC1rMGxgszQBo02AlmUc", - "LFFMAQtp1EgD8y3O7QI2apcjtCwCYjkyy7mEsZwzu5pR1OBjjAyw9RIBAFi3GwTFnFKSgJwz40zMDEpL", - "a3FmwapzTeUvD7W/oVPOVpTEajw3Ba6WRH/qjPfTJLLd3CS0Yu8Juw3W/xzT9kIM9xnEunC1jqCEwYLl", - "6XXNLm0kxh/dXH1UkyaUZU/XmDGgj1NegWkY+LzmmnMKmO1Vv+wv3yJgeWqYAlC4ETiNRtFaqcxTo9r5", - "Dq7XnN8uJMQC1C5c3304OUX2oQGoJDcmgbh5KMNbynEio461c0F3F/5/t8Dni/dm4XdXV+cotkYNrBaq", - "imZIZbwOL10qrHK566tOu+9vziZTGyxfVav3kG/XdX0eGBCgOueRAGlMYIVzqhYeTS3sYsqnISlsqwNH", - "3gaRprHPc6XFDWekjdkgIeGcZp/qkxIJp712bdqs++Q6BUUy6cFxNO/IV48OO0pWzAxdg7rTHAD71coW", - "da8WjdFVycA0S9O7SpTkGSWxXnDmrehXwKNq0ZG/3vGcaSaCUrxFXHM0uCeGHMYwRheQWQJ1/unySrrS", - "2Vs5/85HiZ/yhLDX4cAC67Qc1HrpgOMYpFwYe+0a+9NJrjmVGeNselTIhO7WwJDeWdNMjnO1Ph5wRVHf", - "4Gylzxsjy+DjIhqYz9U1+B2WwzX1MYGCCFOWwuMbPTHmTGHCDHXYB0F6n0eDloCVALnudpIbVHiJm6c6", - "zLW29kfBlY324I1FOxBHkZkfLvBXZmlgScYJU6bO7w2RogS7TG2eYUVieySu58siJ+o5Y/S7naHBieyc", - "sSnFNr2Xq5itg6ndnCQCWq1rxwXHjOyx40gqrpUDFottZo9PF4XGhjyZba2EvRTHIKU9hD+cnn/OpBKA", - "0/YYFjd7Zp2YpylmSVvmNYsmCbEQOq9t1rZ2JXgr9nupgRKYyeICr+GQ4pEx6wxJlRCOjhxijkdIczVz", - "qLyDa8njW1DDTF8Zo90JFzmFVuv3VbL2VICVAsGeuwIJyCiOQefJltjvv8rs4FN6gwTXLi9b3NF1y3la", - "YxN1B7Tjbb+sZYiPjl+TLQzOerJVR2rSWcJkgWK9433y0h45wm3adg34Ee5c2jI3gMwyO80Lq/Q2Rv9T", - "Za3qZznStXnOlmb+0o6yxaY5CC19HrE0NbuoQHQ7Z8taCVta+lpKsiwdtdR1wtwHig0k4zmbs/OTq9N3", - "SEKKmSKxnCGeEmWn1RctFGxKOGfFgvZ6Kc6FAKboFrk8XqucY3TFEdxrIk30GHMvZe/qauO02pqVN667", - "jpbzermeRzM0j+bR8ng8Z1drq94GmJIopsT811xjmdoMlnwb+uFzrpXg6ZxJQq3cOuMIvtX7GWYfUiPA", - "yLupngZKiO5VAeE5PRgXPTSl2qBBVcI7jNGnlCikeOU/o22JmobbLkPu0LONBxFxh5ShBEBLW6/6q2CM", - "HAerzE6gvhWCiwuQGWcycESNeRJOYaDn9Z8N7LBQhngHmKp1+86yvDKBe5xm1My+7S2dblpox7NUF68L", - "kIbJNfdzTF4uCJMgVO3Sw6tt5TB5S7KsbVRszv0LCS1v/PT5oW8nO6Zzm9wRsL6lqnEdyzUM2RCxKc8o", - "YLCAdYIyhgSq2SzkPY9vPjXR/L4b1B8kp4NvWQ/HYr+Lu5qb0QEE9gIkp5t29tpG4hacxXptTO/wVi5q", - "xO6HmFwpUFtGamlZGPzi3FimU4KcwiG7L34EbH+TM4LkuYjheY8PvX63bwc6inDxVsI3zX8KWEWz6D8m", - "VV/PxDX1TOpvHULWBpOnF9i1/si2xhR+v11QIhUw18rRfA3Q0LU5I7TVqFIoZI3/A0FW29YybV7gLShh", - "t8Pt4b31CxiDwg2OtdAM2sDKFaZdAzaYkqQSqqe0+svV547q+jVk27WWRjjEuSBqe6l1dTYCLEBoRrub", - "6E/Ozxy91aejDcHo8v3ns9O3i5Pzs8XVp/99+xEB26ANFtHINokZd5sVqwxh3j896O0JW/HdXS7eXl4h", - "vZXmpilm+KY4E1zSnMT+FdYLd8XJblABEWRQNNbqkxhcSNhMFH04u7K1ThlG6JY75UwJTtE5xQz0xjbi", - "pJXm5Xg6nupZPAOGMxLNolfj6fhVZNLY2thsgjMyqYXDjSVvGnwmkZ4l0Sz6A9RJRk48LAsXtmbOL9Op", - "Zc1MuYyEM/vqg3A2+dP1cVlYDkZvsz9vB8IPD83ryHMbdJVFXb+MtJDJ0xSLbTSL3hOpUNY6eBQprInV", - "l6iyzFe9Qt1ak28keZgIWz9NyHIZsN05lzXjnSWu5Bo/CJyCAqF3+xYRrYT2TcElZjaLVlGkRA4jz5LN", - "FPzVDgap3vBku5dTunzRYC0P9dDWQj38ICQG7e4qRcDzJ5UTzdhEw+X19PWjyVA/MHZJwLhCK54zJ8Jv", - "zyACpgJwsi2Noams5kwJ4rlqxIIzLsKtAdEXD3lC1ERA7BTsSiB66IUdGQb/XzmIbYV+SlKiIh/w5SuJ", - "X6ejKMX3JNVU5eV0qv8kzP25y+lsZDx5zqr6bAekK2sJZAyIgCmhi33TO94Qym/KYZ5P9LMdf2wMnRji", - "D0s8oieM3xq1CdjBPHdLa8zaYb4Z7AqeGdZYrlG8xoR1WKI4lvdY4U0x7BAIKdqDB8DDu8EuNQnUsTgw", - "rLJJ+dNXc0LpKFA1Mzx+CQk2YAwqJC8fTYbS+LvGdo/cO2+Xu6eHy91nzBDiMuceunYU+helw1zrSvuy", - "ek1ks/vFP2nXMXmSJPVmGa9XYBeTzVA1nMrmeQr2/UkdrL+b3z24niWPwaN67wQC9eP17gGgsKLrGD44", - "CSn29zhIo5xoufbwjzkqxOtA0tA/P5cbHj871Zv3D8xvB6Ql107+c6Sl5wX0gRPj795nA0VvO3Kt7dXL", - "r6K5z+bLnf72RhhamH1HmvSvxDoYzWnVz/v0jKboyB7AaD5yVRG9UpcAp2HBgZVpqpu0HdOUFaQncRU2", - "+psnrnon84ETV+n6AHm1j+qJ64CJo9i/rRKWIRiCWg/Syl7trhC0g57S+HaHkO62owLFxQhf8T9AFS0X", - "bkAuiibWUmk7cUD599R8Amz7beSHhna7dc2Tn6cih2Dd69gSy1XnRB+gvZEHKSte8dzrrKwTtkRHpktE", - "mo8YGWxAILjPuITkuOcMbab7JvP07j1DN430VMfo3UbyA5+kfe90egMnyT/wNO0boHag7jkst+IuEK+T", - "bxqqww7LHi4/2h6Ffsbjmhn2fPfQd0I+9b/5fJ5Dsp8rhp+Tu1JCT408mPGfJN3UG1oH5Zluv/9Tz7KD", - "cGf6OswR0rY58lVLk3BvqkjjbFI2nPUU9w9x9rkceojq7re1DSjvH07PUaVLoIDXB1SmSeOsv2rvaP9U", - "ZTvw8ciB63bN7rt2Lp79Y6/BSwP0l21WA90O5oJhuE/N9lH5zEW7tMpzlexSgL6CPdAppqm3JydemDGD", - "XlMXXW6+ob+3hy68g/umfXd5BuqOC9MgzjmNqg7FwNoHeQVuujmH9OpwSuItsp4Itef4zys/2r97U3rh", - "vKfL5f43aAdO4tbEgX6CnMLPk7kDBx3PpwGX1oJzoo/rQg2J0bf37n+Ctge4FU9pXflmsOwoePXpw3vk", - "pKorZwWwUEVYIj2yVz+SFvr1A9l+wtEJ5zSnimRYqMmKi/RFghWu61fvJ10RGvrKVqto74iQGTGqLrav", - "CcMmI3V3/ZppgY7Ng17e1T55CWE1tQ5zA54pUgxM6kgqBDNIWgmeDsPS4JfmFk0/0xtzk7Oei1aYzXvv", - "AIZlrerDrY585VrTnxD6jab6jrcDptsYObHrqp97j0y7h/6nX4WLz82M9mvzSdu/ulR/54Y8oeKND+tC", - "jMNoRSSyApsWvV+nrw4Ht1IADTlzvKi1skezL199L1iNULyG+NazvZVe274+t94A/+Wrjj/7f6SxIV6X", - "5T2PMXWWmJycn7n/eU00isz32abVfTaZvPzlv8fT8XT8cvZqOp1GD18f/h0AAP//EhGdcVJVAAA=", + "H4sIAAAAAAAC/+Rd73LbuHZ/FQzbmWvPVSTt3dx21veT47g3niYbj+O0H6KMBJNHFq5JgAuActSMZ/oQ", + "fcI+SQcHIAlSICkltpztftpYxJ+D8/d3Dg+4X6NYZLngwLWKTr5GKl5BRvGfp3kuxZqmV/BbAUqbn3Ip", + "cpCaAQ6IJVANyZzis6WQmflXlFANLzTLIBpFepNDdBIpLRm/jR5GUQJKM041E9xM2nrOkuDPuZDae8C4", + "hluQ0cPDKJLwW8EkJNHJJzO7uYWbOfJp/VyRJW7+AbE2658WCdPnXMvN9jFp3EltLCEBrhlN54WCMOlD", + "J+462ijKJaznK6pWwXkSqOpY0jBfaZrlu8tFC5EG11qDTFisA89avK83rSeFWP2K8cTMfxR1qgXwTbwH", + "vp6vqTTPElCxZLkdGp3zNZOCZ8A1WVPJ6E0KhNMMyFJIkq8o1yIjWtwBJ4ybo1lt29phBTQB2aPq1TkZ", + "1//ysl7C1wMhrXEyDZkK64r7hUpJN1Z3hBaxSIPzPLG3pmnI8pRqGBZ3wNQ8YfQI/mOeuA2aLL+k0kwl", + "BT5HPlNyY+eMyakmKVClieBAlgzShGSF0uQGiAJNGCd6BTMura8iNyLZjIjQK5D3TIF5SBTINUgiQReS", + "K/JyOiWvaEKceyNHGdXxivHbGTejz95eHI/Je55u7HaK6BXVhEoguQRlFMP8OxMJWzJIxuQDaI2zFx5X", + "FkQLQgkX/AVkud6QNU0LIDRVwh1UIW05NXydcVpo8cIZAqFpKu6JLFIgR5re3kJC7plekYXjyguaJCez", + "Yjr9Oa5Zj3/Dggg54wvzc9+oY6IE7s/hnnhkE6YIF5oIaVTdnO5iadliCbVEragiNwCcSMjEGhKSUV7Q", + "NN2McE1Hpb/ujDNFlGZpKeaEUJ4QSu6p5GYsUyQVeNIjLsiSpukNje/sdkzNuGPN8ZhcUqXMDMqJ5Szq", + "MKrNAg3GsIAsKjtYkDgFKhUeIwvMt3puF7BWuxiRRWkQixEu5xzGYsbtanhQ1I8xQcU2SwQUwIodNSgW", + "acoSUDOOwqQctbTiluBWWY2vqeXlae0v5EzwZcpiPZ5hgGs40R/a4/0wjmzbN0lzsLeM3wXjf0HT7kAM", + "X3KITeDqHJEyDnNeZDcNvnSBGH90e/VRg5qQlz1bUc4hfZzwCtyogY9rboRIgfK94pf95WsEvMgQKUAK", + "t5Jm0ShaaZ17x6h3voeblRB3cwWxBL2trm/enZ4R+xAVVLFbdCBuHsnpJhU0UVHP2oVMtxf+T7fAx6u3", + "uPCb6+tLElumBlYLRUUcUjOvR0ofNNWF2pZVL9/3Z2cbqe1MXx2r96BvW3RDEtjBQI3PYwHQmMCSFqme", + "ezC15AuGTwQpfGMMR90FNc3ovii0ITfskda4QcLCPs0+NZkSC7u97tN0cffJzxQkCd2Dw2heyte0DjtK", + "1ciM3IC+NxiA+tHKBnUvFo3JdYXADEozuyqSFHnKYrPgibeiHwGP6kVH/nrHM26QCMnohgiD0eALQ3AY", + "w5hcQW4B1OX7D9fKhc7ByPl7TiV+yAxhr+TAKtZZNaiz6EDjGJSaI7+2mf3+tDCYCsc4nh6VNJH7FXBi", + "djYwU9BCr453KFE0N7hYmnxjZBF8XFoD97G6UX6ny+GY+piKQhjXFsLTWzMxFlxTxhE67KNBZp9HUy0J", + "Swlq1S8kN6iUksCnxszNae2PUmhr7cGKRbcijiKcHw7w17g08CQXjGuM83urSBmCnafGZ1Sz2KbETX9Z", + "+kQzZ0xe2xlGOYmdM8ZQbN17tQpuHXTtmEkETrVqpAsOGdm040hpYQ4HPJab3KZPV+WJETzhtpbCQYiD", + "mtJtwu/OLj/mSkugWbcNy9s9vU4ssozypMvz4qJJwqwKXTY261q7JrxT9wehgZaUq7KA1xJI+QjZekKU", + "TpggR05jjkfEYDVMKu/hRon4DvRurK+Z0S2ESyG6PWgGJrVQAbuUCRiVsAN8dTIbK3K0pCwVa5BEmJHG", + "MnaXYCeTlZZUw+0mUBESIiXl47+RxDOdP5Wk/MmarMiY1pDsysGSBd0cvCpS6OTgEBbodqZUa5D8uWO4", + "hDylMZhI0+E9h4vBPYjUbJDQRvm3Qxx9deKzBh5rCqDbYvfz+wgdjQdEf4uWOuDve5y78bPoR8v1jvfx", + "7Ht4WbdpVyH1V7h3jh9rqNxiY4Osa4sek3+r/X79sxoZdDPjC5y/sKNsuG4PIgsfiS0Q9ZQxPN3M+KIB", + "AhY2AagoWVSCWphIixVVuYZkPOMzfnl6ffaGKMgo1yxWJ9a6cVpz0fKAbQpnvFzQFujiQkrgOt0QFwkb", + "2GNMrgWBLyYVYWYMVvZstbMxzhzb5DWtguHRYtYEPLPohMyiWbQ4Hs/49coebw1cKxKnDP+LhUBEN2DT", + "FwRwPmpdSpHNuGKppdt4HCk2Zj/MjULHCOQ0/WDZKEoIMNcG4Qk9aBcDQK/eoAX2wjuMyfuMaaJFLT88", + "baU1LbF9CInDzEYJEubSvF0hlKG2iZuWQRs5DkaZLUM9l1LIK1C54CqQ5MciCbswMPOGsys7LOQh3gBN", + "9ap7Z1UVneALzfIUZ98Nhk43LbTjRWaC1xUoxMLt/VwupOaMK5C6UTbyYls1TN2xPO8aFWPlZK6g452p", + "ycCGdrJjercpHIQdWqoe17Nci5EtEtv0jAIMC3AnSGOIoAbPQtLzEPtTQ/Vvq0F/J7zfuU59uDzgm9A/", + "1pZ3SAEuHXr7/vK/lylU8v9nCcvoJPqnSd03MnFNIxOz8zuc8xQZgINrHvRniqgiN8yF5HhX7F/t1p8G", + "eGcJ8bKvVJgLxVq5QZcv8Fby5vXT0/XSgMaarSFck4+FSBNxz+cF1yx9tP6KlCo9N/IoJMx7GlL6ODLC", + "yBKIxyuMY5sRKYlHe3G/kqPyVwPdDDuHFSDM7FHJuJKQLu5fwRIk8BiSrdDeJPzl9BdsQEB4+vr87fn1", + "OZnQnE1yIVI1+WoU8cGVmlZAzK+kei8uq13IzQa7HYQkmZDV+3Q1JmdUSuaw7S1wkCwmiAcmBlWQPC2U", + "g4myiHVhoGKK5fEluUlFfId1e7caOWIJ+XOjtv434hogENRfu9dLRBVySWNQRAJPQNZFelz7yL0jIzmV", + "TG+ODeBnypyLkgQSLPMnxHqKsuugpL3BTwJ8DamwJTguNIlFkaeQmLTfsKPM4MZbaLc80l7+qpQq47dl", + "Z1IwqD0GWqvD+KCSeeTsXX7YMeANNfR0koh5SxfaM4nLtklYz2XznnuqiDU51C+U8r1Rm7tQAWcU2Sf7", + "eq68q4ahRRj6O5Jc8YsutVPxsgg96F5wx5FlAG7TxcBhB96qM1Tpa5PGVoHucSO4I/Jb4/i+objywX0x", + "+QqUSNfddbmu8tRc8NisTdN7ulHzRsnqu2pUFUFduVZHO+vOTZVolL0UFCkcsjP3e2D076T6qUQhY3je", + "wuig3K159pQXyo6VXS2/2ZES4jZgOJpT1xauupqWxZfN3EAC4C56teFo22+2ZoS2GtUHCnHjP0Cy5aaz", + "AIHNXfOU8bvd+eF1hAWYkcItjQ3RHLqUVWia9g1Y05QlNVED0dlfrjl31Dxfi7ZtbhkNh7gwIO2DOavj", + "EVAJ8rTQq+34c3p54Qp3CjRZM0o+vP14cXY+P728mF+///fzXw1mI2tqEA7yD8WNK9YeAnuTHsz2jC8D", + "Mfjq/MM1MVsZ1JxRTm/LaueHtGCx/3rzhXv9zW9JqSIEtchgwpTF4EzCeqLo3cW1zeI11rrccmeCaylS", + "cplSDmZja3HKUvPTeDqemlkiB05zFp1EP4+n458jdGMr5Bli+oY53NqylFE+dKQXSXQS/R30ac5OPV2W", + "zmxxzl+mU1sP5Np5JJrbthgm+OQfLqWyarmz9rbvbmyp8MND+1X1pTW6mqOul1pZlSmyjMpNdBK9NWg/", + "7xw8ijQ1EPxTVHPms1mhya3JV5Y8TKSNn2iyQgV4dylUg3kXiQu5KAdJM9AIbz59jZg5hJFNWSU5sV60", + "tiItCxh5nGy74M92MCj9SiSbvYTSJ4sWanlomrYh6uE7VWKn3V2kCEj+tBYijk2Murycvnw0Gpr5ch8F", + "Jt9bioI7En55BhJoKoEmm4oZJv82mCkhotAtW3DMJbTTIIbsoUiYnkiI3QH7HIgZemVHhpX/twLkptb+", + "lGVMR77CV+0qf52Ooox+YZmBKj9Np+ZPxt2fgUzx80F8Vn0Hawd3ZTlBkIEEuJYm2Lel4w1JxW01zJOJ", + "ebYljzXCiV3kYYFH9IT224A2AT7gc7e00Vk7zGeDXcFjw4qqFYlXlPEeTvjFlB4uvCqHHUJDOgs022zx", + "3s1XJwnEsTgwrOZJXa2xNczuANVgw+OHkGBz7k6B5KdHo6Fi/jaz3SPXD+l89/RwvvuCIyCufO6hY0d5", + "/jJ04AtrZRsZV0y1O6P9TLupk6dJ0myk9vpIt3WybaqIqayfT8FW1JvK+hp/99T1InkMHDVcadyOHy+3", + "E4CSi+422cFBSLm/h0Fa4cTQtYd8MFWIVwGnYX5+LjE8vndqXuw8ML7dwS25q4Y/hlt6XoU+sGN87V0p", + "Le89EnftsW7rKS9+WH+5dfexZYZWzb7BTfolsR5Ec1bf9Xp6RFPe1tsB0fwqdA30qrMEMA0PDqxZU1fS", + "tlhTRZABx1Xy6HfuuJq33A7suCrRB8Cre5XacFwHdBzl/l2RsDLBkKoNaFp1j6/PBO2gp2S+3SF0dvuy", + "jcTlCP/gfwddNpO6AYWsXg6Wh7YTdwj/3jGfQLf9K4aHVu1u7uKTHycih9R6ULCVLtc9oUMK7Y08SFjx", + "gudeubK76IH9rwo/cMFhDZLAl1woSI4Hcmic7rPMO/dgDt1m0lOl0duXDA+cSfvS6ZUGTZI/YDbtM6CR", + "UA8ky516F7BX14S1U7Ls6eWvtm9hGPG4Boc93z0MZchn/vdAnidJ9n3F7nlyn0sYiJEHY/6TuJvmVZ2d", + "/Ey/3P+ouexOeod9HZhC2gscYtlx/WnQVWRxPqla6QeC+7s4/1gNPUR09xv2dwjv784uSX2WQABvDqhZ", + "k8X5cNTeOv1The3AxeIDx+0G37f5XD77w5bBKwYMh23eULotnQua4T4x29fKZw7aFVeeK2RXBAwF7B2F", + "gl3sAz7xEsccwhnibZf9khx7gP5MJncHKFlg/x70hvW5n8oN+lf7D+z/LKsDnUNCpH9Yp4eHx6+0VAVu", + "/HKl69FmqvKGRoUsg/56WAZpkJym9oaIlVPZWYdXT47w9pr9yu3k9Svi7vO0c32rfs26u2t5bxtJw1Ps", + "47bRfJ7ZX6NAn8tXW216qvc1fbeYuojZ/VbSNV6Yd5d4bkSy+ZOqPlWqFu6jrSm+GcfPg+5wE2m8wyvY", + "Dh0cDYanQ+rZ9FFlWDZpd4qs7OF+Vt1tSM4jjBw1r7H8meQgX7g/7PW+4x2dysR+Q2Gwf7SSt7279PuU", + "euPeVagjzl1UqrrA/l/5rsHtLXuIpMZJURILXr6pcVeKVizHd1S30NLN96gyQhKxBilZAuR///t/CE3W", + "lLtyglNYDJd2tR4Fxev8A/j4Csfs1MZZ3gLxVe9b75iEd3DfA91enoO+FxI/DWGdbHmDJ7D2QVpE8bbT", + "Lr3sImXxhlhJhNrX/ee1JO3fgyC/FN7TgXz/61MHBvmWxQHzKlL4cUB+4EWAJ9OASBvGOYEv5ccahmz0", + "/Iv7H0jsodxaZGnz8G1j2Trg9ft3b4mjqnk4S4BVVUIVMSMHz8ey8nzDimw/3tKrzlmRapZTqSdLIbMX", + "CdW0eb7WFWCWhr5QaI5o36ESHDGqGz9uGKfokfpvxeG0wI2mg77cbnzsJqSrmRWYF4SfwVJQTZqaVBKG", + "mrSUIttNl3ZuKrXa9CN1lKLPeq5UDjcffEe2m9eqP9nU469cVvCEqt+6dNrTPYO38apEpJkJeI+wHdr8", + "04/C5Yem8PQ2H/ivvqO/cUOe8OCtT2qFEAeeiqny+yS2zPPzAdF3SYBROSw4Na56RiefPvtSsCci8Qri", + "O4/3lnrD++bc5gXRT5+N/dmveVsTb9LyVsQ0dZyYnF5euA9/R6MIv8yIV0FPJpOf/vKv4+l4Ov7p5Ofp", + "dBo9fH74vwAAAP//eOrONI5qAAA=", } // GetSwagger returns the content of the embedded swagger specification file diff --git a/internal/api/pool_create_response_test.go b/internal/api/pool_create_response_test.go new file mode 100644 index 0000000..d33e819 --- /dev/null +++ b/internal/api/pool_create_response_test.go @@ -0,0 +1,102 @@ +package api + +import ( + "errors" + "fmt" + "net/http" + "testing" + + "github.com/nemirovsky/sluice/internal/poolops" + "github.com/nemirovsky/sluice/internal/store" +) + +// TestPoolCreateError_StatusMapping is the Finding 1 seam: poolCreateError +// must map the two conflict sentinels to 409, every genuine client-input +// validation sentinel to 400, and ANYTHING ELSE (a sentinel-free wrapped +// internal/DB error) to 500 — never downgrade an internal failure to a +// misleading 400. +func TestPoolCreateError_StatusMapping(t *testing.T) { + cases := []struct { + name string + err error + want int + }{ + // 409 conflicts (wrapped at origin like CreatePoolWithMembers does). + {"name conflict", fmt.Errorf("%w: pool %q already exists", store.ErrPoolNameConflict, "p"), http.StatusConflict}, + {"already pooled", fmt.Errorf("%w: credential %q is already a member", store.ErrCredentialAlreadyPooled, "c"), http.StatusConflict}, + // 400 client-input validation sentinels. + {"poolops no members", poolops.ErrNoMembers, http.StatusBadRequest}, + {"store no members", fmt.Errorf("%w: pool %q requires at least one member", store.ErrPoolNoMembers, "p"), http.StatusBadRequest}, + {"invalid strategy", fmt.Errorf("%w %q: only %q is supported", store.ErrPoolStrategyInvalid, "rr", "failover"), http.StatusBadRequest}, + {"duplicate member", fmt.Errorf("%w: pool %q lists credential %q more than once", store.ErrPoolMemberDuplicate, "p", "c"), http.StatusBadRequest}, + {"unknown member", fmt.Errorf("%w: credential %q does not exist", store.ErrPoolMemberNotFound, "c"), http.StatusBadRequest}, + {"static member", fmt.Errorf("%w: credential %q is static", store.ErrPoolMemberNotOAuth, "c"), http.StatusBadRequest}, + // 500 default: a genuine internal/DB error carries NO sentinel and + // must NOT be misclassified as a 400. + {"wrapped internal tx error", fmt.Errorf("begin tx: %w", errors.New("database is closed")), http.StatusInternalServerError}, + {"wrapped internal collision-check error", fmt.Errorf("check name collision for %q: %w", "p", errors.New("disk I/O error")), http.StatusInternalServerError}, + {"bare error", errors.New("something unexpected"), http.StatusInternalServerError}, + } + for _, tc := range cases { + t.Run(tc.name, func(t *testing.T) { + if got := poolCreateError(tc.err); got != tc.want { + t.Fatalf("poolCreateError(%v) = %d, want %d", tc.err, got, tc.want) + } + }) + } +} + +// TestMembersToStorePoolMembers is the Finding 7 seam: PostApiPools builds the +// 201 body from the request members (not a store read-back) so a read-back +// error can no longer turn a successful create into a misleading 500. The +// mapping must preserve order as 0-based positions (request order == failover +// order). +func TestMembersToStorePoolMembers(t *testing.T) { + got := membersToStorePoolMembers([]string{"credA", "credB", "credC"}) + want := []store.PoolMember{ + {Credential: "credA", Position: 0}, + {Credential: "credB", Position: 1}, + {Credential: "credC", Position: 2}, + } + if len(got) != len(want) { + t.Fatalf("len = %d, want %d", len(got), len(want)) + } + for i := range want { + if got[i].Credential != want[i].Credential || got[i].Position != want[i].Position { + t.Fatalf("member %d = %+v, want %+v", i, got[i], want[i]) + } + } + + // Empty members -> empty slice (not nil-panic). The handler never + // reaches this with empty members (poolops.Create rejects them first), + // but the helper must be total. + if e := membersToStorePoolMembers(nil); len(e) != 0 { + t.Fatalf("nil members -> %+v, want empty", e) + } +} + +// TestPoolCreateResponseShape_FromRequestData asserts the request-data path +// renders the same API Pool shape the read-back path would, including the +// failover strategy default poolops.Create applies for an empty strategy +// (Finding 7: the 201 body must be correct without a store read-back). +func TestPoolCreateResponseShape_FromRequestData(t *testing.T) { + out := storePoolToAPI(store.Pool{ + Name: "openai_pool", + Strategy: store.PoolStrategyFailover, + Members: membersToStorePoolMembers([]string{"credA", "credB"}), + }) + if out.Name != "openai_pool" { + t.Errorf("name = %q, want openai_pool", out.Name) + } + if out.Strategy != store.PoolStrategyFailover { + t.Errorf("strategy = %q, want %q", out.Strategy, store.PoolStrategyFailover) + } + if len(out.Members) != 2 || + out.Members[0].Credential != "credA" || out.Members[0].Position != 0 || + out.Members[1].Credential != "credB" || out.Members[1].Position != 1 { + t.Fatalf("members = %+v, want ordered credA(0),credB(1)", out.Members) + } + if out.CreatedAt != nil { + t.Errorf("CreatedAt should be nil without a read-back, got %v", *out.CreatedAt) + } +} diff --git a/internal/api/server.go b/internal/api/server.go index 0f9aa9d..9740b41 100644 --- a/internal/api/server.go +++ b/internal/api/server.go @@ -23,6 +23,7 @@ import ( "github.com/nemirovsky/sluice/internal/channel" "github.com/nemirovsky/sluice/internal/container" "github.com/nemirovsky/sluice/internal/policy" + "github.com/nemirovsky/sluice/internal/poolops" "github.com/nemirovsky/sluice/internal/proxy" "github.com/nemirovsky/sluice/internal/store" "github.com/nemirovsky/sluice/internal/vault" @@ -1733,6 +1734,210 @@ func (s *Server) DeleteApiMcpUpstreamsName(w http.ResponseWriter, r *http.Reques w.WriteHeader(http.StatusNoContent) } +// --- Pool handlers --- +// +// All pool operations route through the channel-agnostic internal/poolops +// package so the REST surface cannot drift from the CLI / Telegram surfaces +// (channel feature-parity principle). Error mapping mirrors the credential +// handlers: validation/bad input -> 400, unknown pool -> 404, +// *PoolReferencedError -> 409, else 500. + +// poolStatusError maps a poolops/store error to an HTTP status for the pool +// Status / Rotate / Remove handlers. Unknown pool is 404; a pool still +// referenced by a binding (Remove) is 409; everything else (store faults, +// etc.) is 500. +// +// store.ErrCredentialInUseByPool is deliberately NOT mapped here: it is +// raised only by the credential-removal path (RemoveCredentialStoreState in +// store.go, when deleting a credential that is still a live pool member) and +// is handled by its own errors.Is check in the credential-delete handler. No +// poolops.Status/Rotate/Remove store path can return it, so checking it here +// would be dead, misleading code. +func poolStatusError(err error) int { + var notFound *poolops.PoolNotFoundError + if errors.As(err, ¬Found) { + return http.StatusNotFound + } + var referenced *store.PoolReferencedError + if errors.As(err, &referenced) { + return http.StatusConflict + } + return http.StatusInternalServerError +} + +// GetApiPools lists all credential pools. +func (s *Server) GetApiPools(w http.ResponseWriter, _ *http.Request) { //nolint:revive // generated interface name + pools, err := poolops.List(s.store) + if err != nil { + writeError(w, http.StatusInternalServerError, "failed to list pools: "+err.Error(), "") + return + } + out := make([]Pool, len(pools)) + for i, p := range pools { + out[i] = storePoolToAPI(p) + } + w.Header().Set("Content-Type", "application/json") + _ = json.NewEncoder(w).Encode(out) +} + +// poolCreateError maps a poolops.Create / store error to an HTTP status. +// Per the OpenAPI contract for POST /api/pools: a name collision (pool name +// already taken or shadowing a credential) or an already-pooled member is a +// 409 Conflict; genuine input validation (no members, invalid strategy, +// static/non-oauth member, unknown member, duplicate in the submitted list) +// is 400; ANYTHING ELSE (tx/DB/INSERT failure inside +// store.CreatePoolWithMembers — wrapped fmt.Errorf strings, not sentinels) +// is an internal error and must surface as 500, never be downgraded to a +// misleading 400. Defaults to 500 like poolStatusError so a wrapped internal +// error fails closed correctly (Finding 1). +func poolCreateError(err error) int { + switch { + case errors.Is(err, store.ErrPoolNameConflict), + errors.Is(err, store.ErrCredentialAlreadyPooled): + return http.StatusConflict + case errors.Is(err, poolops.ErrNoMembers), + errors.Is(err, store.ErrPoolNoMembers), + errors.Is(err, store.ErrPoolStrategyInvalid), + errors.Is(err, store.ErrPoolMemberDuplicate), + errors.Is(err, store.ErrPoolMemberNotFound), + errors.Is(err, store.ErrPoolMemberNotOAuth): + return http.StatusBadRequest + default: + return http.StatusInternalServerError + } +} + +// PostApiPools creates a credential pool. Members are ordered (failover +// order). A namespace/pool-name collision or an already-pooled member is a +// 409 Conflict (the OpenAPI contract); genuine input validation (no members, +// static member, unknown member) is 400. The store does the static/oauth + +// duplicate gating in one transaction. +func (s *Server) PostApiPools(w http.ResponseWriter, r *http.Request) { //nolint:revive // generated interface name + var req CreatePoolRequest + if err := json.NewDecoder(limitedBody(w, r)).Decode(&req); err != nil { + writeError(w, http.StatusBadRequest, "invalid request body", "") + return + } + if req.Name == "" { + writeError(w, http.StatusBadRequest, "name is required", "") + return + } + // Empty-members validation is owned by poolops.Create (returns + // ErrNoMembers, mapped to 400 by poolCreateError) so the rule has a + // single channel-agnostic source of truth. + strategy := "" + if req.Strategy != nil { + strategy = *req.Strategy + } + if err := poolops.Create(s.store, req.Name, strategy, req.Members); err != nil { + writeError(w, poolCreateError(err), err.Error(), "") + return + } + + // The create succeeded. Build the 201 body from the data we already have + // (request name + ordered members + the strategy poolops.Create applied) + // rather than gating the response on an independent GetPool read-back: a + // read-back error would otherwise tell the client the create FAILED when + // it actually succeeded, and a client retry would then 409 on the + // now-existing pool (Finding 7). poolops.Create defaults an empty + // strategy to store.PoolStrategyFailover, so mirror that here. The + // read-back is kept only as a best-effort enrichment for CreatedAt (an + // optional field); its failure no longer changes the status code. + effectiveStrategy := strategy + if effectiveStrategy == "" { + effectiveStrategy = store.PoolStrategyFailover + } + out := storePoolToAPI(store.Pool{ + Name: req.Name, + Strategy: effectiveStrategy, + Members: membersToStorePoolMembers(req.Members), + }) + if p, err := s.store.GetPool(req.Name); err == nil && p != nil { + out = storePoolToAPI(*p) + } + w.Header().Set("Content-Type", "application/json") + w.WriteHeader(http.StatusCreated) + _ = json.NewEncoder(w).Encode(out) +} + +// membersToStorePoolMembers maps an ordered credential-name slice (the create +// request order == failover order) to store.PoolMember rows with 0-based +// positions, so PostApiPools can render the 201 body without a store +// read-back (Finding 7). +func membersToStorePoolMembers(names []string) []store.PoolMember { + out := make([]store.PoolMember, len(names)) + for i, n := range names { + out[i] = store.PoolMember{Credential: n, Position: i} + } + return out +} + +// GetApiPoolsName returns the pool status (active member + per-member health), +// derived with the exact selection logic the proxy uses at injection time. +func (s *Server) GetApiPoolsName(w http.ResponseWriter, _ *http.Request, name string) { //nolint:revive // generated interface name + res, err := poolops.Status(s.store, name) + if err != nil { + writeError(w, poolStatusError(err), err.Error(), "") + return + } + w.Header().Set("Content-Type", "application/json") + _ = json.NewEncoder(w).Encode(poolStatusToAPI(res)) +} + +// PostApiPoolsNameRotate is the operator override: park the active member so +// the next member in position order becomes active. A concurrent membership +// change that invalidates the snapshot is a 409 (RotateRaceError); the +// operator should re-check the pool and retry. +func (s *Server) PostApiPoolsNameRotate(w http.ResponseWriter, _ *http.Request, name string) { //nolint:revive // generated interface name + res, err := poolops.Rotate(s.store, name) + if err != nil { + status := poolStatusError(err) + var race *poolops.RotateRaceError + if errors.As(err, &race) { + status = http.StatusConflict + } + writeError(w, status, err.Error(), "") + return + } + out := PoolRotateResult{Pool: res.Pool, From: res.From, To: res.To} + if !res.ParkedUntil.IsZero() { + t := res.ParkedUntil + out.ParkedUntil = &t + } + w.Header().Set("Content-Type", "application/json") + _ = json.NewEncoder(w).Encode(out) +} + +// DeleteApiPoolsName removes a pool. It refuses (409) while any binding still +// references it by name; an unknown pool is 404. On the 409 the structured +// list of referencing bindings (id + destination) is included in the response +// body, matching what the CLI and Telegram surfaces render from +// store.PoolReferencedError.Bindings (channel parity). +func (s *Server) DeleteApiPoolsName(w http.ResponseWriter, _ *http.Request, name string) { //nolint:revive // generated interface name + if err := poolops.Remove(s.store, name); err != nil { + var referenced *store.PoolReferencedError + if errors.As(err, &referenced) { + bindings := make([]PoolReferencingBinding, len(referenced.Bindings)) + for i, b := range referenced.Bindings { + bindings[i] = PoolReferencingBinding{Id: b.ID, Destination: b.Destination} + } + w.Header().Set("Content-Type", "application/json") + w.WriteHeader(http.StatusConflict) + // Dedicated 409 schema (Finding 8): the blocking-binding list + // lives on PoolReferencedErrorResponse, not the generic + // ErrorResponse envelope. + _ = json.NewEncoder(w).Encode(PoolReferencedErrorResponse{ + Error: err.Error(), + Bindings: bindings, + }) + return + } + writeError(w, poolStatusError(err), err.Error(), "") + return + } + w.WriteHeader(http.StatusNoContent) +} + // --- Audit handlers --- // GetApiAuditRecent returns the last N audit log entries. @@ -2016,6 +2221,52 @@ func storeMCPUpstreamToAPI(u store.MCPUpstreamRow) MCPUpstream { return upstream } +// storePoolToAPI converts a store.Pool to the API Pool type. +func storePoolToAPI(p store.Pool) Pool { + pool := Pool{ + Name: p.Name, + Strategy: p.Strategy, + Members: make([]PoolMember, len(p.Members)), + } + for i, m := range p.Members { + pool.Members[i] = PoolMember{Credential: m.Credential, Position: m.Position} + } + if p.CreatedAt != "" { + if t, err := time.Parse("2006-01-02 15:04:05", p.CreatedAt); err == nil { + pool.CreatedAt = &t + } + } + return pool +} + +// poolStatusToAPI converts a *poolops.StatusResult to the API PoolStatus type. +func poolStatusToAPI(res *poolops.StatusResult) PoolStatus { + out := PoolStatus{ + Name: res.Name, + Strategy: res.Strategy, + Active: res.Active, + Members: make([]PoolMemberStatus, len(res.Members)), + } + for i, m := range res.Members { + ms := PoolMemberStatus{ + Credential: m.Credential, + Position: m.Position, + Active: m.Active, + State: m.State, + } + if !m.CooldownUntil.IsZero() { + t := m.CooldownUntil + ms.CooldownUntil = &t + } + if m.LastFailureReason != "" { + r := m.LastFailureReason + ms.LastFailureReason = &r + } + out.Members[i] = ms + } + return out +} + // storeChannelToAPI converts a store.Channel to the API Channel type. func storeChannelToAPI(ch store.Channel) Channel { var chType ChannelType diff --git a/internal/api/server_test.go b/internal/api/server_test.go index 2b1dba3..e80b349 100644 --- a/internal/api/server_test.go +++ b/internal/api/server_test.go @@ -2680,6 +2680,474 @@ func TestDeleteApiMcpUpstreams_NotFound(t *testing.T) { } } +// --- Pool handler tests --- + +// seedOAuthCred registers oauth credential_meta rows so the store-side pool +// member validation (oauth + non-empty token_url) passes. Mirrors how the +// real REST flow creates an OAuth credential before pooling it. +func seedOAuthCred(t *testing.T, st *store.Store, names ...string) { + t.Helper() + for _, n := range names { + if err := st.AddCredentialMeta(n, "oauth", "https://auth.example.com/token"); err != nil { + t.Fatalf("seed oauth credential meta %q: %v", n, err) + } + } +} + +func TestGetApiPools_Empty(t *testing.T) { + st := newTestStore(t) + enableHTTPChannel(t, st) + srv := api.NewServer(st, nil, nil, "") + + t.Setenv("SLUICE_API_TOKEN", "tok") + handler := newTestHandler(t, srv, st) + + req := httptest.NewRequest("GET", "/api/pools", nil) + req.Header.Set("Authorization", "Bearer tok") + rec := httptest.NewRecorder() + handler.ServeHTTP(rec, req) + + if rec.Code != http.StatusOK { + t.Fatalf("expected 200, got %d: %s", rec.Code, rec.Body.String()) + } + var pools []api.Pool + if err := json.NewDecoder(rec.Body).Decode(&pools); err != nil { + t.Fatalf("decode: %v", err) + } + if len(pools) != 0 { + t.Errorf("expected 0 pools, got %d", len(pools)) + } +} + +func TestPostApiPools_Success(t *testing.T) { + st := newTestStore(t) + enableHTTPChannel(t, st) + seedOAuthCred(t, st, "credA", "credB") + srv := api.NewServer(st, nil, nil, "") + + t.Setenv("SLUICE_API_TOKEN", "tok") + handler := newTestHandler(t, srv, st) + + body := `{"name": "openai_pool", "members": ["credA", "credB"]}` + req := httptest.NewRequest("POST", "/api/pools", strings.NewReader(body)) + req.Header.Set("Authorization", "Bearer tok") + req.Header.Set("Content-Type", "application/json") + rec := httptest.NewRecorder() + handler.ServeHTTP(rec, req) + + if rec.Code != http.StatusCreated { + t.Fatalf("expected 201, got %d: %s", rec.Code, rec.Body.String()) + } + var p api.Pool + if err := json.NewDecoder(rec.Body).Decode(&p); err != nil { + t.Fatalf("decode: %v", err) + } + if p.Name != "openai_pool" { + t.Errorf("expected name openai_pool, got %q", p.Name) + } + if p.Strategy != store.PoolStrategyFailover { + t.Errorf("expected strategy %q, got %q", store.PoolStrategyFailover, p.Strategy) + } + if len(p.Members) != 2 || p.Members[0].Credential != "credA" || p.Members[1].Credential != "credB" { + t.Errorf("unexpected members: %+v", p.Members) + } +} + +func TestPostApiPools_MissingMembers(t *testing.T) { + st := newTestStore(t) + enableHTTPChannel(t, st) + srv := api.NewServer(st, nil, nil, "") + + t.Setenv("SLUICE_API_TOKEN", "tok") + handler := newTestHandler(t, srv, st) + + body := `{"name": "p1", "members": []}` + req := httptest.NewRequest("POST", "/api/pools", strings.NewReader(body)) + req.Header.Set("Authorization", "Bearer tok") + req.Header.Set("Content-Type", "application/json") + rec := httptest.NewRecorder() + handler.ServeHTTP(rec, req) + + if rec.Code != http.StatusBadRequest { + t.Errorf("expected 400, got %d: %s", rec.Code, rec.Body.String()) + } +} + +func TestPostApiPools_StaticMemberRejected(t *testing.T) { + st := newTestStore(t) + enableHTTPChannel(t, st) + seedCred(t, st, "static_one") // static cred_type + srv := api.NewServer(st, nil, nil, "") + + t.Setenv("SLUICE_API_TOKEN", "tok") + handler := newTestHandler(t, srv, st) + + body := `{"name": "p1", "members": ["static_one"]}` + req := httptest.NewRequest("POST", "/api/pools", strings.NewReader(body)) + req.Header.Set("Authorization", "Bearer tok") + req.Header.Set("Content-Type", "application/json") + rec := httptest.NewRecorder() + handler.ServeHTTP(rec, req) + + if rec.Code != http.StatusBadRequest { + t.Errorf("expected 400 for static member, got %d: %s", rec.Code, rec.Body.String()) + } +} + +// TestPostApiPools_DuplicateName asserts that creating a pool whose name is +// already taken returns 409 Conflict per the OpenAPI contract (POST +// /api/pools: "Pool name collides or a member is already pooled"), not 400. +func TestPostApiPools_DuplicateName(t *testing.T) { + st := newTestStore(t) + enableHTTPChannel(t, st) + seedOAuthCred(t, st, "credA", "credB") + if err := st.CreatePoolWithMembers("dup_pool", store.PoolStrategyFailover, []string{"credA"}); err != nil { + t.Fatalf("seed pool: %v", err) + } + srv := api.NewServer(st, nil, nil, "") + + t.Setenv("SLUICE_API_TOKEN", "tok") + handler := newTestHandler(t, srv, st) + + body := `{"name": "dup_pool", "members": ["credB"]}` + req := httptest.NewRequest("POST", "/api/pools", strings.NewReader(body)) + req.Header.Set("Authorization", "Bearer tok") + req.Header.Set("Content-Type", "application/json") + rec := httptest.NewRecorder() + handler.ServeHTTP(rec, req) + + if rec.Code != http.StatusConflict { + t.Fatalf("expected 409 for duplicate pool name, got %d: %s", rec.Code, rec.Body.String()) + } +} + +// TestPostApiPools_NameCollidesWithCredential asserts the pool/credential +// shared-namespace collision is a 409 Conflict, not 400. +func TestPostApiPools_NameCollidesWithCredential(t *testing.T) { + st := newTestStore(t) + enableHTTPChannel(t, st) + seedOAuthCred(t, st, "credA", "credB") + srv := api.NewServer(st, nil, nil, "") + + t.Setenv("SLUICE_API_TOKEN", "tok") + handler := newTestHandler(t, srv, st) + + // "credA" already exists as a credential; a pool may not shadow it. + body := `{"name": "credA", "members": ["credB"]}` + req := httptest.NewRequest("POST", "/api/pools", strings.NewReader(body)) + req.Header.Set("Authorization", "Bearer tok") + req.Header.Set("Content-Type", "application/json") + rec := httptest.NewRecorder() + handler.ServeHTTP(rec, req) + + if rec.Code != http.StatusConflict { + t.Fatalf("expected 409 for name colliding with a credential, got %d: %s", rec.Code, rec.Body.String()) + } +} + +// TestPostApiPools_MemberAlreadyPooled asserts that re-pooling a credential +// that already belongs to another pool returns 409 Conflict per the spec. +func TestPostApiPools_MemberAlreadyPooled(t *testing.T) { + st := newTestStore(t) + enableHTTPChannel(t, st) + seedOAuthCred(t, st, "credA", "credB") + if err := st.CreatePoolWithMembers("pool_one", store.PoolStrategyFailover, []string{"credA"}); err != nil { + t.Fatalf("seed pool: %v", err) + } + srv := api.NewServer(st, nil, nil, "") + + t.Setenv("SLUICE_API_TOKEN", "tok") + handler := newTestHandler(t, srv, st) + + // credA is already a member of pool_one; a credential may belong to at + // most one pool. + body := `{"name": "pool_two", "members": ["credA", "credB"]}` + req := httptest.NewRequest("POST", "/api/pools", strings.NewReader(body)) + req.Header.Set("Authorization", "Bearer tok") + req.Header.Set("Content-Type", "application/json") + rec := httptest.NewRecorder() + handler.ServeHTTP(rec, req) + + if rec.Code != http.StatusConflict { + t.Fatalf("expected 409 for already-pooled member, got %d: %s", rec.Code, rec.Body.String()) + } +} + +// TestPostApiPools_UnknownMember asserts a member credential that does not +// exist is a 400 client error (store.ErrPoolMemberNotFound), not 500. +func TestPostApiPools_UnknownMember(t *testing.T) { + st := newTestStore(t) + enableHTTPChannel(t, st) + srv := api.NewServer(st, nil, nil, "") + + t.Setenv("SLUICE_API_TOKEN", "tok") + handler := newTestHandler(t, srv, st) + + body := `{"name": "p1", "members": ["nope_does_not_exist"]}` + req := httptest.NewRequest("POST", "/api/pools", strings.NewReader(body)) + req.Header.Set("Authorization", "Bearer tok") + req.Header.Set("Content-Type", "application/json") + rec := httptest.NewRecorder() + handler.ServeHTTP(rec, req) + + if rec.Code != http.StatusBadRequest { + t.Fatalf("expected 400 for unknown member, got %d: %s", rec.Code, rec.Body.String()) + } +} + +// TestPostApiPools_DuplicateMember asserts a member listed twice in the +// submitted list is a 400 client error (store.ErrPoolMemberDuplicate). +func TestPostApiPools_DuplicateMember(t *testing.T) { + st := newTestStore(t) + enableHTTPChannel(t, st) + seedOAuthCred(t, st, "credA") + srv := api.NewServer(st, nil, nil, "") + + t.Setenv("SLUICE_API_TOKEN", "tok") + handler := newTestHandler(t, srv, st) + + body := `{"name": "p1", "members": ["credA", "credA"]}` + req := httptest.NewRequest("POST", "/api/pools", strings.NewReader(body)) + req.Header.Set("Authorization", "Bearer tok") + req.Header.Set("Content-Type", "application/json") + rec := httptest.NewRecorder() + handler.ServeHTTP(rec, req) + + if rec.Code != http.StatusBadRequest { + t.Fatalf("expected 400 for duplicate member in list, got %d: %s", rec.Code, rec.Body.String()) + } +} + +// TestPostApiPools_InternalErrorIs500 asserts a genuine internal failure +// (the DB is closed, so CreatePoolWithMembers' tx Begin fails with a +// sentinel-free wrapped "begin tx: %w" error) surfaces as 500, NOT a +// misleading 400. This is the Finding 1 regression guard at the HTTP layer. +func TestPostApiPools_InternalErrorIs500(t *testing.T) { + st := newTestStore(t) + enableHTTPChannel(t, st) + seedOAuthCred(t, st, "credA", "credB") + srv := api.NewServer(st, nil, nil, "") + + t.Setenv("SLUICE_API_TOKEN", "tok") + handler := newTestHandler(t, srv, st) + + // Close the underlying DB so the create transaction fails with a + // non-sentinel internal error inside store.CreatePoolWithMembers. The + // request body is otherwise fully valid, so a 400 here would mean an + // internal failure was wrongly downgraded to a client error. + if err := st.Close(); err != nil { + t.Fatalf("close store: %v", err) + } + + body := `{"name": "ok_pool", "members": ["credA", "credB"]}` + req := httptest.NewRequest("POST", "/api/pools", strings.NewReader(body)) + req.Header.Set("Authorization", "Bearer tok") + req.Header.Set("Content-Type", "application/json") + rec := httptest.NewRecorder() + handler.ServeHTTP(rec, req) + + if rec.Code != http.StatusInternalServerError { + t.Fatalf("expected 500 for internal DB error, got %d: %s", rec.Code, rec.Body.String()) + } +} + +func TestGetApiPoolsName_Status(t *testing.T) { + st := newTestStore(t) + enableHTTPChannel(t, st) + seedOAuthCred(t, st, "credA", "credB") + if err := st.CreatePoolWithMembers("pool1", store.PoolStrategyFailover, []string{"credA", "credB"}); err != nil { + t.Fatalf("create pool: %v", err) + } + srv := api.NewServer(st, nil, nil, "") + + t.Setenv("SLUICE_API_TOKEN", "tok") + handler := newTestHandler(t, srv, st) + + req := httptest.NewRequest("GET", "/api/pools/pool1", nil) + req.Header.Set("Authorization", "Bearer tok") + rec := httptest.NewRecorder() + handler.ServeHTTP(rec, req) + + if rec.Code != http.StatusOK { + t.Fatalf("expected 200, got %d: %s", rec.Code, rec.Body.String()) + } + var ps api.PoolStatus + if err := json.NewDecoder(rec.Body).Decode(&ps); err != nil { + t.Fatalf("decode: %v", err) + } + if ps.Name != "pool1" { + t.Errorf("expected name pool1, got %q", ps.Name) + } + if ps.Active != "credA" { + t.Errorf("expected active credA (first member, all healthy), got %q", ps.Active) + } + if len(ps.Members) != 2 { + t.Fatalf("expected 2 member statuses, got %d", len(ps.Members)) + } + if !ps.Members[0].Active || ps.Members[0].State != "healthy" { + t.Errorf("member[0] unexpected: %+v", ps.Members[0]) + } +} + +func TestGetApiPoolsName_NotFound(t *testing.T) { + st := newTestStore(t) + enableHTTPChannel(t, st) + srv := api.NewServer(st, nil, nil, "") + + t.Setenv("SLUICE_API_TOKEN", "tok") + handler := newTestHandler(t, srv, st) + + req := httptest.NewRequest("GET", "/api/pools/nope", nil) + req.Header.Set("Authorization", "Bearer tok") + rec := httptest.NewRecorder() + handler.ServeHTTP(rec, req) + + if rec.Code != http.StatusNotFound { + t.Errorf("expected 404, got %d: %s", rec.Code, rec.Body.String()) + } +} + +func TestPostApiPoolsNameRotate_Success(t *testing.T) { + st := newTestStore(t) + enableHTTPChannel(t, st) + seedOAuthCred(t, st, "credA", "credB") + if err := st.CreatePoolWithMembers("pool1", store.PoolStrategyFailover, []string{"credA", "credB"}); err != nil { + t.Fatalf("create pool: %v", err) + } + srv := api.NewServer(st, nil, nil, "") + + t.Setenv("SLUICE_API_TOKEN", "tok") + handler := newTestHandler(t, srv, st) + + req := httptest.NewRequest("POST", "/api/pools/pool1/rotate", nil) + req.Header.Set("Authorization", "Bearer tok") + rec := httptest.NewRecorder() + handler.ServeHTTP(rec, req) + + if rec.Code != http.StatusOK { + t.Fatalf("expected 200, got %d: %s", rec.Code, rec.Body.String()) + } + var rr api.PoolRotateResult + if err := json.NewDecoder(rec.Body).Decode(&rr); err != nil { + t.Fatalf("decode: %v", err) + } + if rr.Pool != "pool1" { + t.Errorf("expected pool pool1, got %q", rr.Pool) + } + if rr.From != "credA" { + t.Errorf("expected from credA, got %q", rr.From) + } + if rr.To != "credB" { + t.Errorf("expected to credB after rotate, got %q", rr.To) + } +} + +func TestPostApiPoolsNameRotate_NotFound(t *testing.T) { + st := newTestStore(t) + enableHTTPChannel(t, st) + srv := api.NewServer(st, nil, nil, "") + + t.Setenv("SLUICE_API_TOKEN", "tok") + handler := newTestHandler(t, srv, st) + + req := httptest.NewRequest("POST", "/api/pools/nope/rotate", nil) + req.Header.Set("Authorization", "Bearer tok") + rec := httptest.NewRecorder() + handler.ServeHTTP(rec, req) + + if rec.Code != http.StatusNotFound { + t.Errorf("expected 404, got %d: %s", rec.Code, rec.Body.String()) + } +} + +func TestDeleteApiPoolsName_Success(t *testing.T) { + st := newTestStore(t) + enableHTTPChannel(t, st) + seedOAuthCred(t, st, "credA", "credB") + if err := st.CreatePoolWithMembers("pool1", store.PoolStrategyFailover, []string{"credA", "credB"}); err != nil { + t.Fatalf("create pool: %v", err) + } + srv := api.NewServer(st, nil, nil, "") + + t.Setenv("SLUICE_API_TOKEN", "tok") + handler := newTestHandler(t, srv, st) + + req := httptest.NewRequest("DELETE", "/api/pools/pool1", nil) + req.Header.Set("Authorization", "Bearer tok") + rec := httptest.NewRecorder() + handler.ServeHTTP(rec, req) + + if rec.Code != http.StatusNoContent { + t.Fatalf("expected 204, got %d: %s", rec.Code, rec.Body.String()) + } + pools, err := st.ListPools() + if err != nil { + t.Fatalf("list pools: %v", err) + } + if len(pools) != 0 { + t.Errorf("expected 0 pools after delete, got %d", len(pools)) + } +} + +func TestDeleteApiPoolsName_NotFound(t *testing.T) { + st := newTestStore(t) + enableHTTPChannel(t, st) + srv := api.NewServer(st, nil, nil, "") + + t.Setenv("SLUICE_API_TOKEN", "tok") + handler := newTestHandler(t, srv, st) + + req := httptest.NewRequest("DELETE", "/api/pools/nope", nil) + req.Header.Set("Authorization", "Bearer tok") + rec := httptest.NewRecorder() + handler.ServeHTTP(rec, req) + + if rec.Code != http.StatusNotFound { + t.Errorf("expected 404, got %d: %s", rec.Code, rec.Body.String()) + } +} + +func TestDeleteApiPoolsName_ReferencedByBinding(t *testing.T) { + st := newTestStore(t) + enableHTTPChannel(t, st) + seedOAuthCred(t, st, "credA", "credB") + if err := st.CreatePoolWithMembers("pool1", store.PoolStrategyFailover, []string{"credA", "credB"}); err != nil { + t.Fatalf("create pool: %v", err) + } + // A binding referencing the pool by name keeps it from being removed. + if _, err := st.AddBinding("api.example.com", "pool1", store.BindingOpts{}); err != nil { + t.Fatalf("add binding: %v", err) + } + srv := api.NewServer(st, nil, nil, "") + + t.Setenv("SLUICE_API_TOKEN", "tok") + handler := newTestHandler(t, srv, st) + + req := httptest.NewRequest("DELETE", "/api/pools/pool1", nil) + req.Header.Set("Authorization", "Bearer tok") + rec := httptest.NewRecorder() + handler.ServeHTTP(rec, req) + + if rec.Code != http.StatusConflict { + t.Fatalf("expected 409 (pool still referenced by a binding), got %d: %s", rec.Code, rec.Body.String()) + } + // Finding 8: the 409 body is the dedicated PoolReferencedErrorResponse, + // not the generic ErrorResponse (which no longer carries bindings). + var er api.PoolReferencedErrorResponse + if err := json.NewDecoder(rec.Body).Decode(&er); err != nil { + t.Fatalf("decode error response: %v", err) + } + if len(er.Bindings) != 1 { + t.Fatalf("expected 1 structured referencing binding, got %+v", er.Bindings) + } + if er.Bindings[0].Destination != "api.example.com" { + t.Errorf("expected binding destination api.example.com, got %q", er.Bindings[0].Destination) + } + if er.Bindings[0].Id == 0 { + t.Errorf("expected non-zero binding id, got %d", er.Bindings[0].Id) + } +} + // --- Audit handler tests --- func TestGetApiAuditRecent_NoPath(t *testing.T) { diff --git a/internal/poolops/poolops.go b/internal/poolops/poolops.go new file mode 100644 index 0000000..7e9f691 --- /dev/null +++ b/internal/poolops/poolops.go @@ -0,0 +1,249 @@ +// Package poolops holds the channel-agnostic credential-pool operations +// (create / list / status / rotate / remove). CLI, the REST API, and the +// Telegram bot are all thin adapters over this package so the three +// management surfaces cannot drift. +// +// The epoch-guarded rotate write and the ResolveActive-based status +// derivation live here exactly once. This is the fix for the historical +// parity gap where the logic was written inline in cmd/sluice and could not +// be reused by other channels. +package poolops + +import ( + "errors" + "fmt" + "strings" + "time" + + "github.com/nemirovsky/sluice/internal/store" + "github.com/nemirovsky/sluice/internal/vault" +) + +// Store is the subset of *store.Store the pool operations need. Keeping it an +// interface lets each channel pass its own already-open store and lets the +// tests substitute a fake. +type Store interface { + CreatePoolWithMembers(name, strategy string, members []string) error + GetPool(name string) (*store.Pool, error) + ListPools() ([]store.Pool, error) + RemovePoolIfUnreferenced(name string) (bool, error) + ListCredentialHealth() ([]store.CredentialHealth, error) + SetCredentialHealthIfPoolMemberEpoch(credential, pool string, epoch int64, status string, cooldownUntil time.Time, reason string) (bool, error) +} + +// ErrNoMembers is returned by Create when the member list is empty. +var ErrNoMembers = errors.New("at least one member credential is required") + +// PoolNotFoundError is returned by Status/Rotate/Remove when the named pool +// does not exist. Channels map this to a 404 / "not found" message. +type PoolNotFoundError struct { + Name string +} + +func (e *PoolNotFoundError) Error() string { + return fmt.Sprintf("pool %q not found", e.Name) +} + +// RotateRaceError is returned by Rotate when a concurrent pool/member +// removal or cross-pool re-add invalidated the snapshot the rotate was +// computed from, so nothing was persisted. The operator should re-check the +// pool and retry. +type RotateRaceError struct { + Pool string + Member string + Epoch int64 +} + +func (e *RotateRaceError) Error() string { + // Channel-neutral: poolops is shared by CLI, REST and Telegram, so the + // message states the condition without prescribing a CLI command. Each + // channel adds its own remediation hint at its adapter layer. + return fmt.Sprintf("pool %q rotate raced a concurrent pool/member removal or re-add: %q is no longer a live member of pool %q at the snapshotted epoch %d, so nothing was persisted", + e.Pool, e.Member, e.Pool, e.Epoch) +} + +// MemberStatus is one member's view in a StatusResult. +type MemberStatus struct { + Credential string + Position int + Active bool + State string // "healthy", "cooldown", "healthy (cooldown expired)" + CooldownUntil time.Time + LastFailureReason string +} + +// StatusResult is the channel-agnostic pool status. Channels render it +// however suits their surface (CLI text, JSON, Telegram). +type StatusResult struct { + Name string + Strategy string + Active string + Members []MemberStatus +} + +// RotateResult is the outcome of a successful Rotate. +type RotateResult struct { + Pool string + From string + To string + ParkedUntil time.Time +} + +// ParseMembers splits a comma-separated member list into a trimmed slice, +// rejecting empty entries. Shared by every channel that accepts the list as a +// single string (CLI --members, Telegram args, REST CSV fallback). +func ParseMembers(membersStr string) ([]string, error) { + if strings.TrimSpace(membersStr) == "" { + return nil, ErrNoMembers + } + var members []string + for _, m := range strings.Split(membersStr, ",") { + m = strings.TrimSpace(m) + if m == "" { + return nil, fmt.Errorf("empty credential name in members list") + } + members = append(members, m) + } + return members, nil +} + +// Create creates a pool with the given ordered members. An empty strategy +// defaults to the only supported strategy (failover). Sentinel errors from +// the store (namespace collision, static member, unknown member) propagate +// unchanged so channels can map them. +func Create(s Store, name, strategy string, members []string) error { + if strategy == "" { + strategy = store.PoolStrategyFailover + } + if len(members) == 0 { + return ErrNoMembers + } + return s.CreatePoolWithMembers(name, strategy, members) +} + +// List returns every configured pool, ordered as the store returns them. +func List(s Store) ([]store.Pool, error) { + return s.ListPools() +} + +// Status derives the active member with the exact same selection logic the +// proxy uses at injection time (vault.NewPoolResolver.ResolveActive) so the +// reported status never disagrees with what would actually be injected. +func Status(s Store, name string) (*StatusResult, error) { + p, err := s.GetPool(name) + if err != nil { + return nil, err + } + if p == nil { + return nil, &PoolNotFoundError{Name: name} + } + healthRows, err := s.ListCredentialHealth() + if err != nil { + return nil, err + } + + resolver := vault.NewPoolResolver([]store.Pool{*p}, healthRows) + active, _ := resolver.ResolveActive(name) + + healthByCred := make(map[string]store.CredentialHealth, len(healthRows)) + for _, h := range healthRows { + healthByCred[h.Credential] = h + } + + now := time.Now() + res := &StatusResult{Name: p.Name, Strategy: p.Strategy, Active: active} + for _, m := range p.Members { + ms := MemberStatus{ + Credential: m.Credential, + Position: m.Position, + Active: m.Credential == active, + State: "healthy", + } + if h, ok := healthByCred[m.Credential]; ok && h.Status == "cooldown" && !h.CooldownUntil.IsZero() { + ms.CooldownUntil = h.CooldownUntil + ms.LastFailureReason = h.LastFailureReason + if h.CooldownUntil.After(now) { + ms.State = "cooldown" + } else { + ms.State = "healthy (cooldown expired)" + } + } + res.Members = append(res.Members, ms) + } + return res, nil +} + +// Rotate is the operator override: park the current active member so the +// next member in position order becomes active. The cooldown lapses on its +// own (lazy recovery, same as auto-failover), so a rotated-away member +// rejoins the rotation once its cooldown expires. +// +// The write is the pool+epoch scoped guarded write, NOT the unconditional +// SetCredentialHealth and NOT a name-only guard. `active` is resolved from a +// snapshot; another process could remove this pool (or this member from it) +// AND re-add the same name into a DIFFERENT pool between that snapshot and +// this write. Gating on exactly (active, this pool, that epoch) makes a +// raced removal/re-add a no-op (wrote=false) instead of silently parking an +// unrelated pool's member; the caller gets a RotateRaceError. +func Rotate(s Store, name string) (*RotateResult, error) { + p, err := s.GetPool(name) + if err != nil { + return nil, err + } + if p == nil { + return nil, &PoolNotFoundError{Name: name} + } + healthRows, err := s.ListCredentialHealth() + if err != nil { + return nil, err + } + resolver := vault.NewPoolResolver([]store.Pool{*p}, healthRows) + active, ok := resolver.ResolveActive(name) + if !ok || active == "" { + return nil, fmt.Errorf("pool %q has no resolvable member to rotate away from", name) + } + + var rotateEpoch int64 = -1 + for _, m := range p.Members { + if m.Credential == active { + rotateEpoch = m.Epoch + break + } + } + if rotateEpoch < 0 { + return nil, fmt.Errorf("pool %q rotate: resolved active member %q is not in the pool snapshot (membership changed under the rotate)", name, active) + } + until := time.Now().Add(vault.AuthFailCooldown) + wrote, err := s.SetCredentialHealthIfPoolMemberEpoch(active, name, rotateEpoch, "cooldown", until, vault.ManualRotateReason) + if err != nil { + return nil, err + } + if !wrote { + return nil, &RotateRaceError{Pool: name, Member: active, Epoch: rotateEpoch} + } + + // Recompute the new active member for operator feedback. + healthRows, err = s.ListCredentialHealth() + if err != nil { + return nil, err + } + resolver = vault.NewPoolResolver([]store.Pool{*p}, healthRows) + next, _ := resolver.ResolveActive(name) + return &RotateResult{Pool: name, From: active, To: next, ParkedUntil: until}, nil +} + +// Remove deletes the pool, refusing (with a *store.PoolReferencedError) while +// any binding still references it by name. The store method folds the +// reference check and the delete into one transaction; this function only +// surfaces the typed errors. Returns a *PoolNotFoundError if the pool does +// not exist. +func Remove(s Store, name string) error { + removed, err := s.RemovePoolIfUnreferenced(name) + if err != nil { + return err + } + if !removed { + return &PoolNotFoundError{Name: name} + } + return nil +} diff --git a/internal/poolops/poolops_test.go b/internal/poolops/poolops_test.go new file mode 100644 index 0000000..673f867 --- /dev/null +++ b/internal/poolops/poolops_test.go @@ -0,0 +1,226 @@ +package poolops_test + +import ( + "errors" + "path/filepath" + "testing" + "time" + + "github.com/nemirovsky/sluice/internal/poolops" + "github.com/nemirovsky/sluice/internal/store" + "github.com/nemirovsky/sluice/internal/vault" +) + +// newTestStore opens a fresh sqlite store and a paired vault dir, then seeds +// `creds` as oauth members so they are valid pool members. +func newTestStore(t *testing.T, creds ...string) *store.Store { + t.Helper() + dir := t.TempDir() + dbPath := filepath.Join(dir, "sluice.db") + db, err := store.New(dbPath) + if err != nil { + t.Fatalf("open store: %v", err) + } + for _, c := range creds { + if err := db.AddCredentialMeta(c, "oauth", "https://auth.example.com/token"); err != nil { + t.Fatalf("add meta %q: %v", c, err) + } + } + vs, err := vault.NewStore(dir) + if err != nil { + t.Fatalf("open vault: %v", err) + } + for _, c := range creds { + if _, err := vs.Add(c, `{"access_token":"x","token_url":"https://auth.example.com/token"}`); err != nil { + t.Fatalf("vault add %q: %v", c, err) + } + } + t.Cleanup(func() { _ = db.Close() }) + return db +} + +func TestParseMembers(t *testing.T) { + got, err := poolops.ParseMembers(" a , b ,c") + if err != nil { + t.Fatalf("ParseMembers: %v", err) + } + if len(got) != 3 || got[0] != "a" || got[1] != "b" || got[2] != "c" { + t.Fatalf("ParseMembers = %v", got) + } + if _, err := poolops.ParseMembers(""); !errors.Is(err, poolops.ErrNoMembers) { + t.Fatalf("empty members err = %v, want ErrNoMembers", err) + } + if _, err := poolops.ParseMembers("a,,b"); err == nil { + t.Fatalf("expected error for empty entry in members list") + } +} + +func TestCreateListStatusRotateRemove(t *testing.T) { + db := newTestStore(t, "acct_a", "acct_b") + + if err := poolops.Create(db, "codex", "", []string{"acct_a", "acct_b"}); err != nil { + t.Fatalf("Create: %v", err) + } + + pools, err := poolops.List(db) + if err != nil { + t.Fatalf("List: %v", err) + } + if len(pools) != 1 || pools[0].Name != "codex" { + t.Fatalf("List = %+v", pools) + } + + st, err := poolops.Status(db, "codex") + if err != nil { + t.Fatalf("Status: %v", err) + } + if st.Active != "acct_a" { + t.Fatalf("Status.Active = %q, want acct_a", st.Active) + } + if len(st.Members) != 2 || !st.Members[0].Active || st.Members[0].State != "healthy" { + t.Fatalf("Status.Members = %+v", st.Members) + } + + rr, err := poolops.Rotate(db, "codex") + if err != nil { + t.Fatalf("Rotate: %v", err) + } + if rr.From != "acct_a" || rr.To != "acct_b" { + t.Fatalf("Rotate = %+v, want acct_a -> acct_b", rr) + } + + st, err = poolops.Status(db, "codex") + if err != nil { + t.Fatalf("Status post-rotate: %v", err) + } + if st.Active != "acct_b" { + t.Fatalf("post-rotate Active = %q, want acct_b", st.Active) + } + // acct_a now in cooldown. + if st.Members[0].State != "cooldown" || st.Members[0].LastFailureReason != vault.ManualRotateReason { + t.Fatalf("post-rotate Members[0] = %+v", st.Members[0]) + } + + if err := poolops.Remove(db, "codex"); err != nil { + t.Fatalf("Remove: %v", err) + } + if _, err := poolops.Status(db, "codex"); err == nil { + t.Fatalf("Status after remove: err = nil, want PoolNotFoundError") + } +} + +func TestCreateErrors(t *testing.T) { + db := newTestStore(t, "acct_a") + + if err := poolops.Create(db, "p", "", nil); !errors.Is(err, poolops.ErrNoMembers) { + t.Fatalf("Create empty members err = %v, want ErrNoMembers", err) + } + + // Static member rejected by the store. + if err := db.AddCredentialMeta("static_one", "static", ""); err != nil { + t.Fatalf("add static meta: %v", err) + } + if err := poolops.Create(db, "p2", "", []string{"static_one"}); err == nil { + t.Fatalf("Create with static member: err = nil, want rejection") + } + + // Namespace collision: a pool name equal to an existing credential. + if err := poolops.Create(db, "acct_a", "", []string{"acct_a"}); err == nil { + t.Fatalf("Create with name colliding with a credential: err = nil, want rejection") + } +} + +func TestStatusRotateRemoveUnknownPool(t *testing.T) { + db := newTestStore(t) + + _, err := poolops.Status(db, "missing") + var nf *poolops.PoolNotFoundError + if !errors.As(err, &nf) { + t.Fatalf("Status unknown pool err = %v, want PoolNotFoundError", err) + } + + _, err = poolops.Rotate(db, "missing") + if !errors.As(err, &nf) { + t.Fatalf("Rotate unknown pool err = %v, want PoolNotFoundError", err) + } + + err = poolops.Remove(db, "missing") + if !errors.As(err, &nf) { + t.Fatalf("Remove unknown pool err = %v, want PoolNotFoundError", err) + } +} + +func TestRemoveBlockedByBinding(t *testing.T) { + db := newTestStore(t, "acct_a", "acct_b") + if err := poolops.Create(db, "codex", "", []string{"acct_a", "acct_b"}); err != nil { + t.Fatalf("Create: %v", err) + } + // A binding whose credential column holds the pool name blocks removal. + if _, err := db.AddBinding("api.example.com", "codex", store.BindingOpts{ + Ports: []int{443}, + Header: "Authorization", + Template: "Bearer {value}", + }); err != nil { + t.Fatalf("AddBinding: %v", err) + } + err := poolops.Remove(db, "codex") + var refErr *store.PoolReferencedError + if !errors.As(err, &refErr) { + t.Fatalf("Remove with referencing binding err = %v, want *PoolReferencedError", err) + } +} + +// TestRotateEpochRaceNoOp reproduces the post-race store state the guarded +// write observes: the pool snapshot resolved a member, but by write time the +// member row was deleted from credential_pool_members. The guarded write must +// no-op and Rotate must return a RotateRaceError, persisting nothing. +func TestRotateEpochRaceNoOp(t *testing.T) { + db := newTestStore(t, "acct_a", "acct_b") + if err := poolops.Create(db, "codex", "", []string{"acct_a", "acct_b"}); err != nil { + t.Fatalf("Create: %v", err) + } + // Drop membership rows directly, leaving the credential_pools row intact: + // exactly the state a concurrent member removal leaves. GetPool then + // returns codex with NO members, so Rotate reports no resolvable member + // and writes nothing. + if _, err := db.DB().Exec("DELETE FROM credential_pool_members WHERE pool = 'codex'"); err != nil { + t.Fatalf("delete membership rows: %v", err) + } + if _, err := poolops.Rotate(db, "codex"); err == nil { + t.Fatalf("Rotate against vanished members: err = nil, want failure") + } + // No health row resurrected for the vanished members. + rows, err := db.ListCredentialHealth() + if err != nil { + t.Fatalf("ListCredentialHealth: %v", err) + } + for _, r := range rows { + if (r.Credential == "acct_a" || r.Credential == "acct_b") && r.Status == "cooldown" { + t.Fatalf("vanished member %q got a resurrected cooldown: %+v", r.Credential, r) + } + } +} + +// fakeRaceStore returns a live snapshot from GetPool but forces the +// epoch-guarded write to report wrote=false, simulating a cross-pool re-add +// that advanced the epoch between snapshot and write. Rotate must surface a +// *RotateRaceError. +type fakeRaceStore struct { + poolops.Store +} + +func (f fakeRaceStore) SetCredentialHealthIfPoolMemberEpoch(string, string, int64, string, time.Time, string) (bool, error) { + return false, nil +} + +func TestRotateRaceErrorOnGuardedWriteNoOp(t *testing.T) { + db := newTestStore(t, "acct_a", "acct_b") + if err := poolops.Create(db, "codex", "", []string{"acct_a", "acct_b"}); err != nil { + t.Fatalf("Create: %v", err) + } + _, err := poolops.Rotate(fakeRaceStore{Store: db}, "codex") + var re *poolops.RotateRaceError + if !errors.As(err, &re) { + t.Fatalf("Rotate with no-op guarded write err = %v, want *RotateRaceError", err) + } +} diff --git a/internal/proxy/addon.go b/internal/proxy/addon.go index b55154a..d067015 100644 --- a/internal/proxy/addon.go +++ b/internal/proxy/addon.go @@ -754,7 +754,7 @@ func (a *SluiceAddon) Request(f *mitmproxy.Flow) { proto := a.detectRequestProtocol(f, port) protoStr := proto.String() - pairs := a.buildPhantomPairs(host, port, protoStr, f.Request.URL) + pairs := a.buildPhantomPairs(host, port, protoStr, f.Request.URL, requestFlowGrantType(f, a.oauthIndex.Load())) if len(pairs) == 0 && !a.hasPhantomPrefix(f) { return } @@ -834,7 +834,26 @@ func (a *SluiceAddon) StreamRequestModifier(f *mitmproxy.Flow, in io.Reader) io. proto := a.detectRequestProtocol(f, port) protoStr := proto.String() - pairs := a.buildPhantomPairs(host, port, protoStr, f.Request.URL) + // DELIBERATE LIMITATION (Finding 11): the pool token-host refresh-phantom + // expansion in buildPhantomPairs is gated on grant_type=="refresh_token", + // and grant_type can only be parsed from a buffered body. Streamed bodies + // are NOT buffered into f.Request.Body, so requestFlowGrantType returns "" + // here and the pool token-host expansion is therefore unreachable on the + // streamed path. This is SAFE, not a latent bug, because an OAuth token + // request (refresh_token / device_code / authorization_code grant) is, by + // RFC 6749 §4, a tiny application/x-www-form-urlencoded POST (a few + // hundred bytes). go-mitmproxy only switches a request to the streamed + // path when its body exceeds the buffering threshold, so a token request + // is ALWAYS buffered through Request (where the expansion runs) and never + // reaches StreamRequestModifier. An empty grantType here therefore only + // suppresses the pool token-host expansion for genuinely large streamed + // bodies, which by construction are not token requests. The CONNECT-host + // binding loop (plain + pooled API-host bindings) does not depend on + // grant_type and is unaffected on the streamed path. If a future change + // ever streams token POSTs, this assumption breaks and the expansion would + // silently stop — that change must re-buffer token-host POSTs or move the + // grant_type probe into the stream reader. + pairs := a.buildPhantomPairs(host, port, protoStr, f.Request.URL, requestFlowGrantType(f, a.oauthIndex.Load())) if len(pairs) == 0 { return in } @@ -1545,7 +1564,17 @@ func (a *SluiceAddon) persistAddonOAuthTokens(credName string, realAccess, realR // post-swap by tagPooledFlowAfterSwap, which inspects the built pairs' // pooledMember field and tags only the members whose pool phantom was // actually present in this request. -func (a *SluiceAddon) buildPhantomPairs(host string, port int, proto string, reqURL *url.URL) []phantomPair { +// grantType is the OAuth grant_type parsed from the outbound request body +// (requestGrantType; "" when there is no body or it is unparseable / not a +// token request). It scopes the pool token-host expansion: that expansion +// rewrites every request to a pool's shared OAuth token host (e.g. +// auth.openai.com), so a non-refresh grant on the same host — most importantly +// a fresh in-container `codex login --device-auth` (a device_code grant) — +// would be corrupted into a 400 token_exchange_user_error. The expansion is +// therefore gated to grant_type == "refresh_token"; device_code, +// authorization_code, and absent/unparseable grants pass through with the +// request body + headers byte-identical to what the agent sent. +func (a *SluiceAddon) buildPhantomPairs(host string, port int, proto string, reqURL *url.URL, grantType string) []phantomPair { res := a.resolver.Load() if res == nil { return nil @@ -1651,7 +1680,18 @@ func (a *SluiceAddon) buildPhantomPairs(host string, port int, proto string, req // and token-endpoint failover work). MatchAll (not Match) is used so a // plain OAuth credential that sorts before the pool members and shares // the token URL cannot mask them. - if reqURL != nil { + // Grant scope (Task 4): this expansion is for the REFRESH round-trip + // only. A pool's token host (e.g. auth.openai.com) also serves the OTHER + // OAuth grants — device_code (a fresh in-container `codex login + // --device-auth`) and authorization_code. Rewriting those bodies/headers + // with the pool refresh phantom corrupts them (Codex returns + // 400 token_exchange_user_error and the in-container login fails). Only + // expand when the parsed grant_type is exactly "refresh_token"; + // device_code / authorization_code / absent / unparseable grants fall + // through with the request byte-identical to what the agent sent. The + // grant_type parse (requestGrantType) mirrors classifyFailover's + // token-endpoint form parse. + if reqURL != nil && grantType == "refresh_token" { if idx := a.oauthIndex.Load(); idx != nil { pr := (*vault.PoolResolver)(nil) if a.poolResolver != nil { diff --git a/internal/proxy/failover_notice_test.go b/internal/proxy/failover_notice_test.go new file mode 100644 index 0000000..1deddd8 --- /dev/null +++ b/internal/proxy/failover_notice_test.go @@ -0,0 +1,125 @@ +package proxy + +import ( + "strings" + "testing" +) + +// TestFormatFailoverNotice asserts the human-facing Telegram notice wording. +// This is the new friendlier text; the audit Reason format is asserted +// separately (pool_failover_test.go etc.) and must NOT be affected. +func TestFormatFailoverNotice(t *testing.T) { + tests := []struct { + name string + ev FailoverEvent + want string + mustHave []string + }{ + { + name: "rate limit 429", + ev: FailoverEvent{Pool: "openai_pool", From: "openai_oauth_2", To: "openai_oauth", Reason: "429"}, + want: `Pool "openai_pool" failed over from "openai_oauth_2" to "openai_oauth" after rate limit (429).`, + }, + { + name: "quota exhausted 403", + ev: FailoverEvent{Pool: "p", From: "a", To: "b", Reason: "403"}, + want: `Pool "p" failed over from "a" to "b" after quota exhausted (403).`, + }, + { + name: "auth failure 401", + ev: FailoverEvent{Pool: "openai_pool", From: "openai_oauth_2", To: "openai_oauth", Reason: "401"}, + want: `Pool "openai_pool" failed over from "openai_oauth_2" to "openai_oauth" after auth failure (401).`, + }, + { + name: "auth failure invalid_grant", + ev: FailoverEvent{Pool: "p", From: "a", To: "b", Reason: "invalid_grant"}, + want: `Pool "p" failed over from "a" to "b" after auth failure (invalid_grant).`, + }, + { + name: "auth failure invalid_token", + ev: FailoverEvent{Pool: "p", From: "a", To: "b", Reason: "invalid_token"}, + want: `Pool "p" failed over from "a" to "b" after auth failure (invalid_token).`, + }, + { + name: "exhausted", + ev: FailoverEvent{Pool: "openai_pool", From: "openai_oauth_2", To: "openai_oauth_2", Reason: "429", Exhausted: true}, + want: `Pool "openai_pool" exhausted: all members are cooling down, no healthy account to fail over to (rate limit (429)).`, + }, + { + // Finding 5: an empty reason tag must NOT render the awkward + // "(unknown reason)" parenthetical; the clause is dropped. + name: "exhausted empty reason drops parenthetical", + ev: FailoverEvent{Pool: "openai_pool", From: "a", To: "a", Reason: "", Exhausted: true}, + want: `Pool "openai_pool" exhausted: all members are cooling down, no healthy account to fail over to.`, + }, + { + // Finding 5: normal failover with an empty reason tag drops the + // "after unknown reason" clause entirely. + name: "normal failover empty reason drops clause", + ev: FailoverEvent{Pool: "p", From: "a", To: "b", Reason: ""}, + want: `Pool "p" failed over from "a" to "b".`, + }, + { + // Finding 2: an unknown tag must read naturally after "after" + // (no redundant "failed over ... after failover (teapot)"). + name: "unknown tag reads naturally after the after-clause", + ev: FailoverEvent{Pool: "p", From: "a", To: "b", Reason: "teapot"}, + want: `Pool "p" failed over from "a" to "b" after unknown reason (teapot).`, + }, + { + // Finding 2: an unknown tag in the exhausted message also reads + // naturally and still surfaces the raw tag. + name: "unknown tag in exhausted message reads naturally", + ev: FailoverEvent{Pool: "p", From: "a", To: "a", Reason: "teapot", Exhausted: true}, + want: `Pool "p" exhausted: all members are cooling down, no healthy account to fail over to (unknown reason (teapot)).`, + mustHave: []string{"teapot"}, + }, + } + for _, tt := range tests { + t.Run(tt.name, func(t *testing.T) { + got := FormatFailoverNotice(tt.ev) + if tt.want != "" && got != tt.want { + t.Fatalf("FormatFailoverNotice() =\n %q\nwant\n %q", got, tt.want) + } + for _, sub := range tt.mustHave { + if !strings.Contains(got, sub) { + t.Fatalf("FormatFailoverNotice() = %q, missing %q", got, sub) + } + } + // Plain-text contract: no markdown/HTML the notice path can't + // render. (Underscores/asterisks are legitimate in pool and + // credential names — invalid_grant, openai_pool — and Notify + // sends no parse mode, so they render literally and are safe.) + for _, bad := range []string{"`", "", "", "", "**"} { + if strings.Contains(got, bad) { + t.Fatalf("FormatFailoverNotice() = %q, contains non-plain-text %q", got, bad) + } + } + if strings.Contains(got, "\n") { + t.Fatalf("FormatFailoverNotice() = %q, must be a single line", got) + } + }) + } +} + +// TestHumanizeFailoverReason covers every reason tag form produced by +// failoverReasonTag / classifyFailover so none falls through unlabeled. The +// empty-tag case is intentionally absent: it is handled by +// FormatFailoverNotice (the sole caller short-circuits an empty reason and +// drops the clause), asserted by TestFormatFailoverNotice's empty-reason +// cases, so humanizeFailoverReason has no reachable "" branch. +func TestHumanizeFailoverReason(t *testing.T) { + cases := map[string]string{ + "429": "rate limit (429)", + "403": "quota exhausted (403)", + "401": "auth failure (401)", + "invalid_grant": "auth failure (invalid_grant)", + "invalid_token": "auth failure (invalid_token)", + "weird": "unknown reason (weird)", + } + for tag, want := range cases { + if got := humanizeFailoverReason(tag); got != want { + t.Fatalf("humanizeFailoverReason(%q) = %q, want %q", tag, got, want) + } + } +} diff --git a/internal/proxy/grant_type_pregate_test.go b/internal/proxy/grant_type_pregate_test.go new file mode 100644 index 0000000..c5fa93c --- /dev/null +++ b/internal/proxy/grant_type_pregate_test.go @@ -0,0 +1,203 @@ +package proxy + +import ( + "net/http" + "net/url" + "strings" + "testing" + + mitmproxy "github.com/lqqyt2423/go-mitmproxy/proxy" + "github.com/nemirovsky/sluice/internal/store" +) + +// tokenHostIndex returns an OAuthIndex with a single OAuth token endpoint at +// https://auth.example.com/token, used by the pre-gate tests. +func tokenHostIndex() *OAuthIndex { + return NewOAuthIndex([]store.CredentialMeta{ + {Name: "oauth_cred", CredType: "oauth", TokenURL: "https://auth.example.com/token"}, + }) +} + +func flowWith(method, rawURL, ct string, body []byte) *mitmproxy.Flow { + u, _ := url.Parse(rawURL) + h := make(http.Header) + if ct != "" { + h.Set("Content-Type", ct) + } + return &mitmproxy.Flow{ + Request: &mitmproxy.Request{ + Method: method, + URL: u, + Header: h, + Body: body, + }, + } +} + +// TestRequestFlowGrantType_PreGate is the Finding 2 fail-before/pass-after +// guard: the body grant_type parse must run ONLY for an HTTP POST whose +// scheme+host matches a known OAuth token endpoint. A non-POST request, or a +// POST to a non-token host, must return "" WITHOUT parsing the body even +// though the body is a perfectly valid refresh-grant form. +func TestRequestFlowGrantType_PreGate(t *testing.T) { + idx := tokenHostIndex() + refreshBody := []byte("grant_type=refresh_token&refresh_token=rt-xxx") + const formCT = "application/x-www-form-urlencoded" + + cases := []struct { + name string + method string + url string + want string + }{ + { + // Pass-after: a real refresh POST to the token host is still + // parsed (the gate must not break the existing behavior). + name: "POST to token host parses", + method: "POST", + url: "https://auth.example.com/token", + want: "refresh_token", + }, + { + // Pre-gate: same valid refresh body, but a GET cannot be a + // token request -> not parsed, returns "". + name: "GET to token host skipped", + method: "GET", + url: "https://auth.example.com/token", + want: "", + }, + { + // Pre-gate: POST but to a non-token host (the vast majority of + // proxied traffic) -> not parsed, returns "". + name: "POST to non-token host skipped", + method: "POST", + url: "https://api.example.com/v1/chat", + want: "", + }, + { + // Method comparison is case-insensitive (RFC methods are + // uppercase but be defensive). + name: "lowercase post to token host parses", + method: "post", + url: "https://auth.example.com/token", + want: "refresh_token", + }, + } + for _, c := range cases { + t.Run(c.name, func(t *testing.T) { + f := flowWith(c.method, c.url, formCT, refreshBody) + if got := requestFlowGrantType(f, idx); got != c.want { + t.Fatalf("requestFlowGrantType(%s %s) = %q, want %q", + c.method, c.url, got, c.want) + } + }) + } +} + +// TestRequestFlowGrantType_NilSafe verifies the gate degrades safely when the +// index is nil (startup race before UpdateOAuthIndex fires): with no known +// token hosts, nothing can be a token request, so it returns "". +func TestRequestFlowGrantType_NilSafe(t *testing.T) { + f := flowWith("POST", "https://auth.example.com/token", + "application/x-www-form-urlencoded", + []byte("grant_type=refresh_token&refresh_token=x")) + if got := requestFlowGrantType(f, nil); got != "" { + t.Fatalf("requestFlowGrantType with nil index = %q, want \"\"", got) + } + if got := requestFlowGrantType(nil, tokenHostIndex()); got != "" { + t.Fatalf("requestFlowGrantType(nil flow) = %q, want \"\"", got) + } +} + +// TestOAuthIndexMatchesHost covers the cheap host-only pre-gate matcher: +// scheme+host match ignoring path; default-port normalization; no match for a +// different host/scheme. +func TestOAuthIndexMatchesHost(t *testing.T) { + idx := tokenHostIndex() + mustParse := func(s string) *url.URL { + u, err := url.Parse(s) + if err != nil { + t.Fatalf("parse %q: %v", s, err) + } + return u + } + if !idx.MatchesHost(mustParse("https://auth.example.com/token")) { + t.Fatal("exact host+path should match") + } + if !idx.MatchesHost(mustParse("https://auth.example.com/some/other/path")) { + t.Fatal("same host, different path should still match (host-only gate)") + } + if !idx.MatchesHost(mustParse("https://auth.example.com:443/token")) { + t.Fatal("default https port must normalize to a match") + } + if idx.MatchesHost(mustParse("https://api.example.com/token")) { + t.Fatal("different host must not match") + } + if idx.MatchesHost(mustParse("http://auth.example.com/token")) { + t.Fatal("different scheme must not match") + } + if (*OAuthIndex)(nil).MatchesHost(mustParse("https://auth.example.com/token")) { + t.Fatal("nil index must not match") + } +} + +// TestRequestGrantType_CapRaisedAndObservable is the Finding 9 guard: the +// probe cap is now 64 KiB (a realistic large OAuth token request with a +// JWT client_assertion + long refresh token is still parsed), and a body +// over the cap is NOT parsed (bounded worst case, the original perf bug +// stays fixed). +func TestRequestGrantType_CapRaisedAndObservable(t *testing.T) { + const formCT = "application/x-www-form-urlencoded" + + // A ~40 KiB refresh-grant body (long JWT-shaped client_assertion). Under + // the old 8 KiB cap this returned "" (Finding 9 silent drop); under the + // 64 KiB cap it must parse correctly. + bigAssertion := strings.Repeat("A", 40<<10) + largeRefresh := []byte("grant_type=refresh_token&refresh_token=rt-xxx&client_assertion=" + bigAssertion) + if len(largeRefresh) <= 8<<10 { + t.Fatalf("test body must exceed the old 8 KiB cap, got %d", len(largeRefresh)) + } + if len(largeRefresh) >= maxGrantTypeProbeBody { + t.Fatalf("test body must stay under the new cap, got %d (cap %d)", + len(largeRefresh), maxGrantTypeProbeBody) + } + if got := requestGrantType(largeRefresh, formCT); got != "refresh_token" { + t.Fatalf("large (under-cap) refresh body grant_type = %q, want refresh_token", got) + } + + // A body over the new cap is still not probed (returns ""). + overCap := []byte("grant_type=refresh_token&x=" + strings.Repeat("B", maxGrantTypeProbeBody)) + if len(overCap) <= maxGrantTypeProbeBody { + t.Fatalf("over-cap body must exceed the cap, got %d", len(overCap)) + } + if got := requestGrantType(overCap, formCT); got != "" { + t.Fatalf("over-cap body grant_type = %q, want \"\" (probe skipped)", got) + } +} + +// TestExtractRequestRefreshToken_SingleParse is the Finding 3 guard: an +// explicit form Content-Type whose body happens to start with '{' is parsed +// ONLY as form (no JSON fallback double string(body)); behavior for the +// normal cases (form body, JSON body, headerless JSON) is unchanged. +func TestExtractRequestRefreshToken_SingleParse(t *testing.T) { + const formCT = "application/x-www-form-urlencoded" + + // form CT, real form body -> parsed as form. + if got := extractRequestRefreshToken([]byte("refresh_token=rt-1"), formCT); got != "rt-1" { + t.Fatalf("form body = %q, want rt-1", got) + } + // form CT, JSON-shaped body -> form parse yields nothing, JSON fallback + // NOT run (Finding 3) -> "". + if got := extractRequestRefreshToken([]byte(`{"refresh_token":"rt-2"}`), formCT); got != "" { + t.Fatalf("form-CT json body = %q, want \"\" (json fallback must not run)", got) + } + // json CT, JSON body -> parsed as JSON. + if got := extractRequestRefreshToken([]byte(`{"refresh_token":"rt-3"}`), "application/json"); got != "rt-3" { + t.Fatalf("json body = %q, want rt-3", got) + } + // absent CT, JSON body -> form parse misses, JSON fallback still + // reachable (headerless JSON token request must remain recoverable). + if got := extractRequestRefreshToken([]byte(`{"refresh_token":"rt-4"}`), ""); got != "rt-4" { + t.Fatalf("headerless json body = %q, want rt-4 (fallback must stay reachable)", got) + } +} diff --git a/internal/proxy/oauth_index.go b/internal/proxy/oauth_index.go index df69a75..a75fb09 100644 --- a/internal/proxy/oauth_index.go +++ b/internal/proxy/oauth_index.go @@ -120,6 +120,29 @@ func (idx *OAuthIndex) MatchAll(requestURL *url.URL) []string { return creds } +// MatchesHost reports whether the request URL's scheme+host matches any +// configured token endpoint, ignoring the path. This is a cheap pre-gate +// (no path normalization, no body copy) used to decide whether a request +// could plausibly be an OAuth token round-trip before paying the +// string(body)+ParseQuery grant_type probe on the proxy hot path. A host +// match without a path match is harmless: the grant_type parse that follows +// still returns "" for a non-token request to the same host, so this only +// trades a rare false-positive parse (same host, different path) for +// skipping the parse entirely on the vast majority of non-token traffic. +func (idx *OAuthIndex) MatchesHost(requestURL *url.URL) bool { + if idx == nil || requestURL == nil { + return false + } + reqHost := normalizeHost(requestURL.Host, requestURL.Scheme) + for _, e := range idx.entries { + if e.tokenURL.Scheme == requestURL.Scheme && + normalizeHost(e.tokenURL.Host, e.tokenURL.Scheme) == reqHost { + return true + } + } + return false +} + // Len returns the number of entries in the index. func (idx *OAuthIndex) Len() int { if idx == nil { diff --git a/internal/proxy/oauth_response.go b/internal/proxy/oauth_response.go index e2fbfd8..1c9185c 100644 --- a/internal/proxy/oauth_response.go +++ b/internal/proxy/oauth_response.go @@ -6,10 +6,13 @@ import ( "encoding/base64" "encoding/json" "fmt" + "log" "net/url" "strconv" "strings" + "sync/atomic" + mitmproxy "github.com/lqqyt2423/go-mitmproxy/proxy" "github.com/nemirovsky/sluice/internal/vault" ) @@ -139,12 +142,24 @@ func extractRequestRefreshToken(body []byte, contentType string) string { return "" } ct := strings.ToLower(contentType) - if strings.Contains(ct, "application/x-www-form-urlencoded") || !strings.Contains(ct, "json") { + isFormCT := strings.Contains(ct, "application/x-www-form-urlencoded") + if isFormCT || !strings.Contains(ct, "json") { if vals, err := url.ParseQuery(string(body)); err == nil { if rt := vals.Get("refresh_token"); rt != "" { return rt } } + // An explicit form Content-Type is authoritative: the body was + // already parsed as form, so do not also stringify+JSON-parse it + // when it merely starts with '{' (Finding 3, the double string(body) + // alloc). Behavior is identical for normal cases (form body -> form, + // form-CT-with-json-body -> no refresh_token -> ""). The JSON + // fallback below stays reachable only for absent/ambiguous CT, where + // a conformant-but-headerless JSON token request must still be + // recoverable. + if isFormCT { + return "" + } } if strings.Contains(ct, "json") || strings.HasPrefix(strings.TrimSpace(string(body)), "{") { var probe struct { @@ -157,6 +172,136 @@ func extractRequestRefreshToken(body []byte, contentType string) string { return "" } +// maxGrantTypeProbeBody bounds the request body requestGrantType will copy + +// parse. RFC 6749 §4 token requests are small, but an RFC 7523 +// client_assertion (a signed JWT, optionally with a long refresh token and a +// large key-bound assertion) can run to tens of KiB. The original 8 KiB cap +// was unsafe: a legitimately large refresh-grant payload at a pool's token +// host would probe as "" -> the pool token-host gate would treat it as a +// non-refresh grant and silently NOT expand the refresh phantom, the inverted +// form of the very failure the gate prevents (Finding 9). 64 KiB is well above +// any realistic OAuth token-request body (a 4 KiB RSA-signed JWT base64s to +// ~6 KiB; even several stacked assertions plus a JWT refresh token stay under +// 64 KiB) while still bounding the worst-case string()+ParseQuery so an +// unbounded body to the token host cannot become an O(body) hot-path cost +// (the original perf bug). Combined with requestFlowGrantType's POST + +// token-host pre-gate (Finding 2) the parse only runs for token-host requests +// at all, so the larger cap is effectively free. +const maxGrantTypeProbeBody = 64 << 10 // 64 KiB + +// grantTypeProbeTruncated counts requests whose body exceeded +// maxGrantTypeProbeBody and were therefore NOT probed for grant_type. A +// truncated probe at a pool token host silently degrades into "no refresh +// expansion", so it must be observable. Rate-limited like the DLP no-match +// log so a pathological client cannot spam production logs. +var grantTypeProbeTruncated uint64 + +// grantTypeProbeTruncLogEvery sets the rate-limit cadence for the +// cap-truncation warning (one line per N truncations). +const grantTypeProbeTruncLogEvery = 100 + +// requestGrantType pulls the `grant_type` value out of an outbound OAuth +// token-endpoint request body. RFC 6749 §4 mandates +// application/x-www-form-urlencoded for token requests; some non-conformant +// clients send JSON, so both are parsed (form first, JSON fallback) using the +// same shape as extractRequestRefreshToken. Returns "" when the body is empty, +// larger than maxGrantTypeProbeBody, or has no parseable grant_type (e.g. a +// malformed body or an opaque device-poll request) — the caller treats +// absent/unknown the same as a non-refresh grant and passes the request +// through unmodified. The form parse is restricted to bodies whose +// Content-Type indicates form-encoding (or is absent/ambiguous, since a +// well-formed token request omitting Content-Type is still form-encoded); +// JSON and large/binary bodies (octet-stream, multipart, text/*) are not +// run through string()+url.ParseQuery, keeping the proxy hot path cheap. +func requestGrantType(body []byte, contentType string) string { + if len(body) == 0 { + return "" + } + if len(body) > maxGrantTypeProbeBody { + // Cap hit: the grant_type is not probed, so a real refresh-grant + // payload this large would NOT get the pool refresh phantom expanded + // (Finding 9). Rate-limited WARNING so the silent degrade is + // observable in a log aggregator without spamming under a + // pathological client. + if n := atomic.AddUint64(&grantTypeProbeTruncated, 1); n%grantTypeProbeTruncLogEvery == 1 { + log.Printf("[ADDON-INJECT] WARNING: request body %d bytes exceeds grant_type probe cap %d; "+ + "grant_type not parsed, pool refresh-phantom expansion skipped for this request "+ + "(occurrence #%d)", len(body), maxGrantTypeProbeBody, n) + } + return "" + } + ct := strings.TrimSpace(strings.ToLower(contentType)) + // Parse as form only when the Content-Type is form-encoded, or absent — a + // conformant token request that omitted the header is still form-encoded. + // An explicit non-form CT (octet-stream, multipart, text/*, json) is not a + // form token request, so the O(body) string()+url.ParseQuery is skipped; + // JSON is handled by the dedicated fallback below. + isFormCT := strings.Contains(ct, "application/x-www-form-urlencoded") + if isFormCT || ct == "" { + if vals, err := url.ParseQuery(string(body)); err == nil { + if gt := vals.Get("grant_type"); gt != "" { + return gt + } + } + // An explicit form Content-Type is authoritative: skip the + // stringify+JSON-parse fallback for a body that merely starts with + // '{' (Finding 3, the double string(body) alloc). Behavior is + // identical for normal cases (form body -> form, + // form-CT-with-json-body -> no grant_type -> ""). The JSON fallback + // stays reachable only for absent CT (ct == ""), preserving + // recovery of a headerless JSON token request. + if isFormCT { + return "" + } + } + if strings.Contains(ct, "json") || strings.HasPrefix(strings.TrimSpace(string(body)), "{") { + var probe struct { + GrantType string `json:"grant_type"` + } + if err := json.Unmarshal(body, &probe); err == nil && probe.GrantType != "" { + return probe.GrantType + } + } + return "" +} + +// requestFlowGrantType extracts the OAuth grant_type from a flow's outbound +// request (body + Content-Type). Returns "" when the flow / request / body is +// nil-or-empty or the grant_type is not parseable, which the pool token-host +// expansion treats the same as a non-refresh grant (pass through unmodified). +// +// Cheap pre-gate (Finding 2): requestFlowGrantType runs on EVERY proxied +// request, but the only consumer (the pool token-host expansion in +// buildPhantomPairs) acts solely on grant_type=="refresh_token" requests to a +// pool's OAuth token host. A token request is, per RFC 6749 §3.2, an +// HTTP POST to the token endpoint. So unless the request is a POST whose +// scheme+host matches a known OAuth token endpoint (idx.MatchesHost — host +// only, no path normalization, no body copy), the request cannot be a token +// round-trip and we return "" without ever calling string(body)+ParseQuery. +// This skips the O(body) grant_type probe for the vast majority of +// non-OAuth-token traffic. classifyFailover keeps its own independent +// response-side parse (gated on the OAuthIndex path match), so the two paths +// stay consistent without sharing this request-side gate. +func requestFlowGrantType(f *mitmproxy.Flow, idx *OAuthIndex) string { + if f == nil || f.Request == nil { + return "" + } + // Pre-gate: only an HTTP POST to a known OAuth token host can be a token + // request. url.ParseQuery / json.Unmarshal of the body are skipped + // entirely otherwise. + if !strings.EqualFold(f.Request.Method, "POST") { + return "" + } + if !idx.MatchesHost(f.Request.URL) { + return "" + } + ct := "" + if f.Request.Header != nil { + ct = f.Request.Header.Get("Content-Type") + } + return requestGrantType(f.Request.Body, ct) +} + // tokenResponse is the parsed result from an OAuth token endpoint. Fields // match the RFC 6749 token response format. type tokenResponse struct { diff --git a/internal/proxy/pool_failover.go b/internal/proxy/pool_failover.go index e9931f4..567c124 100644 --- a/internal/proxy/pool_failover.go +++ b/internal/proxy/pool_failover.go @@ -155,6 +155,65 @@ type FailoverEvent struct { Epoch int64 } +// humanizeFailoverReason maps a short reason tag (the same tag embedded in the +// audit Reason) to operator-friendly words, keeping the raw tag in parentheses +// so the technical detail is still visible. Unknown tags degrade gracefully: +// the raw tag is shown as "unknown reason ()" so the surrounding +// "... after %s." / "... to (%s)." clauses still read naturally instead of the +// redundant "failed over ... after failover ()". +// +// The empty-tag case is handled by FormatFailoverNotice (its sole caller), +// which short-circuits an empty reason and drops the reason clause entirely +// before ever calling here — so this function never needs an explicit "" +// branch (single source of truth for empty-reason wording lives there). +func humanizeFailoverReason(tag string) string { + switch tag { + case "429": + return "rate limit (429)" + case "403": + return "quota exhausted (403)" + case "401": + return "auth failure (401)" + case "invalid_grant": + return "auth failure (invalid_grant)" + case "invalid_token": + return "auth failure (invalid_token)" + default: + return "unknown reason (" + tag + ")" + } +} + +// FormatFailoverNotice builds the plain-text, single-line operator notice for a +// completed pool failover. It is the human-facing Telegram/notice string only; +// it deliberately does NOT touch the audit Reason format. Kept as a pure +// function (no I/O, no server state) so it is directly unit-testable. +// +// Plain text only: the notice path (TelegramChannel.Notify) sends with no +// parse mode, so markdown/HTML would render literally — keep it sentence-style +// like sluice's other notices. +func FormatFailoverNotice(ev FailoverEvent) string { + // An empty reason tag yields humanizeFailoverReason("") == "unknown + // reason", which reads awkwardly inline ("... to fail over to (unknown + // reason)." / "... after unknown reason."). When the tag is empty, drop + // the reason clause entirely instead (Finding 5). The audit Reason format + // is untouched - this only shapes the human-facing notice. + if ev.Reason == "" { + if ev.Exhausted { + return fmt.Sprintf("Pool %q exhausted: all members are cooling down, no healthy account to fail over to.", + ev.Pool) + } + return fmt.Sprintf("Pool %q failed over from %q to %q.", + ev.Pool, ev.From, ev.To) + } + reason := humanizeFailoverReason(ev.Reason) + if ev.Exhausted { + return fmt.Sprintf("Pool %q exhausted: all members are cooling down, no healthy account to fail over to (%s).", + ev.Pool, reason) + } + return fmt.Sprintf("Pool %q failed over from %q to %q after %s.", + ev.Pool, ev.From, ev.To, reason) +} + // poolForResponse maps a response's CONNECT destination back to a pooled // binding and returns the pool name + the member that was active for this // request. Returns ok=false when the destination is not bound to a pool. diff --git a/internal/proxy/pool_splithost_test.go b/internal/proxy/pool_splithost_test.go index 50427bd..90736fc 100644 --- a/internal/proxy/pool_splithost_test.go +++ b/internal/proxy/pool_splithost_test.go @@ -527,7 +527,7 @@ func TestFinding1Round9_PoolNamespaceNotSuppressedByMemberPlainBinding(t *testin // The pool ACCESS phantom must also be swappable: build the pairs // directly and assert the pool access phantom maps to memA's real // access token exactly once (no double-emit, not suppressed). - pairs := addon.buildPhantomPairs("auth.example.com", 443, "https", reqFlow.Request.URL) + pairs := addon.buildPhantomPairs("auth.example.com", 443, "https", reqFlow.Request.URL, requestFlowGrantType(reqFlow, addon.oauthIndex.Load())) defer releasePhantomPairs(pairs) accessPhantom := poolStablePhantomAccess(poolName) refreshPhantom := "SLUICE_PHANTOM:" + poolName + ".refresh" diff --git a/internal/proxy/pool_token_grant_test.go b/internal/proxy/pool_token_grant_test.go new file mode 100644 index 0000000..f387cd2 --- /dev/null +++ b/internal/proxy/pool_token_grant_test.go @@ -0,0 +1,168 @@ +package proxy + +import ( + "net/http" + "strings" + "testing" +) + +// Task 4: the pool token-host phantom expansion must be scoped to the OAuth +// refresh round-trip. A pool's shared OAuth token host (e.g. auth.openai.com) +// also serves device_code (a fresh in-container `codex login --device-auth`) +// and authorization_code grants; rewriting those request bodies/headers with +// the pool refresh phantom corrupts them (Codex -> 400 +// token_exchange_user_error). Only `grant_type=refresh_token` may be expanded; +// every other grant (including an absent / unparseable grant_type) must reach +// upstream byte-identical to what the agent sent. + +// cloneHeader returns a deep copy of an http.Header for byte-comparison. +func cloneHeader(h http.Header) http.Header { + c := make(http.Header, len(h)) + for k, vs := range h { + cp := make([]string, len(vs)) + copy(cp, vs) + c[k] = cp + } + return c +} + +func headerEqual(a, b http.Header) bool { + if len(a) != len(b) { + return false + } + for k, av := range a { + bv, ok := b[k] + if !ok || len(av) != len(bv) { + return false + } + for i := range av { + if av[i] != bv[i] { + return false + } + } + } + return true +} + +// TestPoolTokenHost_RefreshGrantStillExpanded is the regression guard: a +// genuine `grant_type=refresh_token` POST to the pool token host must STILL be +// expanded exactly as before (pool refresh phantom -> active member's real +// refresh token, R1 attribution tag recorded). This mirrors +// TestSplitHost_RequestSidePhantomSwapOnTokenHost and protects the existing +// behavior against the Task-4 grant gate. +func TestPoolTokenHost_RefreshGrantStillExpanded(t *testing.T) { + addon, _, prPtr := setupPoolSplitHostWithPlainCred(t) + client := setupAddonConn(addon, "auth.example.com:443") + + if got, _ := prPtr.Load().ResolveActive("codex_pool"); got != "memA" { + t.Fatalf("pre-condition active = %q, want memA", got) + } + + reqFlow := newTestFlow(client, "POST", testOAuthTokenURL) + reqFlow.Request.Header.Set("Content-Type", "application/x-www-form-urlencoded") + reqFlow.Request.Body = refreshGrantBody("codex_pool") + + addon.Requestheaders(reqFlow) + addon.Request(reqFlow) + + body := string(reqFlow.Request.Body) + if strings.Contains(body, "SLUICE_PHANTOM:codex_pool.refresh") { + t.Fatalf("regression: refresh_token grant pool phantom NOT swapped; body=%q", body) + } + if !strings.Contains(body, "A-refresh-old") { + t.Fatalf("regression: active member memA real refresh token not injected; body=%q", body) + } + if owner, ok := addon.refreshAttr.Peek("A-refresh-old"); !ok || owner != "memA" { + t.Fatalf("regression: R1 attribution tag not recorded for memA; owner=%q ok=%v", owner, ok) + } +} + +// runNonRefreshGrantPassThrough drives a realistic non-refresh grant body +// (device_code / authorization_code / absent grant_type — none of which carry +// a refresh_token; a fresh in-container `codex login --device-auth` posts only +// device_code + client_id) through the addon and asserts the request body + +// headers reach upstream byte-identical. Before the Task-4 gate the pool +// token-host expansion would still build the pool pairs for this token URL and +// (via the unbound-phantom strip / pool-keyed swap) mutate the request, +// corrupting the in-container login into 400 token_exchange_user_error. +func runNonRefreshGrantPassThrough(t *testing.T, name string, body []byte) { + t.Helper() + addon, _, prPtr := setupPoolSplitHostWithPlainCred(t) + client := setupAddonConn(addon, "auth.example.com:443") + + if got, _ := prPtr.Load().ResolveActive("codex_pool"); got != "memA" { + t.Fatalf("[%s] pre-condition active = %q, want memA", name, got) + } + + reqFlow := newTestFlow(client, "POST", testOAuthTokenURL) + reqFlow.Request.Header.Set("Content-Type", "application/x-www-form-urlencoded") + reqFlow.Request.Body = append([]byte(nil), body...) + + wantBody := append([]byte(nil), body...) + wantHeader := cloneHeader(reqFlow.Request.Header) + + addon.Requestheaders(reqFlow) + addon.Request(reqFlow) + + if string(reqFlow.Request.Body) != string(wantBody) { + t.Fatalf("[%s] non-refresh grant body MUTATED; got=%q want=%q (codex login --device-auth would be corrupted)", + name, reqFlow.Request.Body, wantBody) + } + if !headerEqual(reqFlow.Request.Header, wantHeader) { + t.Fatalf("[%s] non-refresh grant headers MUTATED; got=%v want=%v", name, reqFlow.Request.Header, wantHeader) + } + // The R1 attribution tag must NOT be recorded: nothing was injected. + if owner, ok := addon.refreshAttr.Peek("A-refresh-old"); ok { + t.Fatalf("[%s] R1 attribution tag wrongly recorded (owner=%q) for a non-refresh grant", name, owner) + } +} + +// TestPoolTokenHost_DeviceCodeGrantPassThrough: a `grant_type=device_code` +// request (a fresh in-container `codex login --device-auth`) must reach +// upstream byte-unchanged so the login is not corrupted into 400 +// token_exchange_user_error. +func TestPoolTokenHost_DeviceCodeGrantPassThrough(t *testing.T) { + runNonRefreshGrantPassThrough(t, "device_code", + []byte("grant_type=device_code&device_code=abc123&client_id=codex-cli")) +} + +// TestPoolTokenHost_AuthorizationCodeGrantPassThrough: a +// `grant_type=authorization_code` request must also pass through unmodified. +func TestPoolTokenHost_AuthorizationCodeGrantPassThrough(t *testing.T) { + runNonRefreshGrantPassThrough(t, "authorization_code", + []byte("grant_type=authorization_code&code=xyz&client_id=codex-cli&redirect_uri=http://localhost")) +} + +// TestPoolTokenHost_AbsentGrantTypePassThrough: a body with NO grant_type +// (unparseable / non-token request to the same host) must pass through +// unmodified — absent grant is treated the same as a non-refresh grant. +func TestPoolTokenHost_AbsentGrantTypePassThrough(t *testing.T) { + runNonRefreshGrantPassThrough(t, "absent_grant", + []byte("client_id=codex-cli&scope=openid&foo=bar")) +} + +// TestRequestGrantType_ParsesFormAndJSON unit-tests the helper the gate relies +// on (form first, JSON fallback, "" on absent/empty/unparseable). +func TestRequestGrantType_ParsesFormAndJSON(t *testing.T) { + cases := []struct { + name string + body string + ct string + want string + }{ + {"form refresh", "grant_type=refresh_token&refresh_token=x", "application/x-www-form-urlencoded", "refresh_token"}, + {"form device", "grant_type=device_code&device_code=x", "application/x-www-form-urlencoded", "device_code"}, + {"form absent", "refresh_token=x", "application/x-www-form-urlencoded", ""}, + {"json refresh", `{"grant_type":"refresh_token"}`, "application/json", "refresh_token"}, + {"json absent", `{"foo":"bar"}`, "application/json", ""}, + {"empty body", "", "application/x-www-form-urlencoded", ""}, + {"form no content-type still parses", "grant_type=refresh_token", "", "refresh_token"}, + } + for _, c := range cases { + t.Run(c.name, func(t *testing.T) { + if got := requestGrantType([]byte(c.body), c.ct); got != c.want { + t.Fatalf("requestGrantType(%q,%q) = %q, want %q", c.body, c.ct, got, c.want) + } + }) + } +} diff --git a/internal/store/pools.go b/internal/store/pools.go index f77232b..2ee18e2 100644 --- a/internal/store/pools.go +++ b/internal/store/pools.go @@ -12,6 +12,47 @@ import ( // the column constrained to this value. const PoolStrategyFailover = "failover" +// ErrPoolNameConflict is returned (wrapped) by CreatePoolWithMembers when the +// requested pool name cannot be used because it already exists as a pool or +// collides with an existing credential (pool and credential names share one +// namespace). It is a typed sentinel so the REST layer maps this +// client-facing conflict to 409 Conflict and distinguishes it from genuine +// input-validation errors (no members, static member, unknown member) which +// stay 400. +var ErrPoolNameConflict = errors.New("pool name conflicts with an existing pool or credential") + +// ErrCredentialAlreadyPooled is returned (wrapped) by CreatePoolWithMembers +// when one of the requested members is already a member of another pool. A +// credential may belong to at most one pool (proxy attribution maps a member +// back to a single pool). Typed so the REST layer maps it to 409 Conflict +// rather than 400. +var ErrCredentialAlreadyPooled = errors.New("credential is already a member of another pool") + +// The following sentinels classify genuine client-input validation failures +// from CreatePoolWithMembers so every management channel (REST, CLI, +// Telegram) can tell a 400-class bad request apart from a 500-class +// internal/DB error without string matching. They are wrapped (%w) at their +// origin so the existing human-readable message is preserved verbatim while +// staying errors.Is-able. A failure NOT matching one of these or the two +// conflict sentinels above is an internal error (tx/DB/INSERT) and must +// surface as 500, never be misclassified as a client 400. +var ( + // ErrPoolNoMembers: the pool was requested with no members (or an + // empty pool name). Distinct from poolops.ErrNoMembers (the + // channel-agnostic pre-store guard) but classified the same (400). + ErrPoolNoMembers = errors.New("pool requires at least one member") + // ErrPoolStrategyInvalid: an unsupported strategy was requested. + ErrPoolStrategyInvalid = errors.New("unsupported pool strategy") + // ErrPoolMemberDuplicate: the submitted member list has a duplicate + // (or an empty member name). + ErrPoolMemberDuplicate = errors.New("pool member listed more than once") + // ErrPoolMemberNotFound: a requested member credential does not exist. + ErrPoolMemberNotFound = errors.New("pool member credential does not exist") + // ErrPoolMemberNotOAuth: a requested member is not an oauth credential + // with a token endpoint (pools are OAuth-specific). + ErrPoolMemberNotOAuth = errors.New("pool member must be an oauth credential") +) + // Pool is a named group of OAuth credentials backing a single phantom // identity. Members are returned ordered by position (failover order). type Pool struct { @@ -108,16 +149,16 @@ func validatePoolMemberTx(tx *sql.Tx, credential string) error { "SELECT cred_type, token_url FROM credential_meta WHERE name = ?", credential, ).Scan(&credType, &tokenURL) if errors.Is(err, sql.ErrNoRows) { - return fmt.Errorf("credential %q does not exist (add it with --type oauth first)", credential) + return fmt.Errorf("%w: credential %q does not exist (add it with --type oauth first)", ErrPoolMemberNotFound, credential) } if err != nil { return fmt.Errorf("look up credential %q: %w", credential, err) } if credType != "oauth" { - return fmt.Errorf("credential %q is %s, pools require oauth credentials", credential, credType) + return fmt.Errorf("%w: credential %q is %s, pools require oauth credentials", ErrPoolMemberNotOAuth, credential, credType) } if tokenURL.String == "" { - return fmt.Errorf("credential %q has no token_url; pools require oauth credentials with a token endpoint", credential) + return fmt.Errorf("%w: credential %q has no token_url; pools require oauth credentials with a token endpoint", ErrPoolMemberNotOAuth, credential) } return nil } @@ -141,7 +182,7 @@ func assertCredentialNotInAnotherPoolTx(tx *sql.Tx, credential, newPool string) if err != nil { return fmt.Errorf("check existing pool membership for %q: %w", credential, err) } - return fmt.Errorf("credential %q is already a member of pool %q; a credential may belong to at most one pool", credential, existing) + return fmt.Errorf("%w: credential %q is already a member of pool %q; a credential may belong to at most one pool", ErrCredentialAlreadyPooled, credential, existing) } // CreatePoolWithMembers creates a pool and its ordered members atomically. @@ -153,24 +194,24 @@ func assertCredentialNotInAnotherPoolTx(tx *sql.Tx, credential, newPool string) // permitted (it degrades to a plain indirection with no failover target). func (s *Store) CreatePoolWithMembers(name, strategy string, members []string) error { if name == "" { - return fmt.Errorf("pool name is required") + return fmt.Errorf("%w: pool name is required", ErrPoolNoMembers) } if strategy == "" { strategy = PoolStrategyFailover } if strategy != PoolStrategyFailover { - return fmt.Errorf("invalid pool strategy %q: only %q is supported", strategy, PoolStrategyFailover) + return fmt.Errorf("%w %q: only %q is supported", ErrPoolStrategyInvalid, strategy, PoolStrategyFailover) } if len(members) == 0 { - return fmt.Errorf("pool %q requires at least one member", name) + return fmt.Errorf("%w: pool %q requires at least one member", ErrPoolNoMembers, name) } seen := make(map[string]bool, len(members)) for _, m := range members { if m == "" { - return fmt.Errorf("pool %q has an empty member name", name) + return fmt.Errorf("%w: pool %q has an empty member name", ErrPoolMemberDuplicate, name) } if seen[m] { - return fmt.Errorf("pool %q lists credential %q more than once", name, m) + return fmt.Errorf("%w: pool %q lists credential %q more than once", ErrPoolMemberDuplicate, name, m) } seen[m] = true } @@ -186,13 +227,27 @@ func (s *Store) CreatePoolWithMembers(name, strategy string, members []string) e collErr := tx.QueryRow("SELECT name FROM credential_meta WHERE name = ?", name).Scan(&credName) switch { case collErr == nil: - return fmt.Errorf("name %q is already a credential; pool and credential names share one namespace", name) + return fmt.Errorf("%w: name %q is already a credential; pool and credential names share one namespace", ErrPoolNameConflict, name) case errors.Is(collErr, sql.ErrNoRows): // ok default: return fmt.Errorf("check name collision for %q: %w", name, collErr) } + // A pool with this name must not already exist. Detect it explicitly so + // the conflict is a typed ErrPoolNameConflict rather than a raw UNIQUE + // constraint string from the INSERT below. + var existingPool string + dupErr := tx.QueryRow("SELECT name FROM credential_pools WHERE name = ?", name).Scan(&existingPool) + switch { + case dupErr == nil: + return fmt.Errorf("%w: pool %q already exists", ErrPoolNameConflict, name) + case errors.Is(dupErr, sql.ErrNoRows): + // ok + default: + return fmt.Errorf("check existing pool %q: %w", name, dupErr) + } + if _, err := tx.Exec( "INSERT INTO credential_pools (name, strategy) VALUES (?, ?)", name, strategy, ); err != nil { diff --git a/internal/telegram/approval.go b/internal/telegram/approval.go index 0e266c6..fba74ff 100644 --- a/internal/telegram/approval.go +++ b/internal/telegram/approval.go @@ -228,6 +228,7 @@ func (tc *TelegramChannel) registerCommands() { {Command: "policy", Description: "Manage policy rules"}, {Command: "cred", Description: "Manage credentials"}, {Command: "mcp", Description: "Manage MCP upstreams"}, + {Command: "pool", Description: "Manage credential pools"}, {Command: "audit", Description: "Show audit log entries"}, {Command: "start", Description: "Show welcome message"}, {Command: "help", Description: "Show available commands"}, diff --git a/internal/telegram/approval_test.go b/internal/telegram/approval_test.go index f222d68..d8cc48c 100644 --- a/internal/telegram/approval_test.go +++ b/internal/telegram/approval_test.go @@ -594,6 +594,7 @@ func TestRegisterCommands(t *testing.T) { {Command: "policy", Description: "Manage policy rules"}, {Command: "cred", Description: "Manage credentials"}, {Command: "mcp", Description: "Manage MCP upstreams"}, + {Command: "pool", Description: "Manage credential pools"}, {Command: "audit", Description: "Show audit log entries"}, {Command: "start", Description: "Show welcome message"}, {Command: "help", Description: "Show available commands"}, diff --git a/internal/telegram/commands.go b/internal/telegram/commands.go index 0ef4d6f..c5c2af5 100644 --- a/internal/telegram/commands.go +++ b/internal/telegram/commands.go @@ -2,6 +2,7 @@ package telegram import ( "context" + "errors" "flag" "fmt" "io" @@ -20,6 +21,7 @@ import ( "github.com/nemirovsky/sluice/internal/flagutil" "github.com/nemirovsky/sluice/internal/mcp" "github.com/nemirovsky/sluice/internal/policy" + "github.com/nemirovsky/sluice/internal/poolops" "github.com/nemirovsky/sluice/internal/store" "github.com/nemirovsky/sluice/internal/vault" ) @@ -248,6 +250,8 @@ func (h *CommandHandler) Handle(cmd *Command) string { return h.handleCred(cmd.Args) case "mcp": return h.handleMCP(cmd.Args) + case "pool": + return h.handlePool(cmd.Args) case "status": return h.handleStatus() case "audit": @@ -1182,6 +1186,179 @@ func (h *CommandHandler) handleAudit(args []string) string { return b.String() } +// poolUsage is the usage banner for /pool. +const poolUsage = "Usage: /pool create | /pool list | /pool status | /pool rotate | /pool remove " + +// handlePool dispatches /pool subcommands to the channel-agnostic +// internal/poolops package, the same package the CLI and REST API call, so +// the three management surfaces cannot drift (channel feature-parity +// principle). +// +// Unlike /mcp add, pool arguments carry only credential NAMES (never secret +// values), so the chat message is NOT auto-deleted. +// +// reloadMu is not held: like MCP upstream changes, pool changes do not +// recompile the policy engine or the binding resolver. +func (h *CommandHandler) handlePool(args []string) string { + if h.store == nil { + return "Pool management is not available (policy store not configured)." + } + if len(args) == 0 { + return poolUsage + } + switch args[0] { + case "create": + return h.poolCreate(args[1:]) + case "list": + return h.poolList() + case "status": + if len(args) < 2 { + return "Usage: /pool status " + } + return h.poolStatus(args[1]) + case "rotate": + if len(args) < 2 { + return "Usage: /pool rotate " + } + return h.poolRotate(args[1]) + case "remove": + if len(args) < 2 { + return "Usage: /pool remove " + } + return h.poolRemove(args[1]) + default: + return fmt.Sprintf("Unknown pool subcommand: %s", args[0]) + } +} + +func (h *CommandHandler) poolCreate(args []string) string { + if len(args) < 2 { + return "Usage: /pool create " + } + name := args[0] + members, err := poolops.ParseMembers(args[1]) + if err != nil { + return fmt.Sprintf("Failed to create pool: %v", err) + } + if err := poolops.Create(h.store, name, "", members); err != nil { + return fmt.Sprintf("Failed to create pool: %v", err) + } + var b strings.Builder + fmt.Fprintf(&b, "Created pool %s (strategy: %s)\n", htmlCode(name), htmlCode(store.PoolStrategyFailover)) + for i, m := range members { + fmt.Fprintf(&b, " [%d] %s\n", i, htmlCode(m)) + } + b.WriteString("Bind it with /policy or " + htmlCode("sluice binding add "+name+" --destination ")) + return b.String() +} + +func (h *CommandHandler) poolList() string { + pools, err := poolops.List(h.store) + if err != nil { + return fmt.Sprintf("Failed to list pools: %v", err) + } + if len(pools) == 0 { + return "No credential pools configured." + } + var b strings.Builder + b.WriteString("Credential pools\n") + for _, p := range pools { + names := make([]string, 0, len(p.Members)) + for _, m := range p.Members { + names = append(names, m.Credential) + } + fmt.Fprintf(&b, "%s (strategy: %s): %s\n", + htmlCode(p.Name), htmlCode(p.Strategy), htmlCode(strings.Join(names, ", "))) + } + return b.String() +} + +func (h *CommandHandler) poolStatus(name string) string { + res, err := poolops.Status(h.store, name) + if err != nil { + var nf *poolops.PoolNotFoundError + if errors.As(err, &nf) { + return fmt.Sprintf("No pool named %s", htmlCode(name)) + } + return fmt.Sprintf("Failed to get pool status: %v", err) + } + var b strings.Builder + fmt.Fprintf(&b, "Pool %s (strategy: %s)\n", htmlCode(res.Name), htmlCode(res.Strategy)) + for _, m := range res.Members { + marker := " " + if m.Active { + marker = "* " + } + status := "healthy" + switch m.State { + case "cooldown": + status = fmt.Sprintf("cooldown until %s", m.CooldownUntil.Format(time.RFC3339)) + if m.LastFailureReason != "" { + // LastFailureReason is upstream error text (may contain + // < > &); this message is sent with HTML parse mode, so + // escape it like every other user-facing value here + // (htmlEscape, not htmlCode — it is prose, not an identifier). + status += " — " + htmlEscape(m.LastFailureReason) + } + case "healthy (cooldown expired)": + status = "healthy (cooldown expired)" + if m.LastFailureReason != "" { + status += " — " + htmlEscape(m.LastFailureReason) + } + } + fmt.Fprintf(&b, "%s[%d] %s %s\n", marker, m.Position, htmlCode(m.Credential), status) + } + fmt.Fprintf(&b, "active: %s\n", htmlCode(res.Active)) + return b.String() +} + +func (h *CommandHandler) poolRotate(name string) string { + res, err := poolops.Rotate(h.store, name) + if err != nil { + var nf *poolops.PoolNotFoundError + if errors.As(err, &nf) { + return fmt.Sprintf("No pool named %s", htmlCode(name)) + } + var race *poolops.RotateRaceError + if errors.As(err, &race) { + return fmt.Sprintf("Pool %s rotate raced a concurrent membership change; nothing was persisted. Re-check the pool with /pool status %s and retry.", + htmlCode(name), htmlCode(name)) + } + return fmt.Sprintf("Failed to rotate pool: %v", err) + } + return fmt.Sprintf("Rotated pool %s: %s -> %s (parked %s until %s)", + htmlCode(name), htmlCode(res.From), htmlCode(res.To), + htmlCode(res.From), res.ParkedUntil.Format(time.RFC3339)) +} + +func (h *CommandHandler) poolRemove(name string) string { + err := poolops.Remove(h.store, name) + if err != nil { + var refErr *store.PoolReferencedError + if errors.As(err, &refErr) { + details := make([]string, len(refErr.Bindings)) + for i, bnd := range refErr.Bindings { + details[i] = fmt.Sprintf("[%d] %s", bnd.ID, bnd.Destination) + } + return fmt.Sprintf("Pool %s is still referenced by %d binding(s): %s. Remove those bindings first, then retry.", + htmlCode(name), len(refErr.Bindings), htmlCode(strings.Join(details, ", "))) + } + // store.ErrCredentialInUseByPool is deliberately NOT checked here: it is + // raised only by the credential-removal path (RemoveCredentialStoreState + // in store.go, when deleting a credential that is still a live pool + // member) and is handled by its own errors.Is check in the credential + // handler. No poolops.Remove store path can return it, so checking it + // here would be dead code returning the same string as the fallthrough + // below (kept consistent with poolStatusError in internal/api/server.go). + var nf *poolops.PoolNotFoundError + if errors.As(err, &nf) { + return fmt.Sprintf("No pool named %s", htmlCode(name)) + } + return fmt.Sprintf("Failed to remove pool: %v", err) + } + return fmt.Sprintf("Removed pool %s", htmlCode(name)) +} + func (h *CommandHandler) handleStart() string { return "Sluice approval proxy is running.\nType /help for available commands." } @@ -1207,7 +1384,11 @@ Credentials MCP Upstreams /mcp list | /mcp add --command [--transport stdio|http|websocket] [--args "a,b"] [--env "K=V,K=V"] [--header "K=V"] [--timeout 120] -/mcp remove ` +/mcp remove + +Credential Pools +/pool create | /pool list | /pool status +/pool rotate | /pool remove ` } help += ` diff --git a/internal/telegram/commands_test.go b/internal/telegram/commands_test.go index 8dceebe..9c28dc4 100644 --- a/internal/telegram/commands_test.go +++ b/internal/telegram/commands_test.go @@ -8,6 +8,7 @@ import ( "sync" "sync/atomic" "testing" + "time" "github.com/nemirovsky/sluice/internal/channel" "github.com/nemirovsky/sluice/internal/policy" @@ -2079,3 +2080,255 @@ func TestHandleMCPAddRepeatableHeader(t *testing.T) { t.Errorf("repeatable --header must keep commas intact: %q", ups[0].Headers["X-Custom"]) } } + +// seedPoolOAuthMeta registers oauth credential_meta rows so the store-side +// pool-member validation (oauth + non-empty token_url) passes, mirroring how +// the real CLI/REST flows create OAuth credentials before pooling them. +func seedPoolOAuthMeta(t *testing.T, s *store.Store, names ...string) { + t.Helper() + for _, n := range names { + if err := s.AddCredentialMeta(n, "oauth", "https://auth.example.com/token"); err != nil { + t.Fatalf("seed oauth meta %q: %v", n, err) + } + } +} + +func TestHandlePoolNoStore(t *testing.T) { + // Build the engine from a transient store but omit SetStore on the handler. + s := newTestStore(t) + eng, err := policy.LoadFromStore(s) + if err != nil { + t.Fatal(err) + } + ptr := new(atomic.Pointer[policy.Engine]) + ptr.Store(eng) + h := NewCommandHandler(ptr, new(sync.Mutex), "") + // Deliberately do not call SetStore. + + got := h.Handle(&Command{Name: "pool", Args: []string{"list"}}) + if !strings.Contains(got, "not available") { + t.Errorf("pool with no store = %q, want unavailable message", got) + } +} + +func TestHandlePoolNoArgs(t *testing.T) { + s := newTestStore(t) + h := newTestHandlerWithStore(t, s, nil, "") + got := h.Handle(&Command{Name: "pool", Args: nil}) + if !strings.Contains(got, "Usage:") { + t.Errorf("pool no args = %q, want usage", got) + } +} + +func TestHandlePoolUnknownSubcommand(t *testing.T) { + s := newTestStore(t) + h := newTestHandlerWithStore(t, s, nil, "") + got := h.Handle(&Command{Name: "pool", Args: []string{"frobnicate"}}) + if !strings.Contains(got, "Unknown pool subcommand") { + t.Errorf("pool unknown subcommand = %q", got) + } +} + +func TestHandlePoolCreateListStatusRotateRemove(t *testing.T) { + s := newTestStore(t) + seedPoolOAuthMeta(t, s, "acct_a", "acct_b") + h := newTestHandlerWithStore(t, s, nil, "") + + // create + got := h.Handle(&Command{Name: "pool", Args: []string{"create", "codex", "acct_a,acct_b"}}) + if !strings.Contains(got, "Created pool") || !strings.Contains(got, "codex") { + t.Fatalf("pool create = %q", got) + } + + // list + got = h.Handle(&Command{Name: "pool", Args: []string{"list"}}) + if !strings.Contains(got, "codex") || !strings.Contains(got, "acct_a, acct_b") { + t.Fatalf("pool list = %q", got) + } + + // status + got = h.Handle(&Command{Name: "pool", Args: []string{"status", "codex"}}) + if !strings.Contains(got, "active:") || !strings.Contains(got, "acct_a") { + t.Fatalf("pool status = %q", got) + } + if !strings.Contains(got, "* ") { + t.Errorf("pool status should mark the active member with '* ': %q", got) + } + + // rotate + got = h.Handle(&Command{Name: "pool", Args: []string{"rotate", "codex"}}) + if !strings.Contains(got, "Rotated pool") || !strings.Contains(got, "acct_a") || !strings.Contains(got, "acct_b") { + t.Fatalf("pool rotate = %q", got) + } + + // status after rotate: acct_b active, acct_a cooled down + got = h.Handle(&Command{Name: "pool", Args: []string{"status", "codex"}}) + if !strings.Contains(got, "cooldown until") { + t.Errorf("post-rotate status should show acct_a cooldown: %q", got) + } + + // remove + got = h.Handle(&Command{Name: "pool", Args: []string{"remove", "codex"}}) + if !strings.Contains(got, "Removed pool") { + t.Fatalf("pool remove = %q", got) + } + // GetPool returns (nil, nil) for a missing pool, so assert the pool row + // is actually gone (p == nil) rather than testing the always-true err. + if p, err := s.GetPool("codex"); err != nil { + t.Fatalf("GetPool after remove errored: %v", err) + } else if p != nil { + t.Fatalf("pool still present after remove: %+v", p) + } + // And the operator-visible status command reports it not-found. + got = h.Handle(&Command{Name: "pool", Args: []string{"status", "codex"}}) + if !strings.Contains(got, "No pool named") { + t.Fatalf("status after remove = %q, want not-found", got) + } +} + +func TestHandlePoolCreateNoMembers(t *testing.T) { + s := newTestStore(t) + h := newTestHandlerWithStore(t, s, nil, "") + got := h.Handle(&Command{Name: "pool", Args: []string{"create", "p"}}) + if !strings.Contains(got, "Usage:") { + t.Errorf("pool create without members = %q, want usage", got) + } +} + +func TestHandlePoolCreateStaticMemberRejected(t *testing.T) { + s := newTestStore(t) + if err := s.AddCredentialMeta("static_one", "static", ""); err != nil { + t.Fatalf("add static meta: %v", err) + } + h := newTestHandlerWithStore(t, s, nil, "") + got := h.Handle(&Command{Name: "pool", Args: []string{"create", "p", "static_one"}}) + if !strings.Contains(got, "Failed to create pool") { + t.Errorf("pool create with static member = %q, want failure", got) + } +} + +func TestHandlePoolStatusUnknown(t *testing.T) { + s := newTestStore(t) + h := newTestHandlerWithStore(t, s, nil, "") + got := h.Handle(&Command{Name: "pool", Args: []string{"status", "ghost"}}) + if !strings.Contains(got, "No pool named") { + t.Errorf("status unknown pool = %q", got) + } +} + +func TestHandlePoolRotateUnknown(t *testing.T) { + s := newTestStore(t) + h := newTestHandlerWithStore(t, s, nil, "") + got := h.Handle(&Command{Name: "pool", Args: []string{"rotate", "ghost"}}) + if !strings.Contains(got, "No pool named") { + t.Errorf("rotate unknown pool = %q", got) + } +} + +func TestHandlePoolRemoveUnknown(t *testing.T) { + s := newTestStore(t) + h := newTestHandlerWithStore(t, s, nil, "") + got := h.Handle(&Command{Name: "pool", Args: []string{"remove", "ghost"}}) + if !strings.Contains(got, "No pool named") { + t.Errorf("remove unknown pool = %q", got) + } +} + +func TestHandlePoolRemoveReferencedByBinding(t *testing.T) { + s := newTestStore(t) + seedPoolOAuthMeta(t, s, "acct_a", "acct_b") + if err := s.CreatePoolWithMembers("codex", store.PoolStrategyFailover, []string{"acct_a", "acct_b"}); err != nil { + t.Fatalf("create pool: %v", err) + } + // A binding referencing the pool by name keeps it from being removed. + if _, err := s.AddBinding("api.example.com", "codex", store.BindingOpts{}); err != nil { + t.Fatalf("add binding: %v", err) + } + h := newTestHandlerWithStore(t, s, nil, "") + got := h.Handle(&Command{Name: "pool", Args: []string{"remove", "codex"}}) + if !strings.Contains(got, "still referenced") { + t.Errorf("remove referenced pool = %q, want referenced message", got) + } +} + +func TestPoolStatusFormatMatchesCLI(t *testing.T) { + // The Telegram status rendering must mirror the CLI: active marker, + // position index, healthy/cooldown line. This locks the format so it + // doesn't drift from cmd/sluice/pool.go. + s := newTestStore(t) + seedPoolOAuthMeta(t, s, "m0", "m1") + if err := s.CreatePoolWithMembers("p", store.PoolStrategyFailover, []string{"m0", "m1"}); err != nil { + t.Fatalf("create pool: %v", err) + } + // Park m0 so it shows a cooldown line with the reason. + until := time.Now().Add(time.Hour) + if err := s.SetCredentialHealth("m0", "cooldown", until, "rate_limited"); err != nil { + t.Fatalf("set health: %v", err) + } + h := newTestHandlerWithStore(t, s, nil, "") + got := h.Handle(&Command{Name: "pool", Args: []string{"status", "p"}}) + if !strings.Contains(got, "[0] m0") || !strings.Contains(got, "cooldown until") || !strings.Contains(got, "rate_limited") { + t.Errorf("status format = %q, want CLI-style cooldown line", got) + } + if !strings.Contains(got, "* [1] m1") || !strings.Contains(got, "active: m1") { + t.Errorf("status should make m1 active (m0 cooled): %q", got) + } +} + +func TestPoolStatusEscapesLastFailureReason(t *testing.T) { + // LastFailureReason carries upstream error text. The status message is + // sent with HTML parse mode (htmlCode emits ), so a reason with + // < > & must be HTML-escaped or the Bot API rejects/garbles the message. + s := newTestStore(t) + seedPoolOAuthMeta(t, s, "m0", "m1") + if err := s.CreatePoolWithMembers("p", store.PoolStrategyFailover, []string{"m0", "m1"}); err != nil { + t.Fatalf("create pool: %v", err) + } + rawReason := `429 & "retry"` + until := time.Now().Add(time.Hour) + if err := s.SetCredentialHealth("m0", "cooldown", until, rawReason); err != nil { + t.Fatalf("set health: %v", err) + } + h := newTestHandlerWithStore(t, s, nil, "") + got := h.Handle(&Command{Name: "pool", Args: []string{"status", "p"}}) + if strings.Contains(got, rawReason) { + t.Errorf("status leaked raw unescaped reason: %q", got) + } + if strings.Contains(got, "") { + t.Errorf("status contains unescaped angle brackets: %q", got) + } + want := `429 <too many> & "retry"` + if !strings.Contains(got, want) { + t.Errorf("status = %q, want escaped reason %q", got, want) + } +} + +func TestPoolCreateReplyIsHTMLSafe(t *testing.T) { + // The create reply embeds a "sluice binding add --destination + // " hint. The message is sent with HTML parse mode (htmlCode emits + // ), and a literal is an invalid HTML tag that makes the Bot + // API reject the whole send, so the success message must not contain a + // raw angle-bracket placeholder. + s := newTestStore(t) + seedPoolOAuthMeta(t, s, "m0", "m1") + h := newTestHandlerWithStore(t, s, nil, "") + got := h.Handle(&Command{Name: "pool", Args: []string{"create", "codex", "m0,m1"}}) + if !strings.Contains(got, "Created pool") { + t.Fatalf("create did not succeed: %q", got) + } + if strings.Contains(got, "") { + t.Errorf("create reply contains raw placeholder (Bot API will reject HTML send): %q", got) + } + if !strings.Contains(got, "<host>") { + t.Errorf("create reply = %q, want escaped <host> placeholder", got) + } +} + +func TestHandleHelpListsPool(t *testing.T) { + s := newTestStore(t) + h := newTestHandlerWithStore(t, s, nil, "") + got := h.handleHelp() + if !strings.Contains(got, "Credential Pools") || !strings.Contains(got, "/pool create") { + t.Errorf("help should list /pool: %q", got) + } +}