diff --git a/AGENTS.md b/AGENTS.md index a2609a1..26116b2 100644 --- a/AGENTS.md +++ b/AGENTS.md @@ -175,6 +175,16 @@ Before considering work complete, agents should: - Run `pulumi preview` (if a stack is configured) to verify no resource errors. - Ensure no new `any` types or type assertions without justification. +## Research Protocol (Critical — Follow Before Writing Code) + +When planning features or fixing bugs that involve Pulumi providers, Docker, or any infrastructure tooling: + +1. **DeepWiki first.** Query DeepWiki (`mcp__deepwiki__ask_question`) for the relevant provider/tool repository. Ask about the specific problem or API surface — do not lead with a proposed solution. DeepWiki indexes the actual source code and docs, so it surfaces first-class provider features that workarounds would miss. +2. **Provider docs second.** If DeepWiki doesn't cover it, check the official provider documentation (e.g. Pulumi registry pages, Docker docs). Use `WebFetch` or `context7` to read the docs directly. +3. **Neo as a confirming second opinion only.** Use Pulumi Neo (`neo-bridge`) to validate the approach against the stack's actual state and update history. Neo is good at reading Pulumi state, update diffs, and resource lifecycle events. It is **not** a domain expert on provider APIs — it's an LLM with limited additional context, not a Pulumi engineer. Do not treat its recommendations as authoritative for how a provider resource works. Always verify against docs. + +**Do not skip to implementation based on assumptions.** A workaround that ignores a provider's built-in solution creates tech debt and silent failures. The `RemoteImage` + `pullTriggers` + `getRegistryImage` pattern is a case in point — it is the provider's documented first-class solution for dynamic image pulls, but it was missed because research was skipped. 
+ ## Contribution Guidelines for Agents - **Research before changing infrastructure or build configuration.** Before modifying Docker builds, Pulumi resources, network config, or any infrastructure-affecting code: read the relevant provider/tool documentation, understand the full implications (caching, performance, cross-platform behavior, state management), and verify your approach handles all supported deployment targets (amd64 + arm64, all VPS providers). A one-line change to a build resource can break caching, double build times, or cause architecture-specific failures. Do not treat infrastructure changes as trivial — always think through second-order effects. diff --git a/components/gateway-image.ts b/components/gateway-image.ts index b374068..a5f9947 100644 --- a/components/gateway-image.ts +++ b/components/gateway-image.ts @@ -35,7 +35,7 @@ export interface GatewayImageArgs { } export class GatewayImage extends pulumi.ComponentResource { - /** The image tag, e.g. "openclaw-gateway-dev:latest" or "registry/openclaw-gateway-dev:latest" */ + /** Image reference for the gateway container. For pulled images this is the host-local image ID; for on-host builds it remains the tag. */ public readonly imageName: pulumi.Output; /** Stable image change token. Changes when build inputs change and, for pulled images, prefers the remote repo digest. */ public readonly imageDigest: pulumi.Output; @@ -155,14 +155,14 @@ export class GatewayImage extends pulumi.ComponentResource { }, ]; - // Ensure the named builder exists. Idempotent — skips if already created. - // This gives deterministic container/volume names so the provider reuses - // the same buildkit container and cache across deploys. + // Ensure the named builder exists and is running. --bootstrap starts the + // buildkit container if stopped (e.g. after buildkit-cleanup from a prior deploy). + // Without it, `inspect` succeeds on a stopped builder but the provider gets EOF. 
const ensureBuilder = new command.local.Command( `${name}-ensure-builder`, { create: - "docker buildx inspect openclaw-builder >/dev/null 2>&1 || docker buildx create --name openclaw-builder --driver docker-container", + "docker buildx inspect openclaw-builder --bootstrap >/dev/null 2>&1 || docker buildx create --name openclaw-builder --driver docker-container --bootstrap", }, { parent: this }, ); @@ -272,6 +272,7 @@ export class GatewayImage extends pulumi.ComponentResource { // (pulumi/pulumi-docker-build#65). Build cache is stored in named Docker volumes // and survives container stop — the provider restarts them on the next build. // Depends on commitTag (last local buildx operation) to avoid race conditions. + // Non-fatal: failure just means buildkit containers keep running (cache buildup). new command.local.Command( `${name}-buildkit-cleanup`, { @@ -300,54 +301,67 @@ export class GatewayImage extends pulumi.ComponentResource { { parent: this }, ); - // Pull by stable version tag — re-pull gated by stable build inputs. + // Pull by stable version tag — re-pull gated by registry digest. // Use docker.io/ prefix so the provider matches registryAuth address. - // Treat a first path segment as an explicit registry if it contains a dot, - // contains a port, or is "localhost" (Docker reference semantics). const pullTag = hasExplicitRegistry(remoteTag) ? remoteTag : `docker.io/${remoteTag}`; - // Remove stale local image before pulling. The Docker provider's findImage() - // short-circuits on local tag match and ignores the platform field, so a cached - // arm64 image prevents re-pulling the correct amd64 variant. - const removeStale = new command.remote.Command( - `${name}-remove-stale`, + // Query the registry for the current manifest digests. This is the source + // of truth — it reflects what was actually pushed, not what we built locally. + // Uses getRegistryImageManifests (not getRegistryImage) because we push + // multi-arch manifest lists. 
Auth is passed directly — no provider needed + // for registry API calls, which are independent of any Docker daemon. + const registryManifests = docker.getRegistryImageManifestsOutput( { - connection: args.connection, - create: `docker rmi ${pullTag} 2>/dev/null || true`, - triggers: [ - imageDigestTrigger, - ...(args.platform ? [args.platform] : []), - ], + name: remoteTag, + authConfig: { + address: "registry-1.docker.io", + username, + password, + }, }, - { parent: this, dependsOn: [image] }, + { parent: this }, ); - // Use triggers (not pullTriggers) to force resource replacement on digest change. - // pullTriggers does in-place update where findImage() can short-circuit on local tag. - // triggers forces delete+create = guaranteed fresh pull. - new docker.RemoteImage( + // Extract the digest for the target platform. For single-platform builds, + // there's only one manifest. For multi-platform, match the VPS architecture. + const targetArch = args.platform?.split("/")[1] ?? "amd64"; + const pullDigest = registryManifests.manifests.apply((manifests) => { + const match = manifests.find( + (m) => m.architecture === targetArch && m.os === "linux", + ); + if (!match) { + throw new Error( + `No manifest found for linux/${targetArch} in ${remoteTag}`, + ); + } + return match.sha256Digest; + }); + + // pullTriggers with registry digest as source of truth. When the digest + // changes, Pulumi replaces the resource (destroy + create). forceRemove + // ensures the destroy step removes the image even when running containers + // reference it — without this, findImage() finds the stale local tag and + // skips the pull entirely (kreuzwerker/terraform-provider-docker behavior). + const pulledImage = new docker.RemoteImage( `${name}-pull`, { name: pullTag, platform: args.platform, - triggers: { - digest: imageDigestTrigger, - ...(args.platform ? 
{ platform: args.platform } : {}), - }, - keepLocally: true, + pullTriggers: [pullDigest], + forceRemove: true, }, { parent: this, provider: remoteDockerProvider, - dependsOn: [image, removeStale], + dependsOn: [image], }, ); return { - imageName: pulumi.output(pullTag), - imageDigest: imageDigestTrigger, + imageName: pulledImage.imageId, + imageDigest: pullDigest, }; } @@ -380,13 +394,14 @@ export class GatewayImage extends pulumi.ComponentResource { { parent: this }, ); - // Ensure the named builder exists on the VPS + // Ensure the named builder exists and is running on the VPS. + // --bootstrap starts the buildkit container if stopped (e.g. after buildkit-cleanup). const ensureBuilder = new command.remote.Command( `${name}-ensure-builder`, { connection: args.connection, create: - "docker buildx inspect openclaw-builder >/dev/null 2>&1 || docker buildx create --name openclaw-builder --driver docker-container", + "docker buildx inspect openclaw-builder --bootstrap >/dev/null 2>&1 || docker buildx create --name openclaw-builder --driver docker-container --bootstrap", }, { parent: this }, ); @@ -408,6 +423,7 @@ export class GatewayImage extends pulumi.ComponentResource { // Stop buildkit containers left behind by @pulumi/docker-build on the VPS // (pulumi/pulumi-docker-build#65). Build cache is stored in named Docker volumes // and survives container stop — the provider restarts them on the next build. + // Non-fatal: failure just means buildkit containers keep running (cache buildup). new command.remote.Command( `${name}-buildkit-cleanup`, { diff --git a/tests/components.test.ts b/tests/components.test.ts index d518bf6..7ae705d 100644 --- a/tests/components.test.ts +++ b/tests/components.test.ts @@ -73,6 +73,17 @@ beforeAll(() => { state.ref ?? 
"docker.io/mock/repo:tag@sha256:mockindexdigest"; } + // docker.RemoteImage — provide inspectable pull outputs + if (args.type === "docker:index/remoteImage:RemoteImage") { + const imageName = + (state.name as string | undefined) ?? "docker.io/mock/repo:tag"; + state.imageId = + state.imageId ?? `${imageName}@sha256:mockremoteimageid`; + state.repoDigest = + state.repoDigest ?? + `${imageName}@sha256:mockrepodigest1234567890abcdef`; + } + // command.remote.Command — provide stdout/stderr if (args.type === "command:remote:Command") { state.stdout = state.stdout ?? "mock-stdout"; @@ -82,6 +93,30 @@ beforeAll(() => { return { id: `${args.name}-id`, state }; }, call: (args: pulumi.runtime.MockCallArgs) => { + if ( + args.token === + "docker:index/getRegistryImageManifests:getRegistryImageManifests" + ) { + const name = + (args.inputs["name"] as string | undefined) ?? + "docker.io/mock/repo:tag"; + return { + name, + manifests: [ + { + architecture: "amd64", + os: "linux", + sha256Digest: "sha256:mockamd64manifestdigest1234567890", + }, + { + architecture: "arm64", + os: "linux", + sha256Digest: "sha256:mockarm64manifestdigest1234567890", + }, + ], + }; + } + // oci.core.getVnicAttachments — return a mock VNIC attachment if (args.token === "oci:Core/getVnicAttachments:getVnicAttachments") { return { @@ -949,7 +984,9 @@ describe("GatewayImage component", () => { }); const imageName = await promiseOf(img.imageName); - expect(imageName).toBe("docker.io/myuser/openclaw:dev-latest"); + expect(imageName).toBe( + "docker.io/myuser/openclaw:dev-latest@sha256:mockremoteimageid", + ); }); it("keeps localhost registries unprefixed in dockerhubPush mode", async () => { @@ -967,6 +1004,8 @@ describe("GatewayImage component", () => { }); const imageName = await promiseOf(img.imageName); - expect(imageName).toBe("localhost:5000/openclaw:dev-latest"); + expect(imageName).toBe( + "localhost:5000/openclaw:dev-latest@sha256:mockremoteimageid", + ); }); });