diff --git a/.github/workflows/release.yml b/.github/workflows/release.yml index 7da25e4..fcc156d 100644 --- a/.github/workflows/release.yml +++ b/.github/workflows/release.yml @@ -1,5 +1,16 @@ name: Release +# Per-OS build pipeline. Each platform builds on its native self-hosted +# runner via `mage build:<target>`, packages the binary, and uploads it as +# a workflow artifact. A final `release` job downloads all artifacts, +# computes checksums, and creates a draft GitHub release. +# +# This split exists because cross-compiling Windows/Darwin binaries from +# a Linux host leaves their go:embed sections empty — the embedded VM +# kernel, initrd, and rootfs are platform-specific assets downloaded by +# mage targets that only run on the matching host. Each native build +# pulls its own assets and produces a fully working binary. + on: push: tags: @@ -12,35 +23,168 @@ env: GO_VERSION: "1.26.1" jobs: - release: - name: Build & Release + build-linux-amd64: + name: Build (Linux amd64) runs-on: [self-hosted, linux, x64] + steps: + - uses: actions/checkout@v6 + with: + fetch-depth: 0 # so `git describe --tags` returns the tag + + # mage comes from the runner-ci-linux image, same as ci.yml. 
+ - name: Restore cached build deps + run: /opt/ephemerd-ci/entrypoint-cache.sh + continue-on-error: true + + - name: Build Linux amd64 binary + run: mage build:build + + - name: Verify binary + run: ./ephemerd --version + + - name: Package + run: tar -czf "ephemerd_${GITHUB_REF_NAME}_linux_amd64.tar.gz" ephemerd + + - uses: actions/upload-artifact@v4 + with: + name: ephemerd-linux-amd64 + path: ephemerd_*_linux_amd64.tar.gz + if-no-files-found: error + + build-linux-arm64: + name: Build (Linux arm64) + runs-on: [self-hosted, linux, arm64] + steps: + - uses: actions/checkout@v6 + with: + fetch-depth: 0 + + - uses: actions/setup-go@v6 + with: + go-version: ${{ env.GO_VERSION }} + + - name: Install mage + run: go install github.com/magefile/mage@v1.15.0 # pinned: release builds must not float on @latest; bump deliberately + + - name: Build Linux arm64 binary + run: mage build:build + + - name: Verify binary + run: ./ephemerd --version + + - name: Package + run: tar -czf "ephemerd_${GITHUB_REF_NAME}_linux_arm64.tar.gz" ephemerd + + - uses: actions/upload-artifact@v4 + with: + name: ephemerd-linux-arm64 + path: ephemerd_*_linux_arm64.tar.gz + if-no-files-found: error + + build-windows-amd64: + name: Build (Windows amd64) + runs-on: [self-hosted, windows, x64] + steps: + - uses: actions/checkout@v6 + with: + fetch-depth: 0 + + - name: Restore cached build deps + run: C:\ephemerd-ci\entrypoint-cache.ps1 + continue-on-error: true + + - name: Ensure mage is available + run: | + if (Get-Command mage -ErrorAction SilentlyContinue) { return } + Write-Host "mage not found - installing via go install..." + $env:GOBIN = 'C:\go\bin' + go install github.com/magefile/mage@v1.15.0 # pinned: release builds must not float on @latest; bump deliberately + + # Full two-stage build: cross-compiles ephemerd-linux for the Hyper-V + # VM, downloads kernel/initrd/rootfs, then builds ephemerd.exe with + # all assets embedded. ~700 MB output. 
+ - name: Build Windows binary + run: mage build:windows + + - name: Verify binary + run: .\ephemerd.exe --version + + - name: Package + run: Compress-Archive -Path .\ephemerd.exe -DestinationPath "ephemerd_${env:GITHUB_REF_NAME}_windows_amd64.zip" -Force + + - uses: actions/upload-artifact@v4 + with: + name: ephemerd-windows-amd64 + path: ephemerd_*_windows_amd64.zip + if-no-files-found: error + + build-darwin-arm64: + name: Build (macOS arm64) + runs-on: [self-hosted, macos, arm64] steps: - uses: actions/checkout@v6 with: fetch-depth: 0 - # Go and mage come from the runner-ci-linux image (see images/runner-ci-linux/Dockerfile), - # so setup-go and `go install mage` are unnecessary here. - # - uses: actions/setup-go@v6 - # with: - # go-version: ${{ env.GO_VERSION }} - # - name: Install mage - # run: go install github.com/magefile/mage@latest + - uses: actions/setup-go@v6 + with: + go-version: ${{ env.GO_VERSION }} + + - name: Install mage + run: go install github.com/magefile/mage@v1.15.0 # pinned: release builds must not float on @latest; bump deliberately + + # Full Darwin build: downloads aarch64 Linux kernel/initrd/rootfs, + # cross-compiles ephemerd-linux for arm64, builds ephemerd, then + # ad-hoc codesigns with the virtualization entitlement Vz requires. 
+ - name: Build macOS binary + run: mage build:macos - - name: Download embedded dependencies + - name: Verify binary run: | - mage download:all - mage download:runnerwindows - mage download:rootfs + ./ephemerd --version + codesign --verify --verbose ./ephemerd # no `|| echo` fallback: a missing/invalid signature must fail the step - - name: Cross-compile Linux binary for Windows embedding - run: mage build:linuxembed + - name: Package + run: tar -czf "ephemerd_${GITHUB_REF_NAME}_darwin_arm64.tar.gz" ephemerd - - name: Run GoReleaser - uses: goreleaser/goreleaser-action@v6 + - uses: actions/upload-artifact@v4 with: - version: latest - args: release --clean + name: ephemerd-darwin-arm64 + path: ephemerd_*_darwin_arm64.tar.gz + if-no-files-found: error + + release: + name: Publish GitHub Release + needs: + - build-linux-amd64 + - build-linux-arm64 + - build-windows-amd64 + - build-darwin-arm64 + runs-on: [self-hosted, linux, x64] + steps: + - uses: actions/checkout@v6 + with: + fetch-depth: 0 + + - name: Download all artifacts + uses: actions/download-artifact@v4 + with: + path: dist + merge-multiple: true + + - name: Compute checksums + run: | + cd dist + sha256sum * > checksums.txt + cat checksums.txt + + - name: Create draft release env: - GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} + GH_TOKEN: ${{ secrets.GITHUB_TOKEN }} + run: | + gh release create "${GITHUB_REF_NAME}" \ + --draft \ + --prerelease \ + --generate-notes \ + --title "${GITHUB_REF_NAME}" \ + dist/* diff --git a/.goreleaser.yml b/.goreleaser.yml deleted file mode 100644 index 759d1f6..0000000 --- a/.goreleaser.yml +++ /dev/null @@ -1,62 +0,0 @@ -version: 2 - -before: - hooks: - # Download assets for ALL platforms (Linux runner/CNI/shim come from download:all). - - cmd: mage download:all - # Windows cross-compile needs the real Linux binary, Alpine rootfs, and Windows runner. 
- - cmd: mage download:runnerwindows - - cmd: mage download:rootfs - - cmd: mage build:linuxembed - -builds: - - id: ephemerd - main: ./cmd/ephemerd/ - binary: ephemerd - env: - - CGO_ENABLED=0 - ldflags: - - -s -w - - -X main.version={{.Version}} - - -X github.com/ephpm/ephemerd/pkg/runner.Version=2.333.1 - - -X github.com/ephpm/ephemerd/pkg/cni.Version=1.6.2 - goos: - - linux - - windows - - darwin - goarch: - - amd64 - - arm64 - ignore: - - goos: windows - goarch: arm64 - - goos: darwin - goarch: amd64 - -archives: - - id: default - formats: - - tar.gz - format_overrides: - - goos: windows - formats: - - zip - name_template: "{{ .ProjectName }}_{{ .Version }}_{{ .Os }}_{{ .Arch }}" - -checksum: - name_template: "checksums.txt" - -release: - github: - owner: ephpm - name: ephemerd - draft: true - prerelease: auto - -changelog: - sort: asc - filters: - exclude: - - "^docs:" - - "^test:" - - "^chore:" diff --git a/AGENTS.md b/AGENTS.md new file mode 100644 index 0000000..62157f0 --- /dev/null +++ b/AGENTS.md @@ -0,0 +1,51 @@ +# AGENTS.md + +Hard rules for any AI agent (Claude Code, Cursor, Aider, etc.) working in this repo. CLAUDE.md has the longer-form expectations; this file is the short list of things that get caught in CI and shouldn't. + +## Before every `git push` + +Run the CI pipeline locally and fix every error before pushing. **No exceptions.** Not "looks fine to me", not "the change is small", not "my local cgo is broken so I'll skip it" — those are the exact reasons CI gets surprised. If the local environment can't run `mage ci`, fix that environment before pushing, or run the targets individually and document what was skipped *and why* in the commit message. + +``` +mage ci +``` + +That target runs download, lint, test, and build — the same sequence CI runs. If `mage ci` is too slow for an iteration loop, run the relevant subset: + +``` +mage lint # golangci-lint — errcheck and errorlint enabled +mage test # go test ./... 
+mage build # cross-compile for current OS +mage e2egithub # GitHub provider e2e against fake server +mage e2emodproxy # Go module proxy e2e +``` + +If any of those fail, fix the failure before pushing. A red CI run on a PR you opened is a process failure, not a discovery. + +## Specifically: the `golangci-lint` failure modes that have bitten us + +- **errcheck** — every fallible call must be checked. Closures over `http.ResponseWriter` writes, `Close()` in defers, `io.Copy` return values, `fmt.Fprintf` return values. The repo policy is to wrap them in `if err := foo(); err != nil { log.Warn(...) }`, never `_ = foo()`. +- **staticcheck SA9003** — empty `if` branches. If the branch is empty because the comment is "either is acceptable", invert the condition and `t.Errorf`/`return err` for the *un*expected case. +- **typecheck failures on Windows** — `miekg/pkcs11` cgo preprocessing fails on Windows. This is a *local* problem, not a CI problem. Running lint on Linux works. If you can't run lint locally, you push at your own risk — and to the user's annoyance. + +## Specifically: the test failure modes that have bitten us + +- **`pkg/dind/TestPushHandlerEndToEnd`** has been flaky in CI in ways that aren't obviously reproducible locally. Don't paper over a flake with a `cs.Info()` "warm-up" call or a `time.Sleep` — that's flake-masking, and the real bug will resurface in the next iteration. If a test is genuinely flaky, find the race or the missing lease/label and fix it; if you can't, mark it `t.Skip` with an issue number, not a silent diagnostic. + +## Pushing workflow yaml changes + +Workflow YAML doesn't go through `mage lint`, but the consequences of a broken `.github/workflows/*.yml` are worse than Go lint errors — a bad workflow gives you no feedback at push time; it only fails on the *next* trigger. 
For non-trivial workflow changes: + +- Use [`actionlint`](https://github.com/rhysd/actionlint) if installed, or paste into [rhysd.github.io/actionlint](https://rhysd.github.io/actionlint/) for a quick sanity check. +- Sanity-check `runs-on` labels exist on registered self-hosted runners (ephemerd JIT-registers based on host `goruntime.GOARCH`; cross-arch runs require the matching host to be online). +- Confirm any `secrets.*` references exist in the repo/org secrets before pushing. + +## Pushing release-pipeline / tag-triggered changes + +`.github/workflows/release.yml` only fires on `push: tags: v*`, so you cannot test it on a branch. Sequence: + +1. Push the workflow change in a regular PR. Get it merged. +2. Push a release-candidate tag like `v0.0.1-rc1` to validate the full pipeline end-to-end before pushing the real tag. +3. Only push `v0.0.1` once the rc has produced a clean draft release. + +Do **not** push the real tag as the first test of a changed release workflow. diff --git a/pkg/dind/registry_e2e_test.go b/pkg/dind/registry_e2e_test.go index 819944a..e245f80 100644 --- a/pkg/dind/registry_e2e_test.go +++ b/pkg/dind/registry_e2e_test.go @@ -22,6 +22,7 @@ import ( "github.com/containerd/containerd/v2/core/content" "github.com/containerd/containerd/v2/core/images" + "github.com/containerd/containerd/v2/core/leases" "github.com/containerd/containerd/v2/pkg/namespaces" containerdpkg "github.com/ephpm/ephemerd/pkg/containerd" "github.com/opencontainers/go-digest" @@ -154,6 +155,25 @@ func TestPushHandlerEndToEnd(t *testing.T) { ctx, cancel := context.WithTimeout(namespaces.WithNamespace(context.Background(), bkNamespace), 60*time.Second) defer cancel() + // Hold a lease across the entire staging→push lifecycle. Without this, + // content.WriteBlob registers the blob in the namespace bucket but + // attaches no lease (addContentLease is a no-op without leases.FromContext) + // and no GC-ref labels are written for plain child blobs. 
The buildkit + // namespace can then have orphan content that is racy with respect to + // containerd's internal flushing/visibility paths — this manifested as + // CI flakes where TestPushHandlerEndToEnd would fail mid-push with + // "content digest sha256:...layer...: not found". + lease, err := ctrdClient.LeasesService().Create(ctx, leases.WithExpiration(5*time.Minute)) + if err != nil { + t.Fatalf("create lease: %v", err) + } + t.Cleanup(func() { // runs after the deferred cancel(), so ctx is unusable here: build a fresh context in the lease's namespace + if err := ctrdClient.LeasesService().Delete(namespaces.WithNamespace(context.Background(), bkNamespace), lease); err != nil { + t.Logf("delete lease: %v", err) + } + }) + ctx = leases.WithLease(ctx, lease.ID) + // Stage a synthetic OCI image: empty layer + tiny config + manifest // pointing at both. Image record `mockRef` so /push GetImage finds it. imgDesc, err := stageSyntheticImage(ctx, ctrdClient, mockRef) @@ -162,20 +182,6 @@ func TestPushHandlerEndToEnd(t *testing.T) { } t.Logf("staged image %s -> %s (%d bytes)", mockRef, imgDesc.Digest, imgDesc.Size) - // Diagnostic: confirm the three staged blobs are visible via the same - // content store the push handler will use, in the same namespace, right - // now. If any of these Info calls reports NotFound, the write didn't - // register the digest in the buildkit-namespace bucket — distinct from - // the symptom where push later fails to ReaderAt the layer. - cs := ctrdClient.ContentStore() - layerBytes := []byte("synthetic-layer-for-push-e2e") - layerDgst := digest.FromBytes(layerBytes) - for _, d := range []digest.Digest{layerDgst, imgDesc.Digest} { - info, infoErr := cs.Info(ctx, d) - t.Logf("post-stage Info(%s): err=%v size=%d labels=%v", - d, infoErr, info.Size, info.Labels) - } - // Bring up the dind server. s, err := New(Config{ JobID: "push-e2e",