From c900f9d880d2f8ae0fbfeff0b9b29c97a14f22aa Mon Sep 17 00:00:00 2001 From: Luther Monson Date: Tue, 26 May 2026 21:30:00 -0600 Subject: [PATCH 1/2] fix(dind): translate sibling bind sources to runner snapshot upperdir MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The dind shim used to silently drop any `-v` source that didn't os.Stat on the daemon's filesystem. GHA `container:` workflows ask for sources inside the runner container's mount namespace (/home/runner/_work/_temp/.sh), which the dind daemon can't see — so every bind was dropped, the sibling started without the script directory, and `docker exec sh -e /__w/_temp/.sh` failed with "cannot open". Resolve each requested source against (1) the non-rootfs bind table ephemerd installed into the runner (/var/run/docker.sock and friends), then (2) the runner snapshot's overlayfs upperdir (rw — runner-written workspace files), then (3) the lowerdirs (ro — image layers, shared across jobs so writes would corrupt the cache). Anything that doesn't match returns 400 with a clear "bind mount X -> Y rejected" message instead of being quietly dropped. Linux-only for v1; Windows-native jobs use a different snapshotter and mount model and are deferred. Architecture doc at docs/arch/dind-bind-translation.md. Closes the GHA `container:` regression for jobs running on ephemerd self-hosted runners. --- docs/arch/dind-bind-translation.md | 244 +++++++++++++++++++ pkg/dind/bindtranslate.go | 108 +++++++++ pkg/dind/bindtranslate_e2e_test.go | 303 +++++++++++++++++++++++ pkg/dind/bindtranslate_test.go | 377 +++++++++++++++++++++++++++++ pkg/dind/containers.go | 66 +++-- pkg/dind/dind.go | 50 +++- pkg/dind/exec.go | 31 +++ pkg/runtime/runtime.go | 28 +++ 8 files changed, 1191 insertions(+), 16 deletions(-) create mode 100644 docs/arch/dind-bind-translation.md create mode 100644 pkg/dind/bindtranslate.go create mode 100644 pkg/dind/bindtranslate_e2e_test.go create mode 100644 pkg/dind/bindtranslate_test.go diff --git a/docs/arch/dind-bind-translation.md b/docs/arch/dind-bind-translation.md new file mode 100644 index 0000000..d1b9d0f --- /dev/null +++ b/docs/arch/dind-bind-translation.md @@ -0,0 +1,244 @@ +# Dind Bind Mount Translation + +> **Status: implemented in PR `fix/dind-bind-mount-translate`.** Covers the v1 +> shape and the deliberately-deferred follow-ups. + +## Problem + +GitHub Actions workflows can request a per-job container with the +`container:` directive: + +```yaml +jobs: + build: + runs-on: [self-hosted, linux, x64] + container: ghcr.io/your-org/ci-image:latest + steps: + - uses: actions/checkout@v4 + - run: make test +``` + +When this runs on an ephemerd self-hosted runner, the upstream GHA runner +binary handles `container:` by: + +1. `docker pull` the requested image. +2. `docker create` a sibling container with a long bind list: + ``` + -v /home/runner/_work:/__w + -v /home/runner/_work/_temp:/__w/_temp + -v /home/runner/_work/_actions:/__w/_actions + -v /home/runner/_work/_tool:/__w/_tool + -v /home/runner/_work/_temp/_github_home:/github/home + -v /home/runner/_work/_temp/_github_workflow:/github/workflow + -v /home/runner/externals:/__e:ro + -v /var/run/docker.sock:/var/run/docker.sock + ``` +3. `docker start`, then for each step: write the wrapper script to + `/home/runner/_work/_temp/.sh` and `docker exec sh -e + /__w/_temp/.sh`. + +Before this PR, ephemerd's dind shim accepted every `-v` in the API request +but silently dropped any bind whose source did not `os.Stat` on the dind +daemon's filesystem. Because the source paths live inside the runner +container's mount namespace, the dind daemon (running outside that namespace) +saw none of them. Every bind was dropped. The sibling started fine, but the +step's first `docker exec` failed with `sh: 0: cannot open +/__w/_temp/.sh: No such file` — confusing because the wrapper script +exists, just not where the sibling looked. + +This breaks every workflow that uses `container:`. Anthropic-style +workflows, ephpm, and most projects that want a reproducible toolchain +without baking a custom runner image depend on it. + +## Two-container model + +The fix has to acknowledge that at the moment of `docker create`, two +containers exist: + +- **A: the runner container.** Created by `pkg/runtime`, lives in + containerd namespace `"ephemerd"`, owns snapshot `-snapshot` + on the `overlayfs` snapshotter. Inside A, the GHA runner binary writes + workspace files under `/home/runner/_work/...` (upperdir), and reads + pre-baked tools under `/home/runner/externals/...` (lowerdir). +- **B: the sibling about to be created** by the dind shim. Lives in + `s.jobNamespace` (`"ephemerd-dind-"`), will get its own + snapshot. + +A's filesystem decomposes into two distinct categories that need +different handling: + +1. **Overlayfs rootfs:** upperdir (mutable, where the runner's writes + land) plus lowerdirs (immutable image layers). All real paths on the + dind daemon's filesystem, discoverable from + `snapshotter.Mounts(ctx, "-snapshot")`. +2. **Special binds ephemerd installed into A:** + - `/var/run/docker.sock` → `/jobs//docker/d.sock` (the + dind socket file). + - `/etc/hosts`, `/etc/resolv.conf` → per-runner config files written + by `withHostsMount` / `withDNSMount`. + - `r.cfg.RunnerMount` (e.g. `/home/runner/runner`) → `jobRunnerDir` + (the per-job copy of the embedded runner directory, used on Windows + and on custom images). + + These mounts are not in A's snapshot — they're explicit `Type:bind` + entries in A's OCI spec. + +When B asks for `-v /X:/Y`, `/X` is a path in A's mount namespace. To +hand the right thing to containerd as B's bind source, the dind shim has +to translate `/X` to wherever it actually lives on the dind daemon's +filesystem. + +## Resolution policy + +`pkg/dind/bindtranslate.go::translateBindSource` resolves in this order: + +1. **Longest-prefix match against A's bind table.** If `/X` is under a + destination ephemerd installed into A (e.g. `/var/run/docker.sock`), + use the corresponding host source. The leftover suffix is appended. + Longest-prefix wins so a child mount (`/etc/hosts`) is preferred over + a parent (`/etc`). +2. **Upperdir match.** If `/X` exists, B's bind source is that + path. Returned `rw` — A's upperdir is writable, and the GHA `_temp` + case requires the sibling to read the next step's script written + *after* `docker create`, so the directory mount must stay live. +3. **Lowerdir match.** If `/X` only exists in an image layer, B's bind + source is that lowerdir path but the mount is forced `ro`. The + lowerdir is shared with every other container using the same base + image; a rw mount on top of it would corrupt the cache for unrelated + jobs. +4. **No match → error.** Surfaced as HTTP 400 from + `handleContainerCreate`. The pre-fix behavior was to silently drop; + the new behavior fails loudly so the user sees a clear "bind mount + /X -> /Y rejected" instead of a downstream "cannot open". + +`filepath.Clean`/`path.Clean` normalizes `..` before the join, so a +malicious `/home/runner/../../etc/shadow` resolves to `/etc/shadow` and +either falls into A's rootfs (which means the sibling sees A's own +`/etc/shadow` — exactly what A could already see) or fails to resolve at +all. There is no source path that escapes the runner's rootfs envelope. + +## Security envelope + +The sibling B can only see what A could already see. There is no +privilege expansion: + +- Bind table entries point at host paths that ephemerd itself installed + into A. B reaches what A reached, no more. +- Upperdir / lowerdir entries point at A's snapshot. B mounts a path + inside A's rootfs; A already had access to that same content. +- Anything not in the bind table or A's snapshot is rejected. There is + no code path that takes an attacker-supplied `/etc/shadow` or `/` and + hands it to containerd — the silent-drop bug accidentally provided + this property and the loud-fail fix preserves it explicitly. + +The high-risk anti-pattern this design avoids is the standard +"mount the Docker socket into a runner" model, where the dind sees real +host paths and arbitrary `-v` sources are honored. That model is +well-known to be root-on-host. Ours is not, because dind never resolves +sibling sources against the host filesystem directly. + +## Lifecycle + +`pkg/runtime.Destroy` cleans up in this order: + +1. Kill the runner task. +2. Delete the task. +3. Teardown networking. +4. **`env.Dind.Stop()`** — calls `destroyAllContainers()`, which kills + every sibling and deletes them, then drops the per-job containerd + namespace. +5. **`container.Delete(WithSnapshotCleanup)`** — removes A's container + and its snapshot (the upperdir disappears from disk). + +Step 4 runs before step 5, so siblings are gone before A's upperdir is +removed. A sibling cannot end up with a stale bind in normal teardown. +If step 4 fails to fully clean a sibling (containerd wedged, kill +timeout), step 5 still proceeds and the zombie sibling's mount becomes +stale — but since the sibling's task is already killed at that point, +nothing tries to use the stale mount and the leak is bounded to whatever +namespace-cleanup pass eventually reaps it. + +No snapshot lease extension is needed. The earlier draft of this +design proposed leases for siblings outliving the runner; that scenario +doesn't exist in ephemerd's job model. + +## Wiring + +`pkg/runtime/runtime.go`, right after `r.client.NewContainer(...)` succeeds +and before `task.Start(...)`: + +```go +if dindServer != nil && goruntime.GOOS != "windows" { + bindMappings := map[string]string{} + if dindServer.SocketPath() != "" { + bindMappings["/var/run/docker.sock"] = dindServer.SocketPath() + } + hostDataDir := filepath.Dir(r.cfg.LogDir) + bindMappings["/etc/hosts"] = filepath.Join(hostDataDir, "hosts", id+".hosts") + bindMappings["/etc/resolv.conf"] = filepath.Join(hostDataDir, "dns", id+".conf") + if jobRunnerDir != "" && r.cfg.RunnerMount != "" { + bindMappings[r.cfg.RunnerMount] = jobRunnerDir + } + dindServer.SetRunnerRootfs(snapshotName, bindMappings) +} +``` + +`pkg/dind/dind.go::SetRunnerRootfs` stores the snapshot key and a copy of +the bind table on the Server. `pkg/dind/containers.go::buildBindMounts` +consults them when translating each `-v` from +`req.HostConfig.Binds`. The translation runs *before* the OCI spec is +finalized, so a rejection turns into HTTP 400 cleanly. + +## Windows + +The `goruntime.GOOS != "windows"` guard skips the registration only on +the Windows-native runner code path. There are two scenarios: + +- **Linux job on a Windows host.** ephemerd's host daemon spawns a + Hyper-V Linux VM (`pkg/vm/linuxvm_windows.go`); the runner container + inside is created by a *separate* ephemerd process running as Linux + inside the VM. That in-VM process sees `goruntime.GOOS == "linux"`, + so its `pkg/runtime.Create()` registers the rootfs normally. The + translation works. +- **Windows-native job on a Windows host.** Hyper-V isolated Windows + container, `windowsfilter` snapshotter. There is no overlay + upperdir/lowerdir to walk, and Windows bind semantics differ + (`Mount.Type` is empty rather than `"bind"`, `Options` uses different + flags, junctions instead of `rbind`). Translation needs a separate + design. The GHA `container:` directive for `runs-on: windows-*` is + unusual enough that this is deferred. + +## Tests + +`pkg/dind/bindtranslate_test.go`: + +- **9 pure-function tests** for `translateBindSource`: upperdir match + returns rw, lowerdir match forces ro, runner-bind translation + including subpath, longest-prefix wins over parent, unknown source + rejection, relative-path rejection, `..` traversal stays bounded, + upper-over-lower preference (overlay copy-up semantics). +- **3 integration tests** for `Server.buildBindMounts`: the full + 8-bind set from a real GHA `container:` failure log (asserts + docker.sock translation, `_temp` lands in upperdir rw, `externals` + lands in lowerdir ro), unknown source surfaces a 400-shaped error, + no-rootfs-registered rejects rather than silently allowing. + +All tests pass with `CGO_ENABLED=0 go test ./pkg/dind/` and don't +require a real containerd. + +## Deferred follow-ups + +- **Windows-native `container:`.** Different snapshotter and mount + semantics; needs its own translation layer or a clean "not supported" + rejection at request time. +- **Symlink hardening.** `filepath.Clean` handles `..` but not symlinks + that resolve outside the runner rootfs. The current upperdir/lowerdir + walk only honors paths that exist as plain files/dirs within the + layers, so we don't currently *open* the door — but if a future + layer walk were to add `filepath.EvalSymlinks`, the call needs an + after-the-fact prefix check to confirm the resolved path stays inside + the snapshot directory. +- **Resolved-path caching.** Each `buildBindMounts` call queries the + snapshotter and `os.Stat`s every source. A given runner doesn't + change its layers within a job, so the resolution can be cached + per-runner. Not worth optimizing until we see it in a profile. diff --git a/pkg/dind/bindtranslate.go b/pkg/dind/bindtranslate.go new file mode 100644 index 0000000..4c7f25d --- /dev/null +++ b/pkg/dind/bindtranslate.go @@ -0,0 +1,108 @@ +package dind + +import ( + "fmt" + "os" + "path" + "sort" + "strings" +) + +// bindResolution is the outcome of translating a sibling container's -v +// source from the runner container's mount namespace to a real path on the +// dind daemon's filesystem. +type bindResolution struct { + // HostPath is the path the dind daemon will hand to containerd as the + // OCI bind source. It is always on the dind daemon's filesystem. + HostPath string + // ForceReadOnly is set when the source resolved to a shared image + // layer (lowerdir). Writes through that mount would corrupt the + // cached image for every other job using the same base, so the bind + // is downgraded to ro regardless of what the client requested. + ForceReadOnly bool +} + +// translateBindSource maps a bind source path the sibling container received +// (which the runner specified relative to its own mount namespace) to a real +// path on the dind daemon's filesystem. +// +// runnerBinds is a map of (runner mount destination → host source) covering +// non-rootfs mounts ephemerd installed into the runner (/var/run/docker.sock, +// /etc/hosts, /etc/resolv.conf, the embedded runner directory, etc.). +// +// upperdir / lowerdirs come from the runner snapshot's overlayfs mount +// options. upperdir is the mutable layer where the runner's writes land; +// lowerdirs are the shared, read-only image layers. +// +// Resolution order: +// 1. Longest-prefix match against runnerBinds — /var/run/docker.sock and +// anything under a known bind destination is translated via the bind +// table, never re-resolved against the rootfs. +// 2. upperdir match → returned rw. This is the common GHA `container:` +// case: the runner writes /home/runner/_work/_temp/.sh which +// lives in the runner's upperdir, and the sibling needs to read it +// (and the next step's wrapper script likewise needs to land back +// in the same _temp directory, so the mount has to stay rw). +// 3. lowerdir match → returned ro. Image-layer files (e.g. +// /home/runner/externals) are shared across every container using the +// same base image, so a rw mount on top of one would corrupt the +// cache. +// 4. No match → error. The pre-fix shim silently dropped these, which +// surfaced downstream as "cannot open /__w/_temp/.sh". +func translateBindSource(src string, runnerBinds map[string]string, upperdir string, lowerdirs []string) (bindResolution, error) { + // Sources are POSIX paths from the runner's Linux mount namespace; + // use path (not filepath) so this evaluates consistently on Windows + // build hosts during testing. Host-side joins below use filepath + // because the dind daemon's filesystem is native. + if !path.IsAbs(src) { + return bindResolution{}, fmt.Errorf("bind source %q must be absolute", src) + } + cleaned := path.Clean(src) + + if host, suffix, ok := matchBindPrefix(cleaned, runnerBinds); ok { + return bindResolution{HostPath: path.Join(host, suffix)}, nil + } + + if upperdir != "" { + candidate := path.Join(upperdir, cleaned) + if _, err := os.Stat(candidate); err == nil { + return bindResolution{HostPath: candidate}, nil + } + } + + for _, lower := range lowerdirs { + if lower == "" { + continue + } + candidate := path.Join(lower, cleaned) + if _, err := os.Stat(candidate); err == nil { + return bindResolution{HostPath: candidate, ForceReadOnly: true}, nil + } + } + + return bindResolution{}, fmt.Errorf("bind source %q is not visible to ephemerd dind (not in runner rootfs or known bind table)", src) +} + +// matchBindPrefix returns the host source for the longest runnerBinds key +// that contains src, along with the leftover suffix within that bind. +// Longest-prefix wins so a child mount (e.g. /etc/hosts) is preferred over +// a parent (/etc) when both are registered. +func matchBindPrefix(src string, binds map[string]string) (host string, suffix string, ok bool) { + if len(binds) == 0 { + return "", "", false + } + keys := make([]string, 0, len(binds)) + for k := range binds { + keys = append(keys, k) + } + sort.Slice(keys, func(i, j int) bool { return len(keys[i]) > len(keys[j]) }) + for _, k := range keys { + if src == k { + return binds[k], "", true + } + if strings.HasPrefix(src, k+"/") { + return binds[k], strings.TrimPrefix(src, k+"/"), true + } + } + return "", "", false +} diff --git a/pkg/dind/bindtranslate_e2e_test.go b/pkg/dind/bindtranslate_e2e_test.go new file mode 100644 index 0000000..7c8041b --- /dev/null +++ b/pkg/dind/bindtranslate_e2e_test.go @@ -0,0 +1,303 @@ +//go:build !darwin + +package dind + +import ( + "context" + "io" + "log/slog" + "os" + "path/filepath" + goruntime "runtime" + "strings" + "testing" + "time" + + "github.com/containerd/containerd/v2/core/leases" + "github.com/containerd/containerd/v2/core/mount" + "github.com/containerd/containerd/v2/pkg/namespaces" +) + +// TestBindTranslation_RealContainerd is the high-fidelity check that the +// translation actually pairs with a live overlayfs snapshotter. It does NOT +// stub rootfsSearchDirsFn — every layer path is resolved through the real +// snapshot service. +// +// The flow mirrors what happens in production for a GHA `container:` +// workflow on a self-hosted ephemerd runner: +// +// 1. The runtime creates a runner container with an overlay snapshot. +// Here we Prepare the snapshot directly via the snapshotter API and +// stage a marker file into its upperdir, which is the same place the +// real GHA runner writes /home/runner/_work/_temp/.sh. +// 2. SetRunnerRootfs registers the snapshot key with dind. +// 3. The shim is asked to translate the exact bind shape the upstream +// GHA runner emits when handling `container:`. +// 4. The translation has to find the marker via the snapshot's upperdir +// and produce a real on-disk source path. +// +// Without the translation fix this test fails: the legacy +// `os.Stat`-skip path drops every bind whose source isn't on the dind +// daemon's filesystem, and buildBindMounts now returns an error (the +// loud-failure replacement) so the test gets a clear "source not visible" +// instead of the original silent-drop confusion. +func TestBindTranslation_RealContainerd(t *testing.T) { + if testing.Short() { + t.Skip("skipping bind-translation e2e in short mode") + } + if goruntime.GOOS != "linux" { + // The fix is Linux-only (overlayfs snapshotter). Windows-native + // jobs use a different snapshotter and bind model — see arch + // doc, deferred follow-ups. + t.Skipf("bind translation requires overlayfs snapshotter; goos=%s", goruntime.GOOS) + } + + ctrdClient := sharedTestContainerd(t) + log := slog.New(slog.NewTextHandler(io.Discard, &slog.HandlerOptions{Level: slog.LevelInfo})) + + // The runner snapshot lives in the runtime's "ephemerd" namespace — + // runnerRootfsLayers explicitly switches namespaces before consulting + // the snapshotter, so the e2e has to stage it there too. + ctx, cancel := context.WithTimeout(namespaces.WithNamespace(context.Background(), sharedNamespace), 60*time.Second) + defer cancel() + + // Hold a lease so containerd's GC doesn't reap our scratch snapshot + // mid-test. Same pattern as registry_e2e. + lease, err := ctrdClient.LeasesService().Create(ctx, leases.WithExpiration(5*time.Minute)) + if err != nil { + t.Fatalf("create lease: %v", err) + } + t.Cleanup(func() { + if err := ctrdClient.LeasesService().Delete(context.Background(), lease); err != nil { + t.Logf("delete lease: %v", err) + } + }) + ctx = leases.WithLease(ctx, lease.ID) + + snapshotter := ctrdClient.SnapshotService("overlayfs") + if snapshotter == nil { + t.Skip("overlayfs snapshotter unavailable in this containerd build") + } + + const snapshotKey = "bind-translate-e2e" + t.Cleanup(func() { + cleanup, cancelCleanup := context.WithTimeout(context.Background(), 10*time.Second) + defer cancelCleanup() + cleanup = namespaces.WithNamespace(cleanup, sharedNamespace) + if err := snapshotter.Remove(cleanup, snapshotKey); err != nil && !strings.Contains(err.Error(), "not found") { + t.Logf("snapshot cleanup: %v", err) + } + }) + + // Empty parent → empty rootfs. We don't need image content, just the + // upperdir to plant the marker into. + mounts, err := snapshotter.Prepare(ctx, snapshotKey, "") + if err != nil { + t.Fatalf("snapshotter.Prepare: %v", err) + } + upperdir := extractUpperdir(t, mounts) + if upperdir == "" { + t.Fatalf("no upperdir in snapshot mounts: %+v", mounts) + } + + // Plant the directory the GHA runner creates on startup, and a marker + // file inside it standing in for the step's wrapper script. Translation + // will resolve `/home/runner/_work/_temp` against this upperdir and + // produce a bind source that points at this exact file. + if err := os.MkdirAll(filepath.Join(upperdir, "home", "runner", "_work", "_temp"), 0o755); err != nil { + t.Fatalf("plant _temp dir: %v", err) + } + markerPath := filepath.Join(upperdir, "home", "runner", "_work", "_temp", "marker.sh") + const markerBody = "#!/bin/sh\necho hello-from-runner-upperdir\n" + if err := os.WriteFile(markerPath, []byte(markerBody), 0o755); err != nil { + t.Fatalf("write marker: %v", err) + } + + // Bring up dind backed by the same containerd. The job namespace is + // distinct from the runner namespace — that's the same separation + // production uses. + dataDir, err := os.MkdirTemp("", "ephemerd-bind-e2e-*") + if err != nil { + t.Fatalf("temp dir: %v", err) + } + t.Cleanup(func() { + if err := os.RemoveAll(dataDir); err != nil { + t.Logf("cleanup: %v", err) + } + }) + + s, err := New(Config{ + JobID: "bind-translate-e2e", + DataDir: dataDir, + Client: ctrdClient, + Log: log, + }) + if err != nil { + t.Fatalf("dind New: %v", err) + } + + // Wire the runner snapshot the way the runtime does. The bind mappings + // cover the non-rootfs paths ephemerd installs into the runner — we + // only need docker.sock for this test, but supply the rest to match + // the production shape. + socketPath := filepath.Join(dataDir, "jobs", "bind-translate-e2e", "docker", "d.sock") + bindMappings := map[string]string{ + "/var/run/docker.sock": socketPath, + "/etc/hosts": filepath.Join(dataDir, "hosts", "fake.hosts"), + "/etc/resolv.conf": filepath.Join(dataDir, "dns", "fake.conf"), + } + s.SetRunnerRootfs(snapshotKey, bindMappings) + + // Drive buildBindMounts with the exact bind set the upstream GHA + // runner emits for `container:` workflows (verbatim from the ephpm + // failure log). Translation must succeed for every entry — pre-fix + // behavior silently dropped them all. + binds := []string{ + "/var/run/docker.sock:/var/run/docker.sock", + "/home/runner/_work:/__w", + "/home/runner/_work/_temp:/__w/_temp", + "/home/runner/_work/_actions:/__w/_actions", + "/home/runner/_work/_tool:/__w/_tool", + "/home/runner/_work/_temp/_github_home:/github/home", + "/home/runner/_work/_temp/_github_workflow:/github/workflow", + } + // Pre-plant the directories the runner creates so translation can + // resolve them in upperdir. We only planted _temp above; the rest are + // siblings the runner makes at startup. + for _, p := range []string{ + "home/runner/_work", + "home/runner/_work/_actions", + "home/runner/_work/_tool", + "home/runner/_work/_temp/_github_home", + "home/runner/_work/_temp/_github_workflow", + } { + if err := os.MkdirAll(filepath.Join(upperdir, p), 0o755); err != nil { + t.Fatalf("plant %s: %v", p, err) + } + } + + opts, err := s.buildBindMounts(ctx, binds) + if err != nil { + t.Fatalf("buildBindMounts: %v", err) + } + spec := applyOpts(t, opts) + + if len(spec.Mounts) != len(binds) { + t.Fatalf("got %d mounts, want %d", len(spec.Mounts), len(binds)) + } + + byDest := map[string]string{} + for _, m := range spec.Mounts { + byDest[m.Destination] = m.Source + } + + // docker.sock should route to the per-job socket path the bind + // mappings registered, not to anything inside the snapshot. + if got := byDest["/var/run/docker.sock"]; got != socketPath { + t.Errorf("docker.sock translated to %q, want %q", got, socketPath) + } + + // _temp must resolve into the snapshot upperdir, and the marker file + // must be reachable from that path — proves the snapshot's actual + // on-disk layout is what translation hands to containerd. + tempSrc := byDest["/__w/_temp"] + wantTempPrefix := filepath.Join(upperdir, "home", "runner", "_work", "_temp") + if !strings.HasPrefix(filepath.Clean(tempSrc), filepath.Clean(wantTempPrefix)) { + t.Errorf("_temp source %q does not point into upperdir %q", tempSrc, wantTempPrefix) + } + gotMarker, err := os.ReadFile(filepath.Join(tempSrc, "marker.sh")) + if err != nil { + t.Fatalf("read marker through translated bind source: %v", err) + } + if string(gotMarker) != markerBody { + t.Errorf("marker round-trip: got %q, want %q", string(gotMarker), markerBody) + } +} + +// TestBindTranslation_RejectsForeignSource is the loud-failure regression +// guard. Against today's main this would *pass silently* — the legacy code +// drops the bind and continues, leaving the test no way to notice. Against +// the fix, `/etc/shadow` is not in the runner rootfs or bind table, so +// buildBindMounts returns an error that the handler will surface as 400. +func TestBindTranslation_RejectsForeignSource(t *testing.T) { + if testing.Short() { + t.Skip("skipping bind-translation e2e in short mode") + } + if goruntime.GOOS != "linux" { + t.Skipf("bind translation requires overlayfs snapshotter; goos=%s", goruntime.GOOS) + } + + ctrdClient := sharedTestContainerd(t) + log := slog.New(slog.NewTextHandler(io.Discard, &slog.HandlerOptions{Level: slog.LevelInfo})) + + ctx, cancel := context.WithTimeout(namespaces.WithNamespace(context.Background(), sharedNamespace), 30*time.Second) + defer cancel() + lease, err := ctrdClient.LeasesService().Create(ctx, leases.WithExpiration(2*time.Minute)) + if err != nil { + t.Fatalf("create lease: %v", err) + } + t.Cleanup(func() { + if err := ctrdClient.LeasesService().Delete(context.Background(), lease); err != nil { + t.Logf("delete lease: %v", err) + } + }) + ctx = leases.WithLease(ctx, lease.ID) + + snapshotter := ctrdClient.SnapshotService("overlayfs") + if snapshotter == nil { + t.Skip("overlayfs snapshotter unavailable in this containerd build") + } + const snapshotKey = "bind-translate-reject-e2e" + t.Cleanup(func() { + cleanup, cancelCleanup := context.WithTimeout(context.Background(), 10*time.Second) + defer cancelCleanup() + cleanup = namespaces.WithNamespace(cleanup, sharedNamespace) + if err := snapshotter.Remove(cleanup, snapshotKey); err != nil && !strings.Contains(err.Error(), "not found") { + t.Logf("snapshot cleanup: %v", err) + } + }) + if _, err := snapshotter.Prepare(ctx, snapshotKey, ""); err != nil { + t.Fatalf("snapshotter.Prepare: %v", err) + } + + dataDir, err := os.MkdirTemp("", "ephemerd-bind-reject-*") + if err != nil { + t.Fatalf("temp dir: %v", err) + } + t.Cleanup(func() { _ = os.RemoveAll(dataDir) }) + + s, err := New(Config{ + JobID: "bind-reject-e2e", + DataDir: dataDir, + Client: ctrdClient, + Log: log, + }) + if err != nil { + t.Fatalf("dind New: %v", err) + } + s.SetRunnerRootfs(snapshotKey, nil) + + _, err = s.buildBindMounts(ctx, []string{"/etc/shadow:/x"}) + if err == nil { + t.Fatal("expected error rejecting /etc/shadow, got nil — silent-drop regression") + } + if !strings.Contains(err.Error(), "/etc/shadow") { + t.Errorf("error %q should name the offending source", err) + } +} + +// extractUpperdir pulls the upperdir path out of containerd's overlayfs +// mount options. Format: comma-separated "upperdir=" within Options. +func extractUpperdir(t *testing.T, mounts []mount.Mount) string { + t.Helper() + for _, m := range mounts { + for _, opt := range m.Options { + for _, part := range strings.Split(opt, ",") { + if strings.HasPrefix(part, "upperdir=") { + return strings.TrimPrefix(part, "upperdir=") + } + } + } + } + return "" +} diff --git a/pkg/dind/bindtranslate_test.go b/pkg/dind/bindtranslate_test.go new file mode 100644 index 0000000..08ca2c3 --- /dev/null +++ b/pkg/dind/bindtranslate_test.go @@ -0,0 +1,377 @@ +package dind + +import ( + "context" + "io" + "log/slog" + "os" + "path" + "path/filepath" + "strings" + "sync" + "testing" + + "github.com/containerd/containerd/v2/pkg/oci" + ocispec "github.com/opencontainers/runtime-spec/specs-go" +) + +// applyOpts invokes a list of oci.SpecOpts against an empty spec so tests +// can assert what they produced. withBindMount and friends don't touch the +// oci.Client / containers.Container args, so nil values are fine. +func applyOpts(t *testing.T, opts []oci.SpecOpts) *ocispec.Spec { + t.Helper() + spec := &ocispec.Spec{} + for _, opt := range opts { + if err := opt(context.Background(), nil, nil, spec); err != nil { + t.Fatalf("apply opt: %v", err) + } + } + return spec +} + +func testServer() *Server { + return &Server{ + log: slog.New(slog.NewTextHandler(io.Discard, nil)), + mu: sync.Mutex{}, + } +} + +// TestTranslateBindSource_UpperdirMatch_ReturnsReadWrite covers the canonical +// GHA `container:` case: the runner writes /home/runner/_work/_temp/.sh +// to its upperdir, then asks dind to mount /home/runner/_work/_temp into a +// sibling container. The sibling must see the file *and* be able to write +// the next step's wrapper script back into the same directory. +func TestTranslateBindSource_UpperdirMatch_ReturnsReadWrite(t *testing.T) { + upper := t.TempDir() + if err := os.MkdirAll(filepath.Join(upper, "home", "runner", "_work", "_temp"), 0o755); err != nil { + t.Fatalf("planting upperdir: %v", err) + } + + got, err := translateBindSource("/home/runner/_work/_temp", nil, upper, nil) + if err != nil { + t.Fatalf("translate: %v", err) + } + want := path.Join(upper, "home/runner/_work/_temp") + if got.HostPath != want { + t.Errorf("HostPath = %q, want %q", got.HostPath, want) + } + if got.ForceReadOnly { + t.Error("ForceReadOnly = true, want false (upperdir is writable)") + } +} + +// TestTranslateBindSource_LowerdirMatch_ForcesReadOnly guards the security +// property that image-layer files cannot be made writable from a sibling. +// /home/runner/externals lives in the runner image's lowerdir and is shared +// across every container using that base image — a rw mount would corrupt +// the cache for every other job. +func TestTranslateBindSource_LowerdirMatch_ForcesReadOnly(t *testing.T) { + lower := t.TempDir() + if err := os.MkdirAll(filepath.Join(lower, "home", "runner", "externals"), 0o755); err != nil { + t.Fatalf("planting lowerdir: %v", err) + } + + got, err := translateBindSource("/home/runner/externals", nil, t.TempDir(), []string{lower}) + if err != nil { + t.Fatalf("translate: %v", err) + } + want := path.Join(lower, "home/runner/externals") + if got.HostPath != want { + t.Errorf("HostPath = %q, want %q", got.HostPath, want) + } + if !got.ForceReadOnly { + t.Error("ForceReadOnly = false, want true (image layer must stay immutable)") + } +} + +// TestTranslateBindSource_RunnerBind_Translates covers the special case of +// /var/run/docker.sock — that path inside the runner is itself a bind mount +// ephemerd installed (it points at the per-job dind socket file). The +// sibling's -v /var/run/docker.sock:/var/run/docker.sock must redirect to +// the actual socket path on the dind daemon's filesystem. +func TestTranslateBindSource_RunnerBind_Translates(t *testing.T) { + binds := map[string]string{ + "/var/run/docker.sock": "/run/ephemerd/jobs/abc/docker/d.sock", + } + got, err := translateBindSource("/var/run/docker.sock", binds, "", nil) + if err != nil { + t.Fatalf("translate: %v", err) + } + if got.HostPath != "/run/ephemerd/jobs/abc/docker/d.sock" { + t.Errorf("HostPath = %q, want translated socket path", got.HostPath) + } + if got.ForceReadOnly { + t.Error("ForceReadOnly = true on runner-bind translation; that path category should preserve writability") + } +} + +// TestTranslateBindSource_RunnerBindSubpath_Translates handles sibling +// requests like -v /workspace/foo:/x when the runner has /workspace bound +// to a host scratch dir. The leftover suffix must be appended to the host +// source. +func TestTranslateBindSource_RunnerBindSubpath_Translates(t *testing.T) { + binds := map[string]string{"/workspace": "/srv/ephemerd/scratch"} + got, err := translateBindSource("/workspace/foo/bar", binds, "", nil) + if err != nil { + t.Fatalf("translate: %v", err) + } + want := "/srv/ephemerd/scratch/foo/bar" + if got.HostPath != want { + t.Errorf("HostPath = %q, want %q", got.HostPath, want) + } +} + +// TestTranslateBindSource_LongestPrefixWins guards against a parent bind +// (/etc) shadowing a child bind (/etc/hosts) when both are registered. +// Order in the map is unspecified, so the function has to sort by length. +func TestTranslateBindSource_LongestPrefixWins(t *testing.T) { + binds := map[string]string{ + "/etc": "/host/etc", + "/etc/hosts": "/host/etc/hosts.runtime", + } + got, err := translateBindSource("/etc/hosts", binds, "", nil) + if err != nil { + t.Fatalf("translate: %v", err) + } + if got.HostPath != "/host/etc/hosts.runtime" { + t.Errorf("HostPath = %q — longest prefix /etc/hosts should win over /etc", got.HostPath) + } +} + +// TestTranslateBindSource_NotInRootfs_Rejects asserts the post-fix +// guarantee: bind sources that the dind daemon cannot honor produce an +// error (which surfaces as HTTP 400 at the API layer) instead of silently +// dropping the mount and leaving the workflow to fail with a confusing +// "cannot open" downstream. +func TestTranslateBindSource_NotInRootfs_Rejects(t *testing.T) { + _, err := translateBindSource("/etc/shadow", nil, t.TempDir(), []string{t.TempDir()}) + if err == nil { + t.Fatal("expected error rejecting unknown bind source, got nil") + } +} + +// TestTranslateBindSource_RelativePath_Rejects keeps the contract simple — +// every source coming from a `docker create -v` is normalized to an +// absolute path by the Docker CLI, so a relative path here is a bug +// somewhere upstream. +func TestTranslateBindSource_RelativePath_Rejects(t *testing.T) { + _, err := translateBindSource("relative/path", nil, t.TempDir(), nil) + if err == nil { + t.Fatal("expected error on non-absolute source, got nil") + } +} + +// TestTranslateBindSource_DotDotTraversal_StaysInsideUpperdir guards +// against parent traversal escaping upperdir. path.Clean resolves `..` +// within the source path before any join: /home/runner/../foo cleans to +// /home/foo, /.. cleans to /, etc. There is no source path that path.Clean +// turns into anything starting with `..`, so filepath.Join always plants +// the candidate inside the upperdir tree. +func TestTranslateBindSource_DotDotTraversal_StaysInsideUpperdir(t *testing.T) { + upper := t.TempDir() + // path.Clean("/home/runner/../home/etc/shadow") = /home/etc/shadow. + // Plant the file where the cleaned path will land. + if err := os.MkdirAll(filepath.Join(upper, "home", "etc"), 0o755); err != nil { + t.Fatal(err) + } + if err := os.WriteFile(filepath.Join(upper, "home", "etc", "shadow"), []byte("fake"), 0o644); err != nil { + t.Fatal(err) + } + got, err := translateBindSource("/home/runner/../etc/shadow", nil, upper, nil) + if err != nil { + t.Fatalf("translate: %v", err) + } + want := path.Join(upper, "home/etc/shadow") + if got.HostPath != want { + t.Errorf("HostPath = %q, want %q (must stay inside upperdir tree)", got.HostPath, want) + } + + // A path that climbs above /: path.Clean(/../../etc/shadow) = /etc/shadow. + // Resolution is bounded — even a malicious `..` chain can't escape /. + // Since we never planted /etc/shadow in upperdir, this should reject. + if _, err := translateBindSource("/../../../etc/shadow", nil, upper, nil); err == nil { + t.Error("expected rejection of climb-above-root traversal, got success") + } +} + +// TestTranslateBindSource_PreferUpperOverLower asserts that when a path +// exists in both upperdir and a lowerdir (overlay copy-up), the upperdir +// copy wins and the mount stays writable. This is the standard overlayfs +// semantic; the sibling must see the runner's modified version, not the +// pristine image layer. +func TestTranslateBindSource_PreferUpperOverLower(t *testing.T) { + upper := t.TempDir() + lower := t.TempDir() + rel := filepath.Join("home", "runner", "_work", "config") + if err := os.MkdirAll(filepath.Join(upper, "home", "runner", "_work"), 0o755); err != nil { + t.Fatal(err) + } + if err := os.MkdirAll(filepath.Join(lower, "home", "runner", "_work"), 0o755); err != nil { + t.Fatal(err) + } + if err := os.WriteFile(filepath.Join(upper, rel), []byte("modified"), 0o644); err != nil { + t.Fatal(err) + } + if err := os.WriteFile(filepath.Join(lower, rel), []byte("original"), 0o644); err != nil { + t.Fatal(err) + } + + got, err := translateBindSource("/"+filepath.ToSlash(rel), nil, upper, []string{lower}) + if err != nil { + t.Fatalf("translate: %v", err) + } + want := path.Join(upper, filepath.ToSlash(rel)) + if got.HostPath != want { + t.Errorf("HostPath = %q, want upperdir copy %q", got.HostPath, want) + } + if got.ForceReadOnly { + t.Error("ForceReadOnly = true, want false — upperdir copy is writable") + } +} + +// TestBuildBindMounts_GHARunnerContainer is the canonical regression test +// for the ephpm `container:` failure. The GHA runner inside a job +// container asks dind to create a sibling with the exact bind set the +// upstream runner emits — workspace, _temp, _actions, _tool, externals, +// _github_home, _github_workflow, and the docker socket. Every source +// must translate correctly; the pre-fix shim silently dropped every one +// of them and the sibling exec'd against an empty mountpoint. +func TestBuildBindMounts_GHARunnerContainer(t *testing.T) { + upper := t.TempDir() + lower := t.TempDir() + // Plant the directories the runner creates on startup. _work and + // children live in upperdir (mutable). externals lives in the image + // (lowerdir). + for _, p := range []string{ + "home/runner/_work", + "home/runner/_work/_temp", + "home/runner/_work/_actions", + "home/runner/_work/_tool", + "home/runner/_work/_temp/_github_home", + "home/runner/_work/_temp/_github_workflow", + } { + if err := os.MkdirAll(filepath.Join(upper, p), 0o755); err != nil { + t.Fatalf("plant upperdir %s: %v", p, err) + } + } + if err := os.MkdirAll(filepath.Join(lower, "home/runner/externals"), 0o755); err != nil { + t.Fatalf("plant lowerdir externals: %v", err) + } + + s := testServer() + s.runnerSnapshotKey = "runner-snapshot" + s.runnerBindMappings = map[string]string{ + "/var/run/docker.sock": "/run/ephemerd/jobs/x/docker/d.sock", + } + s.rootfsSearchDirsFn = func(_ context.Context, key string) ([]string, error) { + if key != "runner-snapshot" { + t.Fatalf("unexpected snapshot key %q", key) + } + return []string{upper, lower}, nil + } + + binds := []string{ + "/var/run/docker.sock:/var/run/docker.sock", + "/home/runner/_work:/__w", + "/home/runner/externals:/__e:ro", + "/home/runner/_work/_temp:/__w/_temp", + "/home/runner/_work/_actions:/__w/_actions", + "/home/runner/_work/_tool:/__w/_tool", + "/home/runner/_work/_temp/_github_home:/github/home", + "/home/runner/_work/_temp/_github_workflow:/github/workflow", + } + opts, err := s.buildBindMounts(context.Background(), binds) + if err != nil { + t.Fatalf("buildBindMounts: %v", err) + } + spec := applyOpts(t, opts) + + if len(spec.Mounts) != len(binds) { + t.Fatalf("got %d mounts, want %d", len(spec.Mounts), len(binds)) + } + + // Quick lookup of mounts by destination. + byDest := map[string]ocispec.Mount{} + for _, m := range spec.Mounts { + byDest[m.Destination] = m + } + + // /var/run/docker.sock must redirect to the per-job socket path — + // that's how the sibling reaches dind back, and the path inside the + // runner is itself a bind ephemerd installed. + if got := byDest["/var/run/docker.sock"].Source; got != "/run/ephemerd/jobs/x/docker/d.sock" { + t.Errorf("docker.sock source = %q, want translated socket path", got) + } + + // _temp must land in upperdir and stay rw — the runner writes + // _temp/.sh between docker create and docker exec, and the + // sibling needs to read it. + tempMount, ok := byDest["/__w/_temp"] + if !ok { + t.Fatal("missing /__w/_temp mount") + } + wantTemp := path.Join(upper, "home/runner/_work/_temp") + if tempMount.Source != wantTemp { + t.Errorf("_temp source = %q, want %q", tempMount.Source, wantTemp) + } + if !containsOpt(tempMount.Options, "rw") { + t.Errorf("_temp options = %v, want rw (runner writes its wrapper script here)", tempMount.Options) + } + + // externals is in the image lowerdir; it must be ro (the image is + // shared across every container using this base — a rw mount could + // corrupt the cached layer for unrelated jobs). + extMount, ok := byDest["/__e"] + if !ok { + t.Fatal("missing /__e mount") + } + wantExt := path.Join(lower, "home/runner/externals") + if extMount.Source != wantExt { + t.Errorf("externals source = %q, want %q", extMount.Source, wantExt) + } + if !containsOpt(extMount.Options, "ro") { + t.Errorf("externals options = %v, want ro (image layers must be immutable)", extMount.Options) + } +} + +// TestBuildBindMounts_RejectsUnknownSource is the regression guard for +// the silent-drop bug. Anything the translator can't honor must surface +// as a returned error so the caller can write a 400 — not be quietly +// skipped, which leaves the workflow to fail later with a confusing +// "cannot open" message. +func TestBuildBindMounts_RejectsUnknownSource(t *testing.T) { + s := testServer() + s.runnerSnapshotKey = "runner-snapshot" + s.rootfsSearchDirsFn = func(_ context.Context, _ string) ([]string, error) { + return []string{t.TempDir()}, nil + } + + _, err := s.buildBindMounts(context.Background(), []string{"/etc/shadow:/x"}) + if err == nil { + t.Fatal("expected error rejecting unknown bind source, got nil") + } + if !strings.Contains(err.Error(), "/etc/shadow") { + t.Errorf("error %q should name the offending source", err) + } +} + +// TestBuildBindMounts_NoRunnerRegistered keeps the failure mode clear +// when SetRunnerRootfs hasn't been called yet (shouldn't happen in +// production but worth guarding). With no rootfs, any source that isn't +// already in the runner-bind table gets rejected loudly. +func TestBuildBindMounts_NoRunnerRegistered(t *testing.T) { + s := testServer() + _, err := s.buildBindMounts(context.Background(), []string{"/home/runner/_work/_temp:/x"}) + if err == nil { + t.Fatal("expected error when runner rootfs is unregistered, got nil") + } +} + +func containsOpt(opts []string, want string) bool { + for _, o := range opts { + if o == want { + return true + } + } + return false +} diff --git a/pkg/dind/containers.go b/pkg/dind/containers.go index b3cc446..d817701 100644 --- a/pkg/dind/containers.go +++ b/pkg/dind/containers.go @@ -395,22 +395,21 @@ func (s *Server) handleContainerCreate(w http.ResponseWriter, r *http.Request) { opts = append(opts, oci.WithNamespacedCgroup()) } - // Bind mounts. Skip sources that don't exist rather than failing. - for _, bind := range req.HostConfig.Binds { - bindParts := strings.SplitN(bind, ":", 3) - if len(bindParts) >= 2 { - src := bindParts[0] - if _, err := os.Stat(src); os.IsNotExist(err) { - s.log.Debug("skipping bind mount, source does not exist", "source", src, "dest", bindParts[1]) - continue - } - mountOpts := []string{"rbind", "rw"} - if len(bindParts) == 3 && bindParts[2] == "ro" { - mountOpts = []string{"rbind", "ro"} - } - opts = append(opts, withBindMount(src, bindParts[1], mountOpts)) - } + // Bind mounts. Sources arrive from inside the runner container's + // mount namespace — e.g. /home/runner/_work/_temp where the GHA + // runner writes its step wrapper scripts. Those paths don't exist + // on the dind daemon's filesystem; we have to translate them to + // the runner snapshot's overlayfs upperdir (or to the host source + // of a non-rootfs bind ephemerd installed into the runner, like + // /var/run/docker.sock). The pre-fix shim silently dropped any + // bind whose source didn't os.Stat — leaving GHA `container:` jobs + // failing downstream with "sh: cannot open /__w/_temp/.sh". + bindOpts, berr := s.buildBindMounts(r.Context(), req.HostConfig.Binds) + if berr != nil { + writeJSON(w, http.StatusBadRequest, map[string]string{"message": berr.Error()}) + return } + opts = append(opts, bindOpts...) // tmpfs mounts (--tmpfs /tmp, --tmpfs /run). for dest, options := range req.HostConfig.Tmpfs { @@ -1347,6 +1346,43 @@ func generateContainerID() string { return hex.EncodeToString(b) } +// buildBindMounts translates every Docker -v entry from the runner's +// mount namespace into OCI bind mount spec options. Returns a user-facing +// error (and no opts) on the first untranslatable source so the caller can +// surface HTTP 400 — the pre-fix shim silently dropped these, which left +// GHA `container:` jobs to fail downstream with confusing "cannot open" +// errors. See translateBindSource for the resolution policy. +func (s *Server) buildBindMounts(ctx context.Context, binds []string) ([]oci.SpecOpts, error) { + upperdir, lowerdirs, layerErr := s.runnerRootfsLayers(ctx) + if layerErr != nil { + s.log.Warn("could not load runner rootfs layers for bind translation", "error", layerErr) + } + s.mu.Lock() + runnerBinds := s.runnerBindMappings + s.mu.Unlock() + + out := make([]oci.SpecOpts, 0, len(binds)) + for _, bind := range binds { + parts := strings.SplitN(bind, ":", 3) + if len(parts) < 2 { + continue + } + src, dst := parts[0], parts[1] + requestedRO := len(parts) == 3 && parts[2] == "ro" + + resolved, terr := translateBindSource(src, runnerBinds, upperdir, lowerdirs) + if terr != nil { + return nil, fmt.Errorf("bind mount %s -> %s rejected: %w", src, dst, terr) + } + mountOpts := []string{"rbind", "rw"} + if requestedRO || resolved.ForceReadOnly { + mountOpts = []string{"rbind", "ro"} + } + out = append(out, withBindMount(resolved.HostPath, dst, mountOpts)) + } + return out, nil +} + func withBindMount(src, dst string, options []string) oci.SpecOpts { return func(_ context.Context, _ oci.Client, _ *containers.Container, s *ocispec.Spec) error { if s.Mounts == nil { diff --git a/pkg/dind/dind.go b/pkg/dind/dind.go index 53905a6..1d731e6 100644 --- a/pkg/dind/dind.go +++ b/pkg/dind/dind.go @@ -54,7 +54,24 @@ type Server struct { buildkit *buildkit.Server // shared embedded BuildKit solver (nil → fall back to platform default) runnerNetNS string // path to runner container's net namespace; used to install DNAT rules for port bindings allowPrivileged bool // gate for docker run --privileged / --cap-add; see config.DindConfig.AllowPrivileged - log *slog.Logger + + // runnerSnapshotKey identifies the containerd snapshot backing the + // runner container that owns this dind socket. Used to resolve sibling + // -v bind sources from the runner's mount namespace (e.g. + // /home/runner/_work/_temp) to the real overlayfs upperdir/lowerdir + // paths on the dind daemon's filesystem. The snapshot lives in the + // runtime's "ephemerd" namespace, not in s.jobNamespace. Empty until + // SetRunnerRootfs is called. + runnerSnapshotKey string + // runnerBindMappings is a copy of the non-rootfs bind table ephemerd + // installed into the runner: /var/run/docker.sock → the per-job dind + // socket file, /etc/hosts and /etc/resolv.conf → per-runner files, + // the runner-mount directory → its job-specific copy on host. Keyed + // by container destination, value is host source. Consulted before + // the rootfs walk in translateBindSource. + runnerBindMappings map[string]string + + log *slog.Logger mu sync.Mutex images map[string]*imageEntry // in-memory image store scoped to this job @@ -180,6 +197,37 @@ func (s *Server) SetRunnerNetNS(netnsPath string) { s.runnerNetNS = netnsPath } +// SetRunnerRootfs registers the runner container's snapshot and the +// non-rootfs bind table ephemerd installed into it, so that subsequent +// docker create requests from inside the runner can have their -v sources +// translated from the runner's mount namespace to real host paths. +// +// snapshotKey is the containerd snapshot name (typically +// "-snapshot" in the runtime's "ephemerd" namespace). +// +// bindMappings keys are container destination paths (what the runner sees, +// e.g. "/var/run/docker.sock"); values are host source paths (what the dind +// daemon hands to containerd, e.g. "/jobs//docker/d.sock"). +// The map is copied so the caller may continue to mutate it. +// +// Must be called after the runner container is created (so the snapshot +// exists) and before any docker create from inside the runner. The runtime +// pairs this call with SetRunnerNetNS at the same point in startup. +func (s *Server) SetRunnerRootfs(snapshotKey string, bindMappings map[string]string) { + s.mu.Lock() + defer s.mu.Unlock() + s.runnerSnapshotKey = snapshotKey + if len(bindMappings) == 0 { + s.runnerBindMappings = nil + return + } + cp := make(map[string]string, len(bindMappings)) + for k, v := range bindMappings { + cp[k] = v + } + s.runnerBindMappings = cp +} + // Endpoint returns the value a container should set DOCKER_HOST to in order // to reach this fake daemon. Linux/macOS return the unix socket path directly // (e.g. "unix:///var/run/docker.sock" once bind-mounted); Windows returns a diff --git a/pkg/dind/exec.go b/pkg/dind/exec.go index 8e7c488..0a2b2d9 100644 --- a/pkg/dind/exec.go +++ b/pkg/dind/exec.go @@ -732,6 +732,37 @@ func (s *Server) copyToViaExec(w http.ResponseWriter, r *http.Request, entry *co w.WriteHeader(http.StatusOK) } +// runnerRootfsLayers returns the runner container's overlayfs layers split +// into the writable upperdir and the read-only lowerdirs. Used by +// translateBindSource to map sibling -v sources from the runner's mount +// namespace to real host paths. +// +// The runner snapshot lives in the runtime's "ephemerd" namespace (it was +// created by pkg/runtime, not by dind), so this helper switches namespaces +// before consulting the snapshotter. Returns ("", nil, nil) when no runner +// rootfs has been registered yet — callers treat that as "rootfs empty", +// which makes every non-runner-bind source fail translation with a clear +// error rather than silently dropping the mount. +func (s *Server) runnerRootfsLayers(ctx context.Context) (upperdir string, lowerdirs []string, err error) { + s.mu.Lock() + key := s.runnerSnapshotKey + s.mu.Unlock() + if key == "" { + return "", nil, nil + } + runnerCtx := namespaces.WithNamespace(ctx, sharedNamespace) + dirs, err := s.rootfsSearchDirs(runnerCtx, key) + if err != nil { + return "", nil, err + } + if len(dirs) == 0 { + return "", nil, nil + } + // rootfsSearchDirs returns upperdir first, then lowerdirs in order. + // In test stubs that return a single entry, treat it as upperdir-only. + return dirs[0], dirs[1:], nil +} + // rootfsSearchDirs returns the host-filesystem directories whose union forms // the container's merged rootfs view — upperdir first (container-written // files take precedence), then each lowerdir. Used by both HEAD and GET diff --git a/pkg/runtime/runtime.go b/pkg/runtime/runtime.go index 21078fe..0c1b96a 100644 --- a/pkg/runtime/runtime.go +++ b/pkg/runtime/runtime.go @@ -805,6 +805,34 @@ func (r *Runtime) Create(ctx context.Context, cfg CreateConfig) (*RunnerEnv, err return nil, fmt.Errorf("creating container %s: %w", id, err) } + // Register the runner snapshot + the non-rootfs bind table with the + // dind server so it can translate sibling `-v` sources from the + // runner's mount namespace to real host paths. Without this, the GHA + // runner's `container:` flow asks dind for binds like + // `/home/runner/_work/_temp` that don't exist on the dind daemon's + // filesystem — the shim used to silently drop them and the resulting + // `docker exec sh -e /__w/_temp/.sh` failed with "cannot open". + // + // The GOOS guard only skips the Windows-native job path (Hyper-V + // isolated Windows containers with the "windowsfilter" snapshotter — + // no overlay upperdir to walk, and bind semantics differ enough that + // translation needs a separate design). Linux jobs on Windows hosts + // reach this code via the in-VM ephemerd process running as Linux, + // so they take the registration branch normally. + if dindServer != nil && goruntime.GOOS != "windows" { + bindMappings := map[string]string{} + if dindServer.SocketPath() != "" { + bindMappings["/var/run/docker.sock"] = dindServer.SocketPath() + } + hostDataDir := filepath.Dir(r.cfg.LogDir) + bindMappings["/etc/hosts"] = filepath.Join(hostDataDir, "hosts", id+".hosts") + bindMappings["/etc/resolv.conf"] = filepath.Join(hostDataDir, "dns", id+".conf") + if jobRunnerDir != "" && r.cfg.RunnerMount != "" { + bindMappings[r.cfg.RunnerMount] = jobRunnerDir + } + dindServer.SetRunnerRootfs(snapshotName, bindMappings) + } + // Create and start the task with per-job log capture. // On Windows, cio.LogFile uses file:// URIs which runhcs rejects // (it only accepts binary:// scheme), and cio.WithStdio fails with From 4359f7df915bab979112c05241ed89280a5ada11 Mon Sep 17 00:00:00 2001 From: Luther Monson Date: Tue, 26 May 2026 22:15:00 -0600 Subject: [PATCH 2/2] fix(dind): stage committed parent snapshot for bind-translation e2e MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Prepare(ctx, key, "") with an empty parent returns a plain bind mount, not an overlay — so the e2e never saw an upperdir= option to extract. Stage an empty committed parent first, then Prepare on top, so the active snapshot is a real overlayfs mount with upperdir/lowerdir layout the translator can walk. Also fix the lease-delete callback to set the runtime namespace before calling the LeasesService, which was failing on cleanup with "namespace is required". --- pkg/dind/bindtranslate_e2e_test.go | 37 +++++++++++++++++++++++------- 1 file changed, 29 insertions(+), 8 deletions(-) diff --git a/pkg/dind/bindtranslate_e2e_test.go b/pkg/dind/bindtranslate_e2e_test.go index 7c8041b..f88ae3d 100644 --- a/pkg/dind/bindtranslate_e2e_test.go +++ b/pkg/dind/bindtranslate_e2e_test.go @@ -68,7 +68,8 @@ func TestBindTranslation_RealContainerd(t *testing.T) { t.Fatalf("create lease: %v", err) } t.Cleanup(func() { - if err := ctrdClient.LeasesService().Delete(context.Background(), lease); err != nil { + ctx := namespaces.WithNamespace(context.Background(), sharedNamespace) + if err := ctrdClient.LeasesService().Delete(ctx, lease); err != nil { t.Logf("delete lease: %v", err) } }) @@ -79,19 +80,34 @@ func TestBindTranslation_RealContainerd(t *testing.T) { t.Skip("overlayfs snapshotter unavailable in this containerd build") } - const snapshotKey = "bind-translate-e2e" + const ( + parentPrepKey = "bind-translate-e2e-parent-prep" + parentKey = "bind-translate-e2e-parent" + snapshotKey = "bind-translate-e2e" + ) t.Cleanup(func() { cleanup, cancelCleanup := context.WithTimeout(context.Background(), 10*time.Second) defer cancelCleanup() cleanup = namespaces.WithNamespace(cleanup, sharedNamespace) - if err := snapshotter.Remove(cleanup, snapshotKey); err != nil && !strings.Contains(err.Error(), "not found") { - t.Logf("snapshot cleanup: %v", err) + for _, k := range []string{snapshotKey, parentKey, parentPrepKey} { + if err := snapshotter.Remove(cleanup, k); err != nil && !strings.Contains(err.Error(), "not found") { + t.Logf("snapshot cleanup %s: %v", k, err) + } } }) - // Empty parent → empty rootfs. We don't need image content, just the - // upperdir to plant the marker into. - mounts, err := snapshotter.Prepare(ctx, snapshotKey, "") + // Stage an empty committed parent so the active snapshot on top is + // a real overlay mount with upperdir= / lowerdir= options. A direct + // Prepare(..., "") returns a plain bind to the snapshot fs dir, + // which has no overlayfs layout for translation to find. + if _, err := snapshotter.Prepare(ctx, parentPrepKey, ""); err != nil { + t.Fatalf("prepare parent: %v", err) + } + if err := snapshotter.Commit(ctx, parentKey, parentPrepKey); err != nil { + t.Fatalf("commit parent: %v", err) + } + + mounts, err := snapshotter.Prepare(ctx, snapshotKey, parentKey) if err != nil { t.Fatalf("snapshotter.Prepare: %v", err) } @@ -237,7 +253,8 @@ func TestBindTranslation_RejectsForeignSource(t *testing.T) { t.Fatalf("create lease: %v", err) } t.Cleanup(func() { - if err := ctrdClient.LeasesService().Delete(context.Background(), lease); err != nil { + ctx := namespaces.WithNamespace(context.Background(), sharedNamespace) + if err := ctrdClient.LeasesService().Delete(ctx, lease); err != nil { t.Logf("delete lease: %v", err) } }) @@ -247,6 +264,10 @@ func TestBindTranslation_RejectsForeignSource(t *testing.T) { if snapshotter == nil { t.Skip("overlayfs snapshotter unavailable in this containerd build") } + // The rejection path doesn't need an overlay mount — buildBindMounts + // fails out at the bind-table / rootfs lookup before any layer walk + // matters. A bare snapshot is enough; we only need the key to be + // registered so SetRunnerRootfs takes the on-path branch. const snapshotKey = "bind-translate-reject-e2e" t.Cleanup(func() { cleanup, cancelCleanup := context.WithTimeout(context.Background(), 10*time.Second)