diff --git a/docs/arch/dind-bind-translation.md b/docs/arch/dind-bind-translation.md index d1b9d0f..c13ccb6 100644 --- a/docs/arch/dind-bind-translation.md +++ b/docs/arch/dind-bind-translation.md @@ -95,23 +95,38 @@ filesystem. 1. **Longest-prefix match against A's bind table.** If `/X` is under a destination ephemerd installed into A (e.g. `/var/run/docker.sock`), use the corresponding host source. The leftover suffix is appended. - Longest-prefix wins so a child mount (`/etc/hosts`) is preferred over - a parent (`/etc`). -2. **Upperdir match.** If `/X` exists, B's bind source is that - path. Returned `rw` — A's upperdir is writable, and the GHA `_temp` - case requires the sibling to read the next step's script written - *after* `docker create`, so the directory mount must stay live. -3. **Lowerdir match.** If `/X` only exists in an image layer, B's bind - source is that lowerdir path but the mount is forced `ro`. The - lowerdir is shared with every other container using the same base - image; a rw mount on top of it would corrupt the cache for unrelated - jobs. -4. **No match → error.** Surfaced as HTTP 400 from - `handleContainerCreate`. The pre-fix behavior was to silently drop; - the new behavior fails loudly so the user sees a clear "bind mount - /X -> /Y rejected" instead of a downstream "cannot open". - -`filepath.Clean`/`path.Clean` normalizes `..` before the join, so a + Longest-prefix wins so a child mount (`/etc/hosts`) is preferred + over a parent (`/etc`). +2. **`/proc//root/X`.** When the runner's task PID is + registered, sources resolve through procfs's magic root link, which + the kernel walks in the runner's mount namespace. This is the + runner's *merged overlay view* — the same filesystem A sees from + inside. Returned `rw`; writes copy-up into A's upperdir, which is + A's own writable layer (no cross-job leak, no image-cache + corruption). +3. **Upperdir match (fallback).** If no PID is registered (test path), + walk A's snapshot upperdir directly. Returned `rw`. +4. **Lowerdir match (fallback).** Walk A's snapshot lowerdirs. Returned + `ro` — sibling writes through the bind would land on a shared image + layer. +5. **No match → error.** Surfaced as HTTP 400 from + `handleContainerCreate`. Pre-fix behavior was to silently drop; the + new behavior fails loudly so the user sees "bind mount /X -> /Y + rejected" instead of a downstream "cannot open". + +**Why PID resolution beats per-layer walk.** The first cut of this fix +walked the snapshot's upperdir then each lowerdir in order and returned +the first match. That broke for paths whose contents span multiple +image layers: in the GHA `actions/runner` image, layer 4 creates the +empty directory entry `/home/runner/externals/`, layer 22 adds +`node20/bin/node` deep inside it. The per-layer walk picked layer 4 +(first match — dir exists), bound an empty tree, and +`actions/checkout` failed downstream with +`exec: "/__e/node20/bin/node": no such file`. Resolving through +`/proc//root` delegates the merge to the kernel and always +produces the same view A has. + +`path.Clean` normalizes `..` before the join, so a malicious `/home/runner/../../etc/shadow` resolves to `/etc/shadow` and either falls into A's rootfs (which means the sibling sees A's own `/etc/shadow` — exactly what A could already see) or fails to resolve at diff --git a/pkg/dind/bindtranslate.go b/pkg/dind/bindtranslate.go index 4c7f25d..37f590f 100644 --- a/pkg/dind/bindtranslate.go +++ b/pkg/dind/bindtranslate.go @@ -30,26 +30,28 @@ type bindResolution struct { // non-rootfs mounts ephemerd installed into the runner (/var/run/docker.sock, // /etc/hosts, /etc/resolv.conf, the embedded runner directory, etc.). // -// upperdir / lowerdirs come from the runner snapshot's overlayfs mount -// options. upperdir is the mutable layer where the runner's writes land; -// lowerdirs are the shared, read-only image layers. +// runnerTaskPID is the runner container's main process PID on the host. When +// non-zero, rootfs sources resolve through /proc//root, which is the +// kernel's view of the runner's merged overlay. Without this, the previous +// per-layer walk could pick the first lowerdir that has `/X` as a directory +// entry while the actual contents live in deeper layers — that's how +// `/__e/node20/bin/node` failed: layer 4 had `home/runner/externals/` as +// an empty dir, layer 22 had `node20/bin/node`, the per-layer walk picked +// layer 4 and bound an empty tree. +// +// upperdir / lowerdirs are the explicit layer paths for the test path — +// real production calls always pass runnerTaskPID > 0. // // Resolution order: -// 1. Longest-prefix match against runnerBinds — /var/run/docker.sock and -// anything under a known bind destination is translated via the bind -// table, never re-resolved against the rootfs. -// 2. upperdir match → returned rw. This is the common GHA `container:` -// case: the runner writes /home/runner/_work/_temp/.sh which -// lives in the runner's upperdir, and the sibling needs to read it -// (and the next step's wrapper script likewise needs to land back -// in the same _temp directory, so the mount has to stay rw). -// 3. lowerdir match → returned ro. Image-layer files (e.g. -// /home/runner/externals) are shared across every container using the -// same base image, so a rw mount on top of one would corrupt the -// cache. -// 4. No match → error. The pre-fix shim silently dropped these, which -// surfaced downstream as "cannot open /__w/_temp/.sh". -func translateBindSource(src string, runnerBinds map[string]string, upperdir string, lowerdirs []string) (bindResolution, error) { +// 1. Longest-prefix match against runnerBinds. +// 2. /proc//root/ when PID > 0 — the merged overlay, +// i.e. the same filesystem view the runner sees. Returned rw; writes +// copy-up into the runner's upperdir, which is the runner's own +// writable layer (no cross-job leak, no image-cache corruption). +// 3. Upperdir match (fallback for tests where PID == 0). +// 4. Lowerdir match (fallback for tests; forced ro). +// 5. No match → error. Loud failure replaces the pre-fix silent drop. +func translateBindSource(src string, runnerBinds map[string]string, runnerTaskPID uint32, upperdir string, lowerdirs []string) (bindResolution, error) { // Sources are POSIX paths from the runner's Linux mount namespace; // use path (not filepath) so this evaluates consistently on Windows // build hosts during testing. Host-side joins below use filepath @@ -63,6 +65,14 @@ func translateBindSource(src string, runnerBinds map[string]string, upperdir str return bindResolution{HostPath: path.Join(host, suffix)}, nil } + if runnerTaskPID > 0 { + procPath := fmt.Sprintf("/proc/%d/root%s", runnerTaskPID, cleaned) + if _, err := os.Stat(procPath); err == nil { + return bindResolution{HostPath: procPath}, nil + } + return bindResolution{}, fmt.Errorf("bind source %q is not visible in the runner's mount namespace (/proc/%d/root%s does not exist)", src, runnerTaskPID, cleaned) + } + if upperdir != "" { candidate := path.Join(upperdir, cleaned) if _, err := os.Stat(candidate); err == nil { diff --git a/pkg/dind/bindtranslate_e2e_test.go b/pkg/dind/bindtranslate_e2e_test.go index f88ae3d..19297f9 100644 --- a/pkg/dind/bindtranslate_e2e_test.go +++ b/pkg/dind/bindtranslate_e2e_test.go @@ -162,7 +162,12 @@ func TestBindTranslation_RealContainerd(t *testing.T) { "/etc/hosts": filepath.Join(dataDir, "hosts", "fake.hosts"), "/etc/resolv.conf": filepath.Join(dataDir, "dns", "fake.conf"), } - s.SetRunnerRootfs(snapshotKey, bindMappings) + // PID 0 keeps the e2e on the snapshot-layer fallback so it + // exercises the upperdir/lowerdir walk against a real overlayfs + // snapshot. Production calls pass task.Pid() and resolve via + // /proc//root — covered by the in-VM workflow run, not by + // this hermetic e2e. + s.SetRunnerRootfs(snapshotKey, 0, bindMappings) // Drive buildBindMounts with the exact bind set the upstream GHA // runner emits for `container:` workflows (verbatim from the ephpm @@ -296,7 +301,7 @@ func TestBindTranslation_RejectsForeignSource(t *testing.T) { if err != nil { t.Fatalf("dind New: %v", err) } - s.SetRunnerRootfs(snapshotKey, nil) + s.SetRunnerRootfs(snapshotKey, 0, nil) _, err = s.buildBindMounts(ctx, []string{"/etc/shadow:/x"}) if err == nil { diff --git a/pkg/dind/bindtranslate_test.go b/pkg/dind/bindtranslate_test.go index 08ca2c3..39f522f 100644 --- a/pkg/dind/bindtranslate_test.go +++ b/pkg/dind/bindtranslate_test.go @@ -7,6 +7,8 @@ import ( "os" "path" "path/filepath" + "runtime" + "strconv" "strings" "sync" "testing" @@ -15,6 +17,7 @@ import ( ocispec "github.com/opencontainers/runtime-spec/specs-go" ) + // applyOpts invokes a list of oci.SpecOpts against an empty spec so tests // can assert what they produced. withBindMount and friends don't touch the // oci.Client / containers.Container args, so nil values are fine. @@ -47,7 +50,7 @@ func TestTranslateBindSource_UpperdirMatch_ReturnsReadWrite(t *testing.T) { t.Fatalf("planting upperdir: %v", err) } - got, err := translateBindSource("/home/runner/_work/_temp", nil, upper, nil) + got, err := translateBindSource("/home/runner/_work/_temp", nil, 0, upper, nil) if err != nil { t.Fatalf("translate: %v", err) } @@ -71,7 +74,7 @@ func TestTranslateBindSource_LowerdirMatch_ForcesReadOnly(t *testing.T) { t.Fatalf("planting lowerdir: %v", err) } - got, err := translateBindSource("/home/runner/externals", nil, t.TempDir(), []string{lower}) + got, err := translateBindSource("/home/runner/externals", nil, 0, t.TempDir(), []string{lower}) if err != nil { t.Fatalf("translate: %v", err) } @@ -93,7 +96,7 @@ func TestTranslateBindSource_RunnerBind_Translates(t *testing.T) { binds := map[string]string{ "/var/run/docker.sock": "/run/ephemerd/jobs/abc/docker/d.sock", } - got, err := translateBindSource("/var/run/docker.sock", binds, "", nil) + got, err := translateBindSource("/var/run/docker.sock", binds, 0, "", nil) if err != nil { t.Fatalf("translate: %v", err) } @@ -111,7 +114,7 @@ func TestTranslateBindSource_RunnerBind_Translates(t *testing.T) { // source. func TestTranslateBindSource_RunnerBindSubpath_Translates(t *testing.T) { binds := map[string]string{"/workspace": "/srv/ephemerd/scratch"} - got, err := translateBindSource("/workspace/foo/bar", binds, "", nil) + got, err := translateBindSource("/workspace/foo/bar", binds, 0, "", nil) if err != nil { t.Fatalf("translate: %v", err) } @@ -129,7 +132,7 @@ func TestTranslateBindSource_LongestPrefixWins(t *testing.T) { "/etc": "/host/etc", "/etc/hosts": "/host/etc/hosts.runtime", } - got, err := translateBindSource("/etc/hosts", binds, "", nil) + got, err := translateBindSource("/etc/hosts", binds, 0, "", nil) if err != nil { t.Fatalf("translate: %v", err) } @@ -144,7 +147,7 @@ func TestTranslateBindSource_LongestPrefixWins(t *testing.T) { // dropping the mount and leaving the workflow to fail with a confusing // "cannot open" downstream. func TestTranslateBindSource_NotInRootfs_Rejects(t *testing.T) { - _, err := translateBindSource("/etc/shadow", nil, t.TempDir(), []string{t.TempDir()}) + _, err := translateBindSource("/etc/shadow", nil, 0, t.TempDir(), []string{t.TempDir()}) if err == nil { t.Fatal("expected error rejecting unknown bind source, got nil") } @@ -155,7 +158,7 @@ func TestTranslateBindSource_NotInRootfs_Rejects(t *testing.T) { // absolute path by the Docker CLI, so a relative path here is a bug // somewhere upstream. func TestTranslateBindSource_RelativePath_Rejects(t *testing.T) { - _, err := translateBindSource("relative/path", nil, t.TempDir(), nil) + _, err := translateBindSource("relative/path", nil, 0, t.TempDir(), nil) if err == nil { t.Fatal("expected error on non-absolute source, got nil") } @@ -177,7 +180,7 @@ func TestTranslateBindSource_DotDotTraversal_StaysInsideUpperdir(t *testing.T) { if err := os.WriteFile(filepath.Join(upper, "home", "etc", "shadow"), []byte("fake"), 0o644); err != nil { t.Fatal(err) } - got, err := translateBindSource("/home/runner/../etc/shadow", nil, upper, nil) + got, err := translateBindSource("/home/runner/../etc/shadow", nil, 0, upper, nil) if err != nil { t.Fatalf("translate: %v", err) } @@ -189,7 +192,7 @@ func TestTranslateBindSource_DotDotTraversal_StaysInsideUpperdir(t *testing.T) { // A path that climbs above /: path.Clean(/../../etc/shadow) = /etc/shadow. // Resolution is bounded — even a malicious `..` chain can't escape /. // Since we never planted /etc/shadow in upperdir, this should reject. - if _, err := translateBindSource("/../../../etc/shadow", nil, upper, nil); err == nil { + if _, err := translateBindSource("/../../../etc/shadow", nil, 0, upper, nil); err == nil { t.Error("expected rejection of climb-above-root traversal, got success") } } @@ -216,7 +219,7 @@ func TestTranslateBindSource_PreferUpperOverLower(t *testing.T) { t.Fatal(err) } - got, err := translateBindSource("/"+filepath.ToSlash(rel), nil, upper, []string{lower}) + got, err := translateBindSource("/"+filepath.ToSlash(rel), nil, 0, upper, []string{lower}) if err != nil { t.Fatalf("translate: %v", err) } @@ -229,6 +232,67 @@ func TestTranslateBindSource_PreferUpperOverLower(t *testing.T) { } } +// TestTranslateBindSource_TaskPIDResolvesViaProcRoot is the regression +// test for the /__e/node20 failure mode. When runnerTaskPID > 0, sources +// resolve through /proc//root — the kernel's merged overlay view of +// the runner's filesystem. This bypasses the per-layer walk that picked +// the first lowerdir holding a path entry, even when the actual contents +// lived in deeper layers. +// +// We test using the current process's PID. /proc/self/root is whatever +// the test binary sees as its root (/), so a file we put in t.TempDir() +// is reachable via the proc path. Linux-only — /proc semantics aren't +// available elsewhere. +func TestTranslateBindSource_TaskPIDResolvesViaProcRoot(t *testing.T) { + if runtime.GOOS != "linux" { + t.Skipf("/proc//root resolution is Linux-only; goos=%s", runtime.GOOS) + } + scratch := t.TempDir() + const marker = "marker.sh" + markerPath := filepath.Join(scratch, marker) + if err := os.WriteFile(markerPath, []byte("#!/bin/sh\n"), 0o755); err != nil { + t.Fatal(err) + } + + pid := uint32(os.Getpid()) + got, err := translateBindSource(filepath.ToSlash(markerPath), nil, pid, "", nil) + if err != nil { + t.Fatalf("translate via /proc/self/root: %v", err) + } + wantPrefix := "/proc/" + strconv.Itoa(int(pid)) + "/root" + if !strings.HasPrefix(got.HostPath, wantPrefix) { + t.Errorf("HostPath = %q, want it to begin with %q (resolution must go through /proc//root)", got.HostPath, wantPrefix) + } + // Round-trip: reading through the proc-prefixed path must return the + // same bytes we planted in the underlying tempdir. + body, err := os.ReadFile(got.HostPath) + if err != nil { + t.Fatalf("read via translated path: %v", err) + } + if string(body) != "#!/bin/sh\n" { + t.Errorf("round-trip body = %q, want planted contents", string(body)) + } +} + +// TestTranslateBindSource_TaskPIDRejectsMissingSource is the loud-failure +// guard for the proc-path. When PID > 0 and the source doesn't exist in +// the runner's namespace, translation must error out — not fall through +// to the layer-walk, which would mask the real "runner can't see this +// either" situation. +func TestTranslateBindSource_TaskPIDRejectsMissingSource(t *testing.T) { + if runtime.GOOS != "linux" { + t.Skipf("/proc//root resolution is Linux-only; goos=%s", runtime.GOOS) + } + pid := uint32(os.Getpid()) + _, err := translateBindSource("/this/path/does/not/exist/anywhere", nil, pid, "", nil) + if err == nil { + t.Fatal("expected rejection of missing source under PID path, got nil") + } + if !strings.Contains(err.Error(), "/proc/") { + t.Errorf("error should mention the proc path it tried: %v", err) + } +} + // TestBuildBindMounts_GHARunnerContainer is the canonical regression test // for the ephpm `container:` failure. The GHA runner inside a job // container asks dind to create a sibling with the exact bind set the diff --git a/pkg/dind/containers.go b/pkg/dind/containers.go index d817701..12a1172 100644 --- a/pkg/dind/containers.go +++ b/pkg/dind/containers.go @@ -1359,6 +1359,7 @@ func (s *Server) buildBindMounts(ctx context.Context, binds []string) ([]oci.Spe } s.mu.Lock() runnerBinds := s.runnerBindMappings + runnerPID := s.runnerTaskPID s.mu.Unlock() out := make([]oci.SpecOpts, 0, len(binds)) @@ -1370,7 +1371,7 @@ func (s *Server) buildBindMounts(ctx context.Context, binds []string) ([]oci.Spe src, dst := parts[0], parts[1] requestedRO := len(parts) == 3 && parts[2] == "ro" - resolved, terr := translateBindSource(src, runnerBinds, upperdir, lowerdirs) + resolved, terr := translateBindSource(src, runnerBinds, runnerPID, upperdir, lowerdirs) if terr != nil { return nil, fmt.Errorf("bind mount %s -> %s rejected: %w", src, dst, terr) } diff --git a/pkg/dind/dind.go b/pkg/dind/dind.go index 1d731e6..e4edc3b 100644 --- a/pkg/dind/dind.go +++ b/pkg/dind/dind.go @@ -71,6 +71,15 @@ type Server struct { // the rootfs walk in translateBindSource. runnerBindMappings map[string]string + // runnerTaskPID is the host-PID of the runner container's init + // process. translateBindSource uses it to address the runner's + // merged overlay view through /proc//root, which is the only + // way to expose files whose contents span multiple image layers + // (e.g. /home/runner/externals/node20/bin/node, where the dir entry + // and the file live in different lowerdirs). Zero until + // SetRunnerRootfs is called. + runnerTaskPID uint32 + log *slog.Logger mu sync.Mutex @@ -197,26 +206,34 @@ func (s *Server) SetRunnerNetNS(netnsPath string) { s.runnerNetNS = netnsPath } -// SetRunnerRootfs registers the runner container's snapshot and the -// non-rootfs bind table ephemerd installed into it, so that subsequent +// SetRunnerRootfs registers the runner container's snapshot, task PID, and +// the non-rootfs bind table ephemerd installed into it, so that subsequent // docker create requests from inside the runner can have their -v sources // translated from the runner's mount namespace to real host paths. // // snapshotKey is the containerd snapshot name (typically -// "-snapshot" in the runtime's "ephemerd" namespace). +// "-snapshot" in the runtime's "ephemerd" namespace). Kept for +// the (now-fallback) layer walk used by the unit tests; production +// resolution goes through taskPID. +// +// taskPID is the host-side PID of the runner's init process. Required for +// the /proc//root resolution path that gives the runner's merged +// overlay view — without it, sibling binds for paths split across image +// layers (e.g. /home/runner/externals) bind incomplete trees. // // bindMappings keys are container destination paths (what the runner sees, // e.g. "/var/run/docker.sock"); values are host source paths (what the dind -// daemon hands to containerd, e.g. "/jobs//docker/d.sock"). -// The map is copied so the caller may continue to mutate it. +// daemon hands to containerd). The map is copied so the caller may continue +// to mutate it. // -// Must be called after the runner container is created (so the snapshot -// exists) and before any docker create from inside the runner. The runtime -// pairs this call with SetRunnerNetNS at the same point in startup. -func (s *Server) SetRunnerRootfs(snapshotKey string, bindMappings map[string]string) { +// Must be called after the runner task starts (PID is known) and before any +// docker create from inside the runner. Pairs with SetRunnerNetNS in the +// runtime, called at the same point in startup. +func (s *Server) SetRunnerRootfs(snapshotKey string, taskPID uint32, bindMappings map[string]string) { s.mu.Lock() defer s.mu.Unlock() s.runnerSnapshotKey = snapshotKey + s.runnerTaskPID = taskPID if len(bindMappings) == 0 { s.runnerBindMappings = nil return diff --git a/pkg/runtime/runtime.go b/pkg/runtime/runtime.go index 0c1b96a..7baf434 100644 --- a/pkg/runtime/runtime.go +++ b/pkg/runtime/runtime.go @@ -805,34 +805,6 @@ func (r *Runtime) Create(ctx context.Context, cfg CreateConfig) (*RunnerEnv, err return nil, fmt.Errorf("creating container %s: %w", id, err) } - // Register the runner snapshot + the non-rootfs bind table with the - // dind server so it can translate sibling `-v` sources from the - // runner's mount namespace to real host paths. Without this, the GHA - // runner's `container:` flow asks dind for binds like - // `/home/runner/_work/_temp` that don't exist on the dind daemon's - // filesystem — the shim used to silently drop them and the resulting - // `docker exec sh -e /__w/_temp/.sh` failed with "cannot open". - // - // The GOOS guard only skips the Windows-native job path (Hyper-V - // isolated Windows containers with the "windowsfilter" snapshotter — - // no overlay upperdir to walk, and bind semantics differ enough that - // translation needs a separate design). Linux jobs on Windows hosts - // reach this code via the in-VM ephemerd process running as Linux, - // so they take the registration branch normally. - if dindServer != nil && goruntime.GOOS != "windows" { - bindMappings := map[string]string{} - if dindServer.SocketPath() != "" { - bindMappings["/var/run/docker.sock"] = dindServer.SocketPath() - } - hostDataDir := filepath.Dir(r.cfg.LogDir) - bindMappings["/etc/hosts"] = filepath.Join(hostDataDir, "hosts", id+".hosts") - bindMappings["/etc/resolv.conf"] = filepath.Join(hostDataDir, "dns", id+".conf") - if jobRunnerDir != "" && r.cfg.RunnerMount != "" { - bindMappings[r.cfg.RunnerMount] = jobRunnerDir - } - dindServer.SetRunnerRootfs(snapshotName, bindMappings) - } - // Create and start the task with per-job log capture. // On Windows, cio.LogFile uses file:// URIs which runhcs rejects // (it only accepts binary:// scheme), and cio.WithStdio fails with @@ -876,6 +848,24 @@ func (r *Runtime) Create(ctx context.Context, cfg CreateConfig) (*RunnerEnv, err // here when a sibling container is created with PortBindings. if dindServer != nil { dindServer.SetRunnerNetNS(netns) + // Register the runner snapshot + task PID + non-rootfs bind + // table so dind can translate sibling `-v` sources from the + // runner's mount namespace to real host paths. The task PID + // drives /proc//root resolution — without it, paths + // whose contents span multiple image layers (e.g. + // /home/runner/externals/node20/bin/node) bind incomplete + // trees and downstream exec fails with "no such file". + bindMappings := map[string]string{} + if dindServer.SocketPath() != "" { + bindMappings["/var/run/docker.sock"] = dindServer.SocketPath() + } + hostDataDir := filepath.Dir(r.cfg.LogDir) + bindMappings["/etc/hosts"] = filepath.Join(hostDataDir, "hosts", id+".hosts") + bindMappings["/etc/resolv.conf"] = filepath.Join(hostDataDir, "dns", id+".conf") + if jobRunnerDir != "" && r.cfg.RunnerMount != "" { + bindMappings[r.cfg.RunnerMount] = jobRunnerDir + } + dindServer.SetRunnerRootfs(snapshotName, pid, bindMappings) } if _, err := r.cfg.Network.Setup(ctx, id, netns); err != nil { stopDind()