From c9d71d52f8919f77da6ffe32bd97f8b2d9e82c86 Mon Sep 17 00:00:00 2001 From: Juliusz Sosinowicz Date: Thu, 25 Jun 2026 09:35:13 +0000 Subject: [PATCH 1/2] parallel-make-check.py: add generic pool extensions for arbitrary commands Let any command ride the build/check pool, not just wolfSSL builds: build false skips configure/make/check (config is just prepare+run) netns true runs each command under 'bwrap --unshare-net --cap-add CAP_NET_ADMIN' (its own network namespace) so parallel network tests can't collide on ports and can configure that namespace shards fan a config out into N instances, each with $SHARD (1..N) and $SHARDS=N in its env and its own build-<name>-<k> dir, so a command can split its work N ways (the pool load-balances them) Error out, rather than silently degrade, on two misconfigurations that otherwise surface as confusing test failures: netns requested but bwrap missing (commands would share the host namespace and collide on ports), and config-name collisions after shard fan-out (two jobs would share a build dir and race). --- .github/scripts/parallel-make-check.py | 124 +++++++++++++++++++++---- 1 file changed, 107 insertions(+), 17 deletions(-) diff --git a/.github/scripts/parallel-make-check.py b/.github/scripts/parallel-make-check.py index 4e8febcf318..039b01648d1 100755 --- a/.github/scripts/parallel-make-check.py +++ b/.github/scripts/parallel-make-check.py @@ -32,6 +32,20 @@ # checks, e.g. [["wolfcrypt/test/testwolfcrypt"]] # comment ignored; JSON has no comment syntax, so notes go here # +# The pool is not wolfSSL-specific; these keys let any command ride it: +# +# build false skips configure/make/check, so the config is just its +# prepare+run commands (default true). Use it to run an +# arbitrary command across the pool. +# netns true runs each command under "bwrap --unshare-net" (its own +# network namespace), so parallel network tests can't collide +# on ports (default false; needs bubblewrap). 
+# shards fan the config out into N instances run as separate jobs, +# each with $SHARD (1..N) and $SHARDS=N in its environment and +# its own build-<name>-<k> dir, so a command can split work +# N ways (default 1). The pool (--threads) still bounds how +# many run at once, so N>threads load-balances dynamically. +# # For example: # # [ @@ -71,7 +85,7 @@ import time from collections.abc import Callable from concurrent.futures import ThreadPoolExecutor -from dataclasses import dataclass, field +from dataclasses import dataclass, field, replace from pathlib import Path from typing import NoReturn @@ -94,9 +108,25 @@ class Config: # Whether "minutes" was given in the JSON (vs the 1.0 default); only an # explicit estimate is checked for >50% drift against the real time. minutes_provided: bool = False + # Generic-command extensions. Defaults keep a config behaving as a + # wolfSSL build. With build=false a config is just its prepare+run + # commands (no configure/make/check), so any command can ride the pool. + build: bool = True + # netns=true runs each command under "bwrap --unshare-net" so parallel + # network tests can't collide on ports (same isolation as the .test scripts). + netns: bool = False + # shards>1 fans the config out into that many instances, each run with + # $SHARD (1..N) and $SHARDS=N in its environment so the command can pick + # its slice of the work; each instance gets its own build-<name>-<k> dir. + shards: int = 1 + # Extra environment for the commands (set by the shard fan-out). + env: dict[str, str] = field(default_factory=dict) SRCDIR = Path(__file__).resolve().parents[2] ON_GITHUB = os.environ.get("GITHUB_ACTIONS") == "true" +# Used by configs with "netns": true to give each command its own network +# namespace (so parallel network tests can't collide on ports). 
+BWRAP = shutil.which("bwrap") print_lock = threading.Lock() # Fail-fast state: the first failure sets stop_event (under fail_lock, so @@ -162,7 +192,8 @@ def load_configs(opts: argparse.Namespace, error(f"{opts.json}: config entries must be objects: {entry!r}") unknown = set(entry) - {"name", "configure", "cc", "cflags", "ldflags", "minutes", "user_settings", - "check", "prepare", "run", "comment"} + "check", "prepare", "run", "comment", + "build", "netns", "shards"} if unknown: error(f"{opts.json}: unknown key(s) in {entry.get('name', entry)!r}: " f"{' '.join(sorted(unknown))}") @@ -198,6 +229,12 @@ def load_configs(opts: argparse.Namespace, check = entry.get("check", True) if not isinstance(check, bool): error(f"{opts.json}: \"check\" must be a boolean in {name!r}") + for key in ("build", "netns"): + if not isinstance(entry.get(key, False), bool): + error(f"{opts.json}: \"{key}\" must be a boolean in {name!r}") + shards = entry.get("shards", 1) + if isinstance(shards, bool) or not isinstance(shards, int) or shards < 1: + error(f"{opts.json}: \"shards\" must be an integer >= 1 in {name!r}") cc = entry.get("cc", opts.cc or "") if not isinstance(cc, str): error(f"{opts.json}: \"cc\" must be a string in {name!r}") @@ -215,7 +252,10 @@ def load_configs(opts: argparse.Namespace, float(minutes), user_settings, check, list(entry.get("prepare", [])), list(entry.get("run", [])), - minutes_provided="minutes" in entry)) + minutes_provided="minutes" in entry, + build=entry.get("build", True), + netns=entry.get("netns", False), + shards=shards)) if not configs: error(f"{opts.json}: no configs") return configs @@ -323,16 +363,23 @@ def run_config(cfg: Config, opts: argparse.Namespace) -> tuple[str | None, lambda: shutil.copy(SRCDIR / cfg.user_settings, bdir / "user_settings.h"))) steps += [(" ".join(cmd), cmd) for cmd in cfg.prepare] - steps += [("configure", configure), ("make", make)] - if cfg.check: - steps += [ - # Prebuild the check programs without running any tests so - 
# "make check" below is pure test execution. - ("make check TESTS=", make + ["check", "TESTS="]), - ("private dirs", lambda: privatize_dirs(bdir, opts.private_dir)), - ("make check", ["make"] + flags + ["check"]), - ] + if cfg.build: + steps += [("configure", configure), ("make", make)] + if cfg.check: + steps += [ + # Prebuild the check programs without running any tests so + # "make check" below is pure test execution. + ("make check TESTS=", make + ["check", "TESTS="]), + ("private dirs", lambda: privatize_dirs(bdir, opts.private_dir)), + ("make check", ["make"] + flags + ["check"]), + ] steps += [(" ".join(cmd), cmd) for cmd in cfg.run] + # With "netns", each command runs in its own network namespace; --chdir + # keeps the build dir as cwd inside the sandbox. CAP_NET_ADMIN lets the + # command configure that netns (bring interfaces up, add addresses). + netns = ([BWRAP, "--unshare-net", "--cap-add", "CAP_NET_ADMIN", + "--dev-bind", "/", "/", "--chdir", str(bdir)] + if cfg.netns and BWRAP else []) failed: str | None = None start = time.monotonic() log = bdir / "make-check.log" @@ -363,12 +410,14 @@ def record_failure(step: str) -> str: failed = record_failure(step) break continue + cmd = netns + cmd print(f"+ {' '.join(cmd)}", file=logf, flush=True) # stdin=DEVNULL so a test that reads stdin sees EOF (as in CI) # instead of blocking forever on an interactive/socket stdin. proc = subprocess.Popen(cmd, cwd=bdir, stdout=logf, stderr=subprocess.STDOUT, stdin=subprocess.DEVNULL, + env={**os.environ, **cfg.env}, start_new_session=True) with procs_lock: live_procs.add(proc) @@ -438,14 +487,21 @@ def summarize(results: list[tuple[Config, str | None, float]], # (serial configure/link/test phases show up here). busy_min = sum(minutes for _, _, minutes in results) ncpu = nproc() + thread_min = wall_min * nthreads + cpu_avail = wall_min * ncpu + # Guard the ratios against a zero wall time (e.g. 
every job a no-op, which + # can happen when there are more shards than work) so the line never + # divides by zero. + occupancy = 100 * busy_min / thread_min if thread_min else 0 + cpu_util = 100 * cpu_min / cpu_avail if cpu_avail else 0 lines += [ "", f"{len(results)} configs in {wall_min:.1f} min on {nthreads} " f"threads / {ncpu} CPUs: " - f"thread occupancy {100 * busy_min / (wall_min * nthreads):.0f}% " - f"({busy_min:.1f} of {wall_min * nthreads:.1f} thread-min), " - f"CPU utilization {100 * cpu_min / (wall_min * ncpu):.0f}% " - f"({cpu_min:.1f} of {wall_min * ncpu:.1f} CPU-min)", + f"thread occupancy {occupancy:.0f}% " + f"({busy_min:.1f} of {thread_min:.1f} thread-min), " + f"CPU utilization {cpu_util:.0f}% " + f"({cpu_min:.1f} of {cpu_avail:.1f} CPU-min)", ] table = "\n".join(lines) print(table) @@ -455,6 +511,18 @@ def summarize(results: list[tuple[Config, str | None, float]], print(f"### make check\n\n{table}", file=f) +def shard_instances(cfg: Config) -> list[Config]: + # A config that asks for shards>1 becomes that many independent jobs: each + # gets its index as $SHARD (1..N) / $SHARDS=N and its own build-<name>-<k> + # dir, so its command can run one slice of the work. A config with the + # default shards=1 is left as a single unchanged job. + if cfg.shards <= 1: + return [cfg] + return [replace(cfg, name=f"{cfg.name}-{k}", shards=1, + env={**cfg.env, "SHARD": str(k), "SHARDS": str(cfg.shards)}) + for k in range(1, cfg.shards + 1)] + + def main() -> int: p = argparse.ArgumentParser( description="Build and make check every configuration from a JSON " @@ -537,6 +605,28 @@ def main() -> int: loads[i] += cfg.minutes selected = shards[k - 1] + # Replace each config with its shard instances (a no-op for shards=1), + # then re-sort so the pool still takes the longest jobs first. Done after + # --shard so a CI-level split and in-job fan-out compose. 
+ expanded = [] + for cfg in selected: + expanded.extend(shard_instances(cfg)) + expanded.sort(key=lambda cfg: -cfg.minutes) + selected = expanded + + # A fanned-out name (<name>-<k>) could collide with another config's name, + # which would make two jobs share a build-<name> dir and race. Catch it, + # like the duplicate-name check in load_configs. + names = [cfg.name for cfg in selected] + dups = sorted({n for n in names if names.count(n) > 1}) + if dups: + p.error(f"config names collide after shard fan-out: {' '.join(dups)}") + + if any(cfg.netns for cfg in selected) and not BWRAP: + p.error("netns requested but bwrap not found; install bubblewrap " + "(without it the commands share the host network namespace " + "and collide on ports)") + if opts.list: for cfg in selected: print(f"{cfg.name} [{cfg.minutes:g} min]: " @@ -546,7 +636,7 @@ def main() -> int: print(f"shard {opts.shard}: no configs to run") return 0 - if not (SRCDIR / "configure").exists(): + if any(cfg.build for cfg in selected) and not (SRCDIR / "configure").exists(): subprocess.run(["./autogen.sh"], cwd=SRCDIR, check=True) nthreads = max(1, min(opts.threads, len(selected))) From f2fa741badc89b6032f226d0e61778498ba87a80 Mon Sep 17 00:00:00 2001 From: Juliusz Sosinowicz Date: Thu, 25 Jun 2026 09:35:13 +0000 Subject: [PATCH 2/2] socat CI: run the test suite as parallel netns shards The socat suite is sleep-bound and slow run serially. Drive it through parallel-make-check.py as ~6 shards per CPU, 2 running per CPU at once: each shard runs a round-robin slice of the tests in its own bwrap network namespace (so parallel shards don't collide on ports) and its own build-dir copy. The work is almost all waiting, so the oversubscription just overlaps the waits. Install bubblewrap so the netns isolation actually happens (without it the runner silently shares one namespace and the shards collide). 
Each fresh netns is IPv4-loopback only, so re-create IPv6 loopback (CAP_NET_ADMIN) for the ::1 / dual-stack tests, and add non-loopback placeholders (fc00::1, 192.0.2.1) so glibc's AI_ADDRCONFIG still returns both families - without them socat's getaddrinfo fails on numeric non-loopback addresses, e.g. the multicast tests. Relax the AppArmor unprivileged-userns restriction so the bwrap netns + CAP_NET_ADMIN work on ubuntu-24.04. --- .github/workflows/socat.yml | 56 ++++++++++++++++++++++++++++++++----- 1 file changed, 49 insertions(+), 7 deletions(-) diff --git a/.github/workflows/socat.yml b/.github/workflows/socat.yml index 75b5431bcc3..6b285653d0b 100644 --- a/.github/workflows/socat.yml +++ b/.github/workflows/socat.yml @@ -39,10 +39,11 @@ jobs: socat_check: + name: socat ${{ matrix.socat_version }} if: ${{ (github.repository_owner == 'wolfssl') && (github.event_name != 'pull_request' || github.event.pull_request.draft == false) }} runs-on: ubuntu-24.04 - # This should be a safe limit for the tests to run. - timeout-minutes: 30 + # This should be a safe limit for the parallel tests to run. + timeout-minutes: 15 needs: build_wolfssl strategy: fail-fast: false @@ -56,13 +57,15 @@ jobs: - name: Checkout wolfSSL CI actions uses: actions/checkout@v5 with: - sparse-checkout: .github/actions + sparse-checkout: | + .github/actions + .github/scripts fetch-depth: 1 - name: Install prereqs uses: ./.github/actions/install-apt-deps with: - packages: build-essential autoconf libtool pkg-config clang libc++-dev + packages: build-essential autoconf libtool pkg-config clang libc++-dev bubblewrap ghcr-debs-tag: ubuntu-24.04-full - name: Download lib @@ -91,9 +94,48 @@ jobs: ./configure --with-wolfssl=$GITHUB_WORKSPACE/build-dir --enable-default-ipv=4 make -j + # Ubuntu 24.04 can restrict unprivileged user namespaces via AppArmor, + # which leaves CAP_NET_ADMIN ineffective inside bwrap's netns; the shards + # need it to re-create IPv6 loopback there. Relax the restriction. 
+ - name: Allow unprivileged user namespaces (for bwrap) + run: sudo sysctl -w kernel.apparmor_restrict_unprivileged_userns=0 || true + - name: Run socat tests - working-directory: ./socat-${{ matrix.socat_version }} + env: + SOCAT_SRC: ${{ github.workspace }}/socat-${{ matrix.socat_version }} + EXPECT_FAIL: ${{ matrix.expect_fail }} run: | export LD_LIBRARY_PATH=$GITHUB_WORKSPACE/build-dir/lib:$LD_LIBRARY_PATH - export SHELL=/bin/bash - SOCAT=$GITHUB_WORKSPACE/socat-${{ matrix.socat_version }}/socat ./test.sh -t 1.0 --expect-fail ${{ matrix.expect_fail }} + # The socat suite is sleep-bound, so run it as parallel shards via the + # shared parallel runner. The work is almost all waiting (timeouts and + # sleeps; only ~16% CPU even when packed), so oversubscribe: ~6 shards + # per CPU below, run 2 per CPU at once (--threads), so several overlap + # their waits (bigger runners get proportionally more). Each shard runs + # a round-robin slice of the tests ($SHARD/$SHARDS) in its own bwrap + # network namespace (no port collisions) and its own build-dir copy. + # ${tests:-0} keeps a shard that drew no test numbers a no-op (test 0 + # matches nothing) instead of letting test.sh fall back to running the + # whole suite. + # + # bwrap --unshare-net gives each shard a fresh netns with loopback up + # but IPv4-only; re-create IPv6 loopback (CAP_NET_ADMIN is granted by + # the runner) so the suite's ::1 / dual-stack tests work as in the host + # namespace. fc00::1 and 192.0.2.1 are non-loopback placeholders so + # glibc's AI_ADDRCONFIG still returns IPv6/IPv4: with only loopback + # configured it drops the family, and socat's getaddrinfo then fails on + # numeric non-loopback addresses (e.g. the multicast tests). Best-effort + # (|| true), errors left visible so a runner without IPv6 still runs the + # IPv4 tests and any failure stays diagnosable in the log. 
+ cat > socat-configs.json <<'EOF' + [{ + "name": "socat", "build": false, "netns": true, "shards": __SHARDS__, + "run": [["bash", "-c", "set -e; ip link set lo up || true; sysctl -wq net.ipv6.conf.lo.disable_ipv6=0 || true; ip addr add ::1/128 dev lo || true; ip addr add fc00::1/128 dev lo || true; ip addr add 192.0.2.1/32 dev lo || true; sysctl -wq net.ipv6.bindv6only=0 || true; cp -a \"$SOCAT_SRC/.\" .; tests=$(seq \"$SHARD\" \"$SHARDS\" 999); SOCAT=\"$PWD/socat\" SHELL=/bin/bash ./test.sh -t 1.0 --expect-fail \"$EXPECT_FAIL\" ${tests:-0}"]] + }] + EOF + sed -i "s/__SHARDS__/$(( 6 * $(nproc) ))/" socat-configs.json + # Run 2 shards per CPU at once: the per-shard netns isolates ports, so + # the only real cost of overlap is CPU, and the suite barely uses any + # (mostly waiting), so this just overlaps the waits. fail-fast (the + # default) aborts the rest on the first failure. + .github/scripts/parallel-make-check.py \ + --threads "$(( 2 * $(nproc) ))" socat-configs.json