diff --git a/CHANGELOG.md b/CHANGELOG.md index 7bbc85d0..2888b205 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -51,6 +51,7 @@ The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.1.0/). - **`ww shell` "AI agents:" startup hint.** The line `AI agents: ipfs cat /ipns/releases.wetware.run/.agents/prompt.md` is gone from `src/cli/shell.rs`. The hint pointed at a host-shell command (`ipfs cat`) that's awkward to surface from inside a Glia REPL (the user can't paste it), and the obvious Glia-form rewrite — `(perform fs :read-str "/ipns/…")` — fails today because the WASI fs interceptor (`crates/cell/src/fs_intercept.rs:481-520`) only recognizes `ipfs/<cid>/…` paths (`parse_ipfs_path` at line 72 strips the `ipfs/` prefix; there's no `ipns/` sibling). `/ipfs/<cid>/…` reads through the cap *do* work — `open_ipfs` lazily materializes content from the pinset cache — so a hint pointing at a stable CID would work today; what's missing is IPNS resolution at the intercept layer (or a sibling cap method that calls Kubo `name/resolve` first, then routes through the existing pinset path). Restoring a pasteable Glia-form hint is the natural reward for that follow-up. ### Fixed +- **CI IPFS release publishing now tolerates slow pod staging.** The publish helper uses a unique pod staging path, retries `kubectl cp`, keeps repo stats and pod cleanup best-effort, and logs each production publish phase so slow k3s API behavior is diagnosable without changing release pin retention semantics. - **WAGI HTTP requests now time out and kill hung cells (#535).** `HttpListener` bounds each spawned request's stdin/stdout/wait phase with a 30s wall-clock timeout, returns `504 Gateway Timeout` on expiry, and calls `Process.kill()` best-effort while preserving the existing oversized-response kill path. 
- **Epoch advances now update the live CidTree root before broadcasting (#536).** Daemon startup wires the runtime `CidTree` into `EpochService`, so epoch commit handling swaps the virtual filesystem root to the committed event CID before the delayed epoch notification is released. - **`/status` host introspection now degrades instead of hanging during cold start (#534).** Bounded best-effort timeouts around `host.id`, `host.addrs`, and `host.peers` let slow startup host RPCs render individual JSON fields as `null` instead of blocking the status response indefinitely. diff --git a/scripts/ipfs_publish_release.sh b/scripts/ipfs_publish_release.sh index 0cf26bf5..c396fcf6 100755 --- a/scripts/ipfs_publish_release.sh +++ b/scripts/ipfs_publish_release.sh @@ -7,10 +7,11 @@ set -euo pipefail : "${POD:?POD is required}" REMOTE_RELEASE_TREE="${REMOTE_RELEASE_TREE:-/tmp/ww-release-tree}" -POD_RELEASE_TREE="${POD_RELEASE_TREE:-/tmp/release-tree}" +POD_RELEASE_TREE="${POD_RELEASE_TREE:-/tmp/ww-release-tree-publish-$(date +%s)-$$}" STATE_FILE="${WW_RELEASE_PIN_STATE:-/data/ipfs/ww-release-pins.txt}" RETAIN="${WW_RELEASE_PIN_RETAIN:-10}" -KUBECTL_TIMEOUT="${KUBECTL_TIMEOUT:-5m}" +KUBECTL_TIMEOUT="${KUBECTL_TIMEOUT:-10m}" +KUBECTL_BEST_EFFORT_TIMEOUT="${KUBECTL_BEST_EFFORT_TIMEOUT:-45s}" case "$RETAIN" in ''|*[!0-9]*) @@ -27,27 +28,60 @@ k() { kubectl --request-timeout="$KUBECTL_TIMEOUT" "$@" } +best_effort_k() { + kubectl --request-timeout="$KUBECTL_BEST_EFFORT_TIMEOUT" "$@" +} + pod() { k exec "$POD" -- "$@" } +log() { + printf 'ipfs-publish: %s\n' "$*" >&2 +} + cleanup() { - pod rm -rf "$POD_RELEASE_TREE" >/dev/null 2>&1 || true + best_effort_k exec "$POD" -- rm -rf "$POD_RELEASE_TREE" >/dev/null 2>&1 || true } trap cleanup EXIT repo_stat_size() { - pod sh -c 'if command -v timeout >/dev/null 2>&1; then timeout 30 ipfs repo stat --size-only; else ipfs repo stat --size-only; fi' 2>/dev/null \ + best_effort_k exec "$POD" -- sh -c 'if command -v timeout >/dev/null 2>&1; then 
timeout 30 ipfs repo stat --size-only; else ipfs repo stat --size-only; fi' 2>/dev/null \ | tail -n 1 \ | tr -d '\r' \ || true } +copy_release_tree() { + local attempt backoff + + for attempt in 1 2 3; do + log "copying release tree into pod staging path $POD_RELEASE_TREE (attempt $attempt)" + if pod rm -rf "$POD_RELEASE_TREE" && k cp --retries=3 "$REMOTE_RELEASE_TREE" "$POD:$POD_RELEASE_TREE"; then + return 0 + fi + + if [ "$attempt" -lt 3 ]; then + backoff="$((attempt * 20))" + log "release tree copy failed; retrying in ${backoff}s" + sleep "$backoff" + fi + done + + return 1 +} + +if [ ! -d "$REMOTE_RELEASE_TREE" ]; then + echo "ERROR: release tree is missing on VPS: $REMOTE_RELEASE_TREE" >&2 + exit 1 +fi + +log "collecting repo stat before publish (best effort)" repo_stat_before="$(repo_stat_size)" -pod rm -rf "$POD_RELEASE_TREE" -k cp "$REMOTE_RELEASE_TREE" "$POD:$POD_RELEASE_TREE" +copy_release_tree +log "adding release tree to IPFS with implicit pinning disabled" CID="$(pod ipfs add --pin=false -rQ --cid-version=1 "$POD_RELEASE_TREE" | tail -n 1 | tr -d '\r')" if [ -z "$CID" ]; then echo "ERROR: ipfs add produced an empty CID" >&2 @@ -55,13 +89,17 @@ if [ -z "$CID" ]; then fi echo "CID=$CID" +log "pinning release CID $CID" pod ipfs pin add "$CID" +log "publishing IPNS ww-release to $CID" pod ipfs name publish --key=ww-release "/ipfs/$CID" +log "announcing release CID to the DHT (best effort)" if ! 
pod sh -c "if command -v timeout >/dev/null 2>&1; then timeout 60 ipfs routing provide -r '$CID'; else ipfs routing provide -r '$CID'; fi"; then echo "WARNING: provide announce timed out or failed; DHT propagation may lag" >&2 fi +log "updating managed release pin state" state_output="$( k exec "$POD" -- sh -s -- "$CID" "$RETAIN" "$STATE_FILE" <<'POD_STATE_SH' set -eu @@ -152,11 +190,13 @@ printf '%s\n' "$state_output" unpinned_count="$(printf '%s\n' "$state_output" | awk -F= '$1 == "UNPINNED_COUNT" { value=$2 } END { print value + 0 }')" if [ "$unpinned_count" -gt 0 ]; then + log "running IPFS repo GC after managed stale release unpins" if ! pod sh -c 'if command -v timeout >/dev/null 2>&1; then timeout 120 ipfs repo gc; else ipfs repo gc; fi'; then echo "WARNING: ipfs repo gc timed out or failed after stale release unpins" >&2 fi fi +log "collecting repo stat after cleanup (best effort)" repo_stat_after="$(repo_stat_size)" rm -rf "$REMOTE_RELEASE_TREE" diff --git a/tests/test_ipfs_release_publish.sh b/tests/test_ipfs_release_publish.sh index 80e855af..7ab8d8f9 100755 --- a/tests/test_ipfs_release_publish.sh +++ b/tests/test_ipfs_release_publish.sh @@ -32,6 +32,11 @@ grep -Fq "[ ! -f \"\$state_file\" ]" "$PUBLISH_SCRIPT" \ || fail "release script must handle first run without bulk cleanup" grep -Fq 'ipfs repo gc' "$PUBLISH_SCRIPT" \ || fail "release script must run repo gc after stale unpins" +# shellcheck disable=SC2016 +grep -Fq 'POD_RELEASE_TREE:-/tmp/ww-release-tree-publish-$(date +%s)-$$' "$PUBLISH_SCRIPT" \ + || fail "release script must use a unique pod staging path" +grep -Fq 'k cp --retries=3' "$PUBLISH_SCRIPT" \ + || fail "release script must retry kubectl cp under slow k3s API behavior" pin_add_line="$(line_number "ipfs pin add \"\$CID\"" "$PUBLISH_SCRIPT")" publish_line="$(line_number "ipfs name publish --key=ww-release \"/ipfs/\$CID\"" "$PUBLISH_SCRIPT")"