From 6f1b21b6a3cad9ba8a816c8ec288a743b0dba5bc Mon Sep 17 00:00:00 2001 From: Pawel Date: Fri, 26 Jun 2026 14:57:21 +0200 Subject: [PATCH 1/5] fix(install): give the tarball cache a 1h TTL so in-archive manifest changes self-heal (#270) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The agent-install tarball cache key is sha256(url + index snapshot fingerprint), which busts the moment registry-index.json changes but is blind to a manifest edit made INSIDE the rolling main.tar.gz (a status: flip, a keyword fix) that leaves the index byte-identical — so a warm cache served the stale archive forever (the cache had no TTL, unlike the index/catalog). Add the same 1h CACHE_TTL the index/catalog already use (fetch::CACHE_TTL) to the tarball cache: a warm cache self-refreshes within an hour, bounded to one re-download per snapshot (not per agent). The fingerprint still busts instantly on index changes; the TTL is the backstop for the residual in-archive case. Verified end-to-end on the real binary: an archive shipping status:planned installs planned; flipping it to available inside the same archive (index untouched) keeps serving planned while the cache is warm, then self-heals to available once the cache ages past the TTL. --- cli/src/install/registry.rs | 182 +++++++++++++++++++++++++++++++++++- cli/src/registry/fetch.rs | 6 +- 2 files changed, 185 insertions(+), 3 deletions(-) diff --git a/cli/src/install/registry.rs b/cli/src/install/registry.rs index fd62feb4a..92b0cc76a 100644 --- a/cli/src/install/registry.rs +++ b/cli/src/install/registry.rs @@ -3,6 +3,7 @@ //! off to `local::install_agent_from_path`. 
use std::path::{Path, PathBuf}; +use std::time::SystemTime; use flate2::read::GzDecoder; use sha2::{Digest, Sha256}; @@ -13,6 +14,7 @@ use crate::install::local::{copy_dir_recursive, install_agent_from_path}; use crate::manifest::loader::load_agent; use crate::paths::Paths; use crate::registry::Index; +use crate::registry::fetch::CACHE_TTL; use crate::validate::{Severity, has_errors, validate_agent_on_disk}; pub fn install_agent_from_registry( @@ -55,12 +57,18 @@ fn stage_agent_from_registry( // replaces keying on `updated-at` ALONE here (#254): that field is hand-maintained // and went stale, so a newly-added agent's subdir stayed absent from the cached // archive forever — hashing the content too busts the cache regardless. + // + // The fingerprint busts the cache the instant `registry-index.json` changes, but a + // manifest edit made INSIDE the rolling `main.tar.gz` (a `status:` flip, a keyword + // fix) that leaves the index byte-identical never rotates it — so the cache also + // carries the same 1h TTL the index/catalog use (`fetch::CACHE_TTL`), bounding that + // residual staleness to one self-healing re-download per snapshot (#270). let cache_file = cache_dir.join(tarball_cache_name( &entry.tarball, &index.snapshot_fingerprint(), )); - if cache_file.is_file() { + if cache_file.is_file() && cache_is_fresh(&cache_file) { std::fs::copy(&cache_file, &tarball_path)?; } else if let Some(path) = entry.tarball.strip_prefix("file://") { std::fs::copy(path, &tarball_path)?; @@ -106,6 +114,21 @@ fn tarball_cache_name(tarball: &str, snapshot: &str) -> String { format!("tarball-{:x}.tar.gz", h.finalize()) } +/// The cached tarball is fresh if it was (re)written within `CACHE_TTL`. 
The snapshot +/// fingerprint busts the cache the instant `registry-index.json` changes, but a manifest +/// edit made INSIDE the rolling `main.tar.gz` that leaves the index byte-identical never +/// rotates the fingerprint — so without a TTL that change would be served stale forever +/// (#270). This mirrors the index/catalog 1h TTL (`fetch::CACHE_TTL`): a warm cache +/// self-refreshes within an hour, bounded to one re-download per snapshot (not per agent). +/// Any failure to read the mtime falls to `false` (re-download) — prefer fresh over stale. +fn cache_is_fresh(cache_file: &Path) -> bool { + std::fs::metadata(cache_file) + .and_then(|m| m.modified()) + .ok() + .and_then(|modified| SystemTime::now().duration_since(modified).ok()) + .is_some_and(|age| age < CACHE_TTL) +} + /// Atomically update an installed agent to the latest registry version. /// /// `id` is the agent as it is installed (its `manifest.agent` / folder name) or @@ -535,6 +558,163 @@ mod tests { ); } + /// Like `write_repo_tarball` but for a single `alpha` agent whose `display-name` + /// carries a caller-chosen marker, so a test can tell one archive *state* from + /// another while the registry index stays byte-identical (same fingerprint). 
+ fn write_alpha_archive(path: &Path, display_marker: &str) { + let enc = flate2::write::GzEncoder::new( + std::fs::File::create(path).unwrap(), + flate2::Compression::default(), + ); + let mut tar = tar::Builder::new(enc); + + let repo = std::path::PathBuf::from(env!("CARGO_MANIFEST_DIR")) + .parent() + .unwrap() + .to_path_buf(); + let tekla_manifest = + std::fs::read_to_string(repo.join("20-agents/aeco/engineering/tekla/manifest.yaml")) + .unwrap(); + let agent: crate::manifest::Agent = serde_yaml::from_str(&tekla_manifest).unwrap(); + let skills_src = repo.join("20-agents/aeco/engineering/tekla/skills"); + + let header = |len: usize| { + let mut h = tar::Header::new_gnu(); + h.set_size(len as u64); + h.set_mode(0o644); + h.set_cksum(); + h + }; + + // Rename `agent:` → alpha and stamp `display-name:` with the marker (both single lines). + let manifest = tekla_manifest + .lines() + .map(|l| { + if l.starts_with("agent:") { + "agent: alpha".to_string() + } else if l.starts_with("display-name:") { + format!("display-name: {display_marker}") + } else { + l.to_string() + } + }) + .collect::<Vec<_>>() + .join("\n") + + "\n"; + tar.append_data( + &mut header(manifest.len()), + "aware-main/20-agents/alpha/manifest.yaml", + manifest.as_bytes(), + ) + .unwrap(); + for skill in &agent.skills { + let body = std::fs::read_to_string(skills_src.join(skill)).unwrap(); + tar.append_data( + &mut header(body.len()), + format!("aware-main/20-agents/alpha/skills/{skill}"), + body.as_bytes(), + ) + .unwrap(); + } + let mut file = tar.into_inner().unwrap().finish().unwrap(); + file.flush().unwrap(); + } + + #[test] + fn stale_tarball_cache_self_refreshes_after_ttl_even_when_index_unchanged() { + // #270: a manifest change made INSIDE the rolling `main.tar.gz` that leaves + // `registry-index.json` byte-identical does not rotate the snapshot fingerprint, + // so the fingerprint key alone can never bust the tarball cache. 
The cache has no + // per-agent re-download budget (that is the #243 optimization), so without a TTL + // the stale archive is served forever. This proves the tarball cache self-refreshes + // once it ages past CACHE_TTL — the same 1h lever the index/catalog already pull. + use crate::registry::fetch::CACHE_TTL; + use std::time::{Duration, SystemTime}; + + let tmp = tempfile::tempdir().unwrap(); + let aware = tmp.path().join("aware"); + let paths = Paths { + aware_home: aware.clone(), + }; + + // One archive file = the single mutable `main` archive URL over time. + let archive = tmp.path().join("main.tar.gz"); + let url = format!("file://{}", archive.display()); + + // An index that does NOT change between the two archive states → identical fingerprint. + let index = { + let mut versions = BTreeMap::new(); + versions.insert( + "1".to_string(), + VersionEntry { + tarball: url.clone(), + subdir: "aware-main/20-agents/alpha".to_string(), + }, + ); + let mut agents = BTreeMap::new(); + agents.insert( + "alpha".to_string(), + IndexEntry { + versions, + ..Default::default() + }, + ); + Index { + version: "1.0".into(), + updated_at: "2026-06-25T00:00:00Z".into(), + agents, + bundles: BTreeMap::new(), + } + }; + + let cache_file = paths + .cache_dir() + .join("agents") + .join(tarball_cache_name(&url, &index.snapshot_fingerprint())); + + // v1 of the archive: alpha carries marker DISPLAY-V1. Stage it → caches v1. + write_alpha_archive(&archive, "DISPLAY-V1"); + let (_g1, sub1) = stage_agent_from_registry("alpha", None, &paths, &index).unwrap(); + assert!( + std::fs::read_to_string(sub1.join("manifest.yaml")) + .unwrap() + .contains("DISPLAY-V1") + ); + assert!(cache_file.is_file(), "the tarball was cached"); + + // The manifest inside the SAME archive flips, but the index is byte-identical. 
+ write_alpha_archive(&archive, "DISPLAY-V2"); + + // Within TTL the warm cache is still served (bounded staleness, by design — the + // fingerprint key cannot see a change that lives inside the archive). + let (_g2, sub2) = stage_agent_from_registry("alpha", None, &paths, &index).unwrap(); + assert!( + std::fs::read_to_string(sub2.join("manifest.yaml")) + .unwrap() + .contains("DISPLAY-V1"), + "a fresh cache is reused — the fingerprint key cannot bust an in-archive change" + ); + + // Age the cache past the TTL → the next stage must re-pull the now-current archive. + let stale = SystemTime::now() + .checked_sub(CACHE_TTL + Duration::from_secs(60)) + .unwrap(); + std::fs::File::options() + .write(true) + .open(&cache_file) + .unwrap() + .set_modified(stale) + .unwrap(); + + let (_g3, sub3) = stage_agent_from_registry("alpha", None, &paths, &index).unwrap(); + assert!( + std::fs::read_to_string(sub3.join("manifest.yaml")) + .unwrap() + .contains("DISPLAY-V2"), + "#270: once the tarball cache ages past CACHE_TTL it self-refreshes despite an unchanged index" + ); + } + #[test] fn install_busts_cache_when_index_grows_even_with_frozen_updated_at() { // The #254 end-to-end regression. The shared `main` archive is a MUTABLE ref: diff --git a/cli/src/registry/fetch.rs b/cli/src/registry/fetch.rs index 461b53119..be679b2f5 100644 --- a/cli/src/registry/fetch.rs +++ b/cli/src/registry/fetch.rs @@ -16,8 +16,10 @@ pub const DEFAULT_REGISTRY_URL: &str = pub const DEFAULT_CATALOG_URL: &str = "https://raw.githubusercontent.com/aware-aeco/aware/main/registry-catalog.json"; -/// Cache TTL — 1 hour. Re-fetch happens after this expires. -const CACHE_TTL: Duration = Duration::from_secs(60 * 60); +/// Cache TTL — 1 hour. Re-fetch happens after this expires. Shared with the +/// install tarball cache (`install::registry`) so the index, catalog, and the +/// rolling `main.tar.gz` all carry the same 1h freshness lever (#270). 
+pub(crate) const CACHE_TTL: Duration = Duration::from_secs(60 * 60); pub fn registry_source() -> String { std::env::var("AWARE_REGISTRY").unwrap_or_else(|_| DEFAULT_REGISTRY_URL.to_string()) From cd7754318584838a485a7f8dd71b4d43ecfcd0bb Mon Sep 17 00:00:00 2001 From: Pawel Date: Fri, 26 Jun 2026 15:03:13 +0200 Subject: [PATCH 2/5] fix(install): fall back to a stale tarball cache when refresh fails offline (#270) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Codex review: the new TTL skipped a present-but-stale cache and fell straight to the network branch, so an offline/timeout install the warm cache could still satisfy began failing after an hour — a regression from the prior unconditional cache use. Extract the refresh (file:// copy or HTTP download) into `refresh_tarball` and, on any failure, fall back to a stale-but-present cache instead of erroring — mirroring `fetch_index`'s stale-index fallback. A TTL means "prefer fresh", not "refuse stale when fresh is unreachable". Only a cold cache (nothing to fall back to) propagates the error. Covered by `stale_tarball_cache_is_reused_when_refresh_fails_offline`. 
--- cli/src/install/registry.rs | 147 +++++++++++++++++++++++++++--------- 1 file changed, 111 insertions(+), 36 deletions(-) diff --git a/cli/src/install/registry.rs b/cli/src/install/registry.rs index 92b0cc76a..1ef1d0f17 100644 --- a/cli/src/install/registry.rs +++ b/cli/src/install/registry.rs @@ -70,18 +70,18 @@ fn stage_agent_from_registry( if cache_file.is_file() && cache_is_fresh(&cache_file) { std::fs::copy(&cache_file, &tarball_path)?; - } else if let Some(path) = entry.tarball.strip_prefix("file://") { - std::fs::copy(path, &tarball_path)?; - let _ = std::fs::copy(&tarball_path, &cache_file); - } else { - let resp = ureq::get(&entry.tarball) - .timeout(std::time::Duration::from_secs(60)) - .call() - .map_err(|e| AwareError::Network(format!("GET {}: {e}", entry.tarball)))?; - let mut reader = resp.into_reader(); - let mut file = std::fs::File::create(&tarball_path)?; - std::io::copy(&mut reader, &mut file)?; - let _ = std::fs::copy(&tarball_path, &cache_file); + } else if let Err(refresh_err) = refresh_tarball(&entry.tarball, &tarball_path, &cache_file) { + // Refresh failed (offline, timeout, a vanished `file://` source). A TTL means + // "prefer fresh", not "refuse stale when fresh is unreachable": a stale-but-present + // cache still satisfies the install, so fall back to it rather than failing — exactly + // as `fetch_index` falls back to a stale index on a network error. Only a COLD cache + // (nothing to fall back to) propagates the error. + if cache_file.is_file() { + eprintln!("warning: tarball refresh failed, using stale cache: {refresh_err}"); + std::fs::copy(&cache_file, &tarball_path)?; + } else { + return Err(refresh_err); + } } // Extract ONLY the agent's subtree, not the whole archive: the substrate tarball is @@ -129,6 +129,27 @@ fn cache_is_fresh(cache_file: &Path) -> bool { .is_some_and(|age| age < CACHE_TTL) } +/// (Re)download the registry tarball into `dest` and refresh the shared cache file. 
+/// `tarball` is a `file://` path or an HTTP(S) URL. Any failure (offline, timeout, a +/// missing `file://` source) is returned so the caller can fall back to a stale cache +/// rather than failing the install (#270). Updating `cache_file` resets its mtime, so a +/// successful refresh re-arms the TTL. +fn refresh_tarball(tarball: &str, dest: &Path, cache_file: &Path) -> Result<(), AwareError> { + if let Some(path) = tarball.strip_prefix("file://") { + std::fs::copy(path, dest)?; + } else { + let resp = ureq::get(tarball) + .timeout(std::time::Duration::from_secs(60)) + .call() + .map_err(|e| AwareError::Network(format!("GET {tarball}: {e}")))?; + let mut reader = resp.into_reader(); + let mut file = std::fs::File::create(dest)?; + std::io::copy(&mut reader, &mut file)?; + } + let _ = std::fs::copy(dest, cache_file); + Ok(()) +} + /// Atomically update an installed agent to the latest registry version. /// /// `id` is the agent as it is installed (its `manifest.agent` / folder name) or @@ -620,6 +641,33 @@ mod tests { file.flush().unwrap(); } + /// A registry index carrying a single `alpha` agent backed by `url`. Used to prove + /// cache behavior across archive states that leave the index byte-identical (#270). 
+ fn single_alpha_index(url: &str) -> Index { + let mut versions = BTreeMap::new(); + versions.insert( + "1".to_string(), + VersionEntry { + tarball: url.to_string(), + subdir: "aware-main/20-agents/alpha".to_string(), + }, + ); + let mut agents = BTreeMap::new(); + agents.insert( + "alpha".to_string(), + IndexEntry { + versions, + ..Default::default() + }, + ); + Index { + version: "1.0".into(), + updated_at: "2026-06-25T00:00:00Z".into(), + agents, + bundles: BTreeMap::new(), + } + } + #[test] fn stale_tarball_cache_self_refreshes_after_ttl_even_when_index_unchanged() { // #270: a manifest change made INSIDE the rolling `main.tar.gz` that leaves @@ -642,30 +690,7 @@ mod tests { let url = format!("file://{}", archive.display()); // An index that does NOT change between the two archive states → identical fingerprint. - let index = { - let mut versions = BTreeMap::new(); - versions.insert( - "1".to_string(), - VersionEntry { - tarball: url.clone(), - subdir: "aware-main/20-agents/alpha".to_string(), - }, - ); - let mut agents = BTreeMap::new(); - agents.insert( - "alpha".to_string(), - IndexEntry { - versions, - ..Default::default() - }, - ); - Index { - version: "1.0".into(), - updated_at: "2026-06-25T00:00:00Z".into(), - agents, - bundles: BTreeMap::new(), - } - }; + let index = single_alpha_index(&url); let cache_file = paths .cache_dir() @@ -715,6 +740,56 @@ mod tests { ); } + #[test] + fn stale_tarball_cache_is_reused_when_refresh_fails_offline() { + // Codex review of #270: a TTL that SKIPS a present cache must not turn a + // transient-network / offline install into a hard failure. When the tarball is + // stale AND the source is unreachable, fall back to the stale cache (as + // `fetch_index` does for the index) rather than erroring — a stale install beats + // no install. 
+ use crate::registry::fetch::CACHE_TTL; + use std::time::{Duration, SystemTime}; + + let tmp = tempfile::tempdir().unwrap(); + let aware = tmp.path().join("aware"); + let paths = Paths { + aware_home: aware.clone(), + }; + let archive = tmp.path().join("main.tar.gz"); + let url = format!("file://{}", archive.display()); + let index = single_alpha_index(&url); + + // Warm the cache from the live source. + write_alpha_archive(&archive, "DISPLAY-V1"); + let (_g1, _s1) = stage_agent_from_registry("alpha", None, &paths, &index).unwrap(); + + let cache_file = paths + .cache_dir() + .join("agents") + .join(tarball_cache_name(&url, &index.snapshot_fingerprint())); + + // Age the cache past the TTL AND make the source unreachable (offline). + let stale = SystemTime::now() + .checked_sub(CACHE_TTL + Duration::from_secs(60)) + .unwrap(); + std::fs::File::options() + .write(true) + .open(&cache_file) + .unwrap() + .set_modified(stale) + .unwrap(); + std::fs::remove_file(&archive).unwrap(); + + // The install still succeeds, served from the stale-but-present cache. + let (_g2, s2) = stage_agent_from_registry("alpha", None, &paths, &index).unwrap(); + assert!( + std::fs::read_to_string(s2.join("manifest.yaml")) + .unwrap() + .contains("DISPLAY-V1"), + "offline + stale cache: fall back to the cache instead of failing the install" + ); + } + #[test] fn install_busts_cache_when_index_grows_even_with_frozen_updated_at() { // The #254 end-to-end regression. 
The shared `main` archive is a MUTABLE ref: From 8aa2a1ffbe615bd66767b6128a3198ec005f695d Mon Sep 17 00:00:00 2001 From: Pawel Date: Fri, 26 Jun 2026 15:12:24 +0200 Subject: [PATCH 3/5] fix(install): keep TTL tarball refreshes consistent with the cached index (#270) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Codex review: installs resolve against a cached (1h-TTL) index, so a TTL-triggered tarball refresh can pull the rolling main.tar.gz at a snapshot that advanced PAST that index — to one where the agent's subdir moved — making the fresh archive lack entry.subdir. The previous code would then fail the install ("subdir not in tarball") and had already overwritten the good cache with the raced archive (poisoning the snapshot's cache file). Two changes keep the cache consistent with the index it is keyed by: - Commit a downloaded archive to the shared cache ONLY after it has successfully served the agent (post-extraction) — a download that raced past the index can't poison the cache. - When a refreshed archive lacks the requested subdir, fall back to the prior cache (which was consistent with the cached index) instead of failing; the index self-corrects on its own TTL. Covered by `ttl_refresh_falls_back_to_prior_cache_when_archive_outran_the_index`. --- cli/src/install/registry.rs | 129 ++++++++++++++++++++++++++++++------ 1 file changed, 110 insertions(+), 19 deletions(-) diff --git a/cli/src/install/registry.rs b/cli/src/install/registry.rs index 1ef1d0f17..bb726f656 100644 --- a/cli/src/install/registry.rs +++ b/cli/src/install/registry.rs @@ -68,19 +68,25 @@ fn stage_agent_from_registry( &index.snapshot_fingerprint(), )); + // Obtain the archive into `tarball_path`. Track whether it came from a live (re)download + // (vs the warm/stale cache) so we only commit a download to the shared cache AFTER it has + // served this agent — never before extraction. 
+ let mut downloaded = false; if cache_file.is_file() && cache_is_fresh(&cache_file) { std::fs::copy(&cache_file, &tarball_path)?; - } else if let Err(refresh_err) = refresh_tarball(&entry.tarball, &tarball_path, &cache_file) { - // Refresh failed (offline, timeout, a vanished `file://` source). A TTL means - // "prefer fresh", not "refuse stale when fresh is unreachable": a stale-but-present - // cache still satisfies the install, so fall back to it rather than failing — exactly - // as `fetch_index` falls back to a stale index on a network error. Only a COLD cache - // (nothing to fall back to) propagates the error. - if cache_file.is_file() { - eprintln!("warning: tarball refresh failed, using stale cache: {refresh_err}"); - std::fs::copy(&cache_file, &tarball_path)?; - } else { - return Err(refresh_err); + } else { + match download_tarball(&entry.tarball, &tarball_path) { + Ok(()) => downloaded = true, + // Refresh failed (offline, timeout, a vanished `file://` source). A TTL means + // "prefer fresh", not "refuse stale when fresh is unreachable": a stale-but-present + // cache still satisfies the install, so fall back to it rather than failing — as + // `fetch_index` falls back to a stale index on a network error. We do NOT re-arm + // the TTL here, so the next install retries the source. A COLD cache propagates. + Err(refresh_err) if cache_file.is_file() => { + eprintln!("warning: tarball refresh failed, using stale cache: {refresh_err}"); + std::fs::copy(&cache_file, &tarball_path)?; + } + Err(refresh_err) => return Err(refresh_err), } } @@ -90,14 +96,39 @@ fn stage_agent_from_registry( // seekable) but write only the matching entries. 
let extract_root = scratch.path().join("extract"); extract_subdir(&tarball_path, &extract_root, &entry.subdir)?; + let mut subdir = extract_root.join(&entry.subdir); + + if !subdir.is_dir() && downloaded && cache_file.is_file() { + // A TTL-triggered re-download pulled the rolling `main.tar.gz` at a snapshot that has + // advanced PAST our (cached, 1h-TTL) index to one where this agent's subdir moved, so + // the fresh archive no longer carries `entry.subdir`. The prior cache was consistent + // with the cached index — fall back to it rather than failing the install (#270 / Codex + // review). `downloaded` stays implicitly true so we still avoid caching the raced + // archive; we just don't surface it as the install source. + eprintln!( + "warning: refreshed archive no longer carries {}; using prior cache", + entry.subdir + ); + downloaded = false; + std::fs::copy(&cache_file, &tarball_path)?; + let retry_root = scratch.path().join("extract-cached"); + extract_subdir(&tarball_path, &retry_root, &entry.subdir)?; + subdir = retry_root.join(&entry.subdir); + } - let subdir = extract_root.join(&entry.subdir); if !subdir.is_dir() { return Err(AwareError::Validation(format!( "registry entry {key}@{resolved_version}: subdir {} not in tarball", entry.subdir, ))); } + + // Commit a freshly-downloaded archive to the shared cache ONLY now that it has served this + // agent (re-arming the TTL). Caching post-extraction means a download that raced past our + // index — and so lacks the requested subdir — can never poison the snapshot's cache file. + if downloaded { + let _ = std::fs::copy(&tarball_path, &cache_file); + } Ok((scratch, subdir)) } @@ -129,12 +160,12 @@ fn cache_is_fresh(cache_file: &Path) -> bool { .is_some_and(|age| age < CACHE_TTL) } -/// (Re)download the registry tarball into `dest` and refresh the shared cache file. -/// `tarball` is a `file://` path or an HTTP(S) URL. 
Any failure (offline, timeout, a -/// missing `file://` source) is returned so the caller can fall back to a stale cache -/// rather than failing the install (#270). Updating `cache_file` resets its mtime, so a -/// successful refresh re-arms the TTL. -fn refresh_tarball(tarball: &str, dest: &Path, cache_file: &Path) -> Result<(), AwareError> { +/// (Re)download the registry tarball into `dest`. `tarball` is a `file://` path or an +/// HTTP(S) URL. Any failure (offline, timeout, a missing `file://` source) is returned so +/// the caller can fall back to a stale cache rather than failing the install (#270). The +/// caller commits `dest` to the shared cache only after a successful extraction, so a +/// download that raced past the cached index can't poison the snapshot's cache file. +fn download_tarball(tarball: &str, dest: &Path) -> Result<(), AwareError> { if let Some(path) = tarball.strip_prefix("file://") { std::fs::copy(path, dest)?; } else { @@ -146,7 +177,6 @@ fn refresh_tarball(tarball: &str, dest: &Path, cache_file: &Path) -> Result<(), let mut file = std::fs::File::create(dest)?; std::io::copy(&mut reader, &mut file)?; } - let _ = std::fs::copy(dest, cache_file); Ok(()) } @@ -790,6 +820,67 @@ mod tests { ); } + #[test] + fn ttl_refresh_falls_back_to_prior_cache_when_archive_outran_the_index() { + // Codex review of #270: installs resolve against a CACHED index (1h TTL). A TTL + // tarball refresh can pull the rolling `main.tar.gz` at a snapshot that advanced PAST + // that cached index — to one where this agent's subdir moved — so the fresh archive no + // longer carries `entry.subdir`. That must NOT fail the install (the prior cache was + // consistent with the cached index) and must NOT poison the snapshot's cache file. 
+ use crate::registry::fetch::CACHE_TTL; + use std::time::{Duration, SystemTime}; + + let tmp = tempfile::tempdir().unwrap(); + let aware = tmp.path().join("aware"); + let paths = Paths { + aware_home: aware.clone(), + }; + let archive = tmp.path().join("main.tar.gz"); + let url = format!("file://{}", archive.display()); + // The cached index keeps alpha at its original subdir (fingerprint frozen). + let index = single_alpha_index(&url); + + // Warm the cache from an archive that carries alpha. + write_alpha_archive(&archive, "DISPLAY-V1"); + let (_g1, _s1) = stage_agent_from_registry("alpha", None, &paths, &index).unwrap(); + + let cache_file = paths + .cache_dir() + .join("agents") + .join(tarball_cache_name(&url, &index.snapshot_fingerprint())); + + // The rolling archive advances PAST the cached index: alpha's subdir is gone (only an + // unrelated `beta` remains). Age the tarball cache so a refresh is triggered. + write_repo_tarball(&archive, &["beta"]); + let stale = SystemTime::now() + .checked_sub(CACHE_TTL + Duration::from_secs(60)) + .unwrap(); + std::fs::File::options() + .write(true) + .open(&cache_file) + .unwrap() + .set_modified(stale) + .unwrap(); + + // The install still succeeds, served from the prior (index-consistent) cache. + let (_g2, s2) = stage_agent_from_registry("alpha", None, &paths, &index).unwrap(); + assert!( + std::fs::read_to_string(s2.join("manifest.yaml")) + .unwrap() + .contains("DISPLAY-V1"), + "archive outran the index: fall back to the prior cache, don't fail the install" + ); + + // The cache was NOT poisoned with the alpha-less archive — alpha is still installable. 
+ let (_g3, s3) = stage_agent_from_registry("alpha", None, &paths, &index).unwrap(); + assert!( + std::fs::read_to_string(s3.join("manifest.yaml")) + .unwrap() + .contains("DISPLAY-V1"), + "the raced archive must not overwrite the snapshot's good cache file" + ); + } + #[test] fn install_busts_cache_when_index_grows_even_with_frozen_updated_at() { // The #254 end-to-end regression. The shared `main` archive is a MUTABLE ref: From 432460ae3e95055485cdfae7d486963ce0b83e06 Mon Sep 17 00:00:00 2001 From: Pawel Date: Fri, 26 Jun 2026 15:34:09 +0200 Subject: [PATCH 4/5] fix(install): fall back to the prior cache on a corrupt TTL refresh too (#270) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Codex review: the moved-subdir fallback ran only after a SUCCESSFUL extraction, so a TTL-refreshed archive that was truncated/corrupt (a transient bad body, or a local archive caught mid-write) propagated the gzip/tar error before the fallback could run — failing an install a prior cache could still satisfy. Unify the two unusable-refresh cases behind `extract_agent_subdir`, which fails on either a corrupt archive OR a missing subdir. The caller treats both the same: fall back to the prior, index-consistent cache before failing. Covered by `ttl_refresh_falls_back_to_prior_cache_when_refresh_is_corrupt`. --- cli/src/install/registry.rs | 139 +++++++++++++++++++++++++++--------- 1 file changed, 106 insertions(+), 33 deletions(-) diff --git a/cli/src/install/registry.rs b/cli/src/install/registry.rs index bb726f656..3ad863b8c 100644 --- a/cli/src/install/registry.rs +++ b/cli/src/install/registry.rs @@ -90,42 +90,43 @@ fn stage_agent_from_registry( } } - // Extract ONLY the agent's subtree, not the whole archive: the substrate tarball is - // the entire monorepo, so unpacking all of it (tens of thousands of files) to reach - // one agent dominated install time — #243. 
We still stream through the gzip (it isn't - // seekable) but write only the matching entries. + // Extract the agent's subtree and confirm it landed. A refreshed archive can be unusable + // two ways: truncated/corrupt (a transient bad body, or a local archive caught mid-write) + // so extraction ERRORS, or advanced PAST our cached (1h-TTL) index to a snapshot where the + // subdir MOVED so it is absent. Both are handled identically — a prior cache was consistent + // with the cached index, so fall back to it rather than failing the install (#270 / Codex + // review). A cold cache (nothing to fall back to) propagates the error. let extract_root = scratch.path().join("extract"); - extract_subdir(&tarball_path, &extract_root, &entry.subdir)?; - let mut subdir = extract_root.join(&entry.subdir); - - if !subdir.is_dir() && downloaded && cache_file.is_file() { - // A TTL-triggered re-download pulled the rolling `main.tar.gz` at a snapshot that has - // advanced PAST our (cached, 1h-TTL) index to one where this agent's subdir moved, so - // the fresh archive no longer carries `entry.subdir`. The prior cache was consistent - // with the cached index — fall back to it rather than failing the install (#270 / Codex - // review). `downloaded` stays implicitly true so we still avoid caching the raced - // archive; we just don't surface it as the install source. 
- eprintln!( - "warning: refreshed archive no longer carries {}; using prior cache", - entry.subdir - ); - downloaded = false; - std::fs::copy(&cache_file, &tarball_path)?; - let retry_root = scratch.path().join("extract-cached"); - extract_subdir(&tarball_path, &retry_root, &entry.subdir)?; - subdir = retry_root.join(&entry.subdir); - } - - if !subdir.is_dir() { - return Err(AwareError::Validation(format!( - "registry entry {key}@{resolved_version}: subdir {} not in tarball", - entry.subdir, - ))); - } + let subdir = match extract_agent_subdir( + &tarball_path, + &extract_root, + &entry.subdir, + key, + resolved_version, + ) { + Ok(dir) => dir, + Err(_) if downloaded && cache_file.is_file() => { + eprintln!( + "warning: refreshed archive unusable for {}; using prior cache", + entry.subdir + ); + downloaded = false; + std::fs::copy(&cache_file, &tarball_path)?; + let retry_root = scratch.path().join("extract-cached"); + extract_agent_subdir( + &tarball_path, + &retry_root, + &entry.subdir, + key, + resolved_version, + )? + } + Err(err) => return Err(err), + }; // Commit a freshly-downloaded archive to the shared cache ONLY now that it has served this - // agent (re-arming the TTL). Caching post-extraction means a download that raced past our - // index — and so lacks the requested subdir — can never poison the snapshot's cache file. + // agent (re-arming the TTL). Caching post-extraction means a download that was corrupt or + // raced past our index can never poison the snapshot's cache file. if downloaded { let _ = std::fs::copy(&tarball_path, &cache_file); } @@ -277,6 +278,29 @@ pub fn update_agent_from_registry( Ok(new_name) } +/// Extract `subdir` from `tarball` into a fresh dir under `extract_root` and return the +/// agent's source dir. Fails if the archive can't be read (truncated / corrupt gzip or tar) +/// OR does not carry `subdir`. 
Callers treat both failures identically: a refreshed archive +/// that is corrupt or has advanced past the cached index is replaced by a prior, +/// index-consistent cache when one exists (#270). +fn extract_agent_subdir( + tarball: &Path, + extract_root: &Path, + subdir: &str, + key: &str, + version: &str, +) -> Result<PathBuf> { + extract_subdir(tarball, extract_root, subdir)?; + let dir = extract_root.join(subdir); + if dir.is_dir() { + Ok(dir) + } else { + Err(AwareError::Validation(format!( + "registry entry {key}@{version}: subdir {subdir} not in tarball" + ))) + } +} + /// Extract only the entries under `subdir` (the agent's own subtree) from the tarball /// into `dest`, preserving their archive-relative paths. The substrate tarball is the /// whole monorepo, so unpacking everything to reach one agent was the dominant install @@ -881,6 +905,55 @@ mod tests { ); } + #[test] + fn ttl_refresh_falls_back_to_prior_cache_when_refresh_is_corrupt() { + // Codex review of #270: a TTL refresh whose source returns a truncated/garbage body + // (a transient bad response, or a local archive caught mid-write) must not fail an + // install a prior cache could satisfy — an extraction error falls back to the cache + // just like a moved subdir does. + use crate::registry::fetch::CACHE_TTL; + use std::time::{Duration, SystemTime}; + + let tmp = tempfile::tempdir().unwrap(); + let aware = tmp.path().join("aware"); + let paths = Paths { + aware_home: aware.clone(), + }; + let archive = tmp.path().join("main.tar.gz"); + let url = format!("file://{}", archive.display()); + let index = single_alpha_index(&url); + + // Warm the cache from a good archive.
+ write_alpha_archive(&archive, "DISPLAY-V1"); + let (_g1, _s1) = stage_agent_from_registry("alpha", None, &paths, &index).unwrap(); + + let cache_file = paths + .cache_dir() + .join("agents") + .join(tarball_cache_name(&url, &index.snapshot_fingerprint())); + + // The source is now garbage (not a valid gzip stream); age the cache to force a refresh. + std::fs::write(&archive, b"not a gzip stream").unwrap(); + let stale = SystemTime::now() + .checked_sub(CACHE_TTL + Duration::from_secs(60)) + .unwrap(); + std::fs::File::options() + .write(true) + .open(&cache_file) + .unwrap() + .set_modified(stale) + .unwrap(); + + // The install still succeeds, served from the prior cache instead of erroring. + let (_g2, s2) = stage_agent_from_registry("alpha", None, &paths, &index).unwrap(); + assert!( + std::fs::read_to_string(s2.join("manifest.yaml")) + .unwrap() + .contains("DISPLAY-V1"), + "corrupt refresh: fall back to the prior cache, don't fail the install" + ); + } + #[test] fn install_busts_cache_when_index_grows_even_with_frozen_updated_at() { // The #254 end-to-end regression. The shared `main` archive is a MUTABLE ref: From ead87723b63512b1fa1b1326731160f57c184c39 Mon Sep 17 00:00:00 2001 From: Pawel Date: Fri, 26 Jun 2026 15:51:28 +0200 Subject: [PATCH 5/5] chore(docs): sync cli_version stat to 0.81.0 (unblock Stats CI) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The v0.81.0 release bumped cli/Cargo.toml but did not run sync_stats, leaving README.md/CLAUDE.md at 0.80.0 — pre-existing drift that fails the Stats CI check. Pure mechanical sync; no behavior change. --- CLAUDE.md | 2 +- README.md | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/CLAUDE.md b/CLAUDE.md index d6852f204..c4a46f21e 100644 --- a/CLAUDE.md +++ b/CLAUDE.md @@ -1,6 +1,6 @@ # AWARE — Project Instructions -You are working on **AWARE** (`aware-aeco/aware`) — the open-source agentic substrate for AECO. 
The substrate is content-complete (manifesto, decalog, specs, 75 agents, 3,339 skills, 7 reference apps, 11 meta-primitives) and the `aware` CLI has shipped (v0.80.0). The current focus is the **`aware` CLI** — the runtime binary that executes everything the substrate describes. +You are working on **AWARE** (`aware-aeco/aware`) — the open-source agentic substrate for AECO. The substrate is content-complete (manifesto, decalog, specs, 75 agents, 3,339 skills, 7 reference apps, 11 meta-primitives) and the `aware` CLI has shipped (v0.81.0). The current focus is the **`aware` CLI** — the runtime binary that executes everything the substrate describes. ## Read these first (in order) diff --git a/README.md b/README.md index 11d15c577..f8c31d20c 100644 --- a/README.md +++ b/README.md @@ -93,7 +93,7 @@ aware-aeco/ **Substrate: content-complete. Runtime: shipped.** -The `aware` CLI is live at **v0.80.0** (Rust), published to npm as **`@aware-aeco/cli`**, with curl + PowerShell installers in [`scripts/`](./scripts/). What began as 7 reference agents is now a working substrate: +The `aware` CLI is live at **v0.81.0** (Rust), published to npm as **`@aware-aeco/cli`**, with curl + PowerShell installers in [`scripts/`](./scripts/). What began as 7 reference agents is now a working substrate: - **75 agents** — 26 hand-written + 49 auto-generated from vendor SDKs — **all registered** in [`registry-index.json`](./registry-index.json) and installable today. - **`aware build agent`** generators: `--from-nuget`, `--from-npm`, `--from-yard`, `--from-openapi`, `--from-csharp` (Roslyn source reader).