diff --git a/CLAUDE.md b/CLAUDE.md index d6852f204..c4a46f21e 100644 --- a/CLAUDE.md +++ b/CLAUDE.md @@ -1,6 +1,6 @@ # AWARE — Project Instructions -You are working on **AWARE** (`aware-aeco/aware`) — the open-source agentic substrate for AECO. The substrate is content-complete (manifesto, decalog, specs, 75 agents, 3,339 skills, 7 reference apps, 11 meta-primitives) and the `aware` CLI has shipped (v0.80.0). The current focus is the **`aware` CLI** — the runtime binary that executes everything the substrate describes. +You are working on **AWARE** (`aware-aeco/aware`) — the open-source agentic substrate for AECO. The substrate is content-complete (manifesto, decalog, specs, 75 agents, 3,339 skills, 7 reference apps, 11 meta-primitives) and the `aware` CLI has shipped (v0.81.0). The current focus is the **`aware` CLI** — the runtime binary that executes everything the substrate describes. ## Read these first (in order) diff --git a/README.md b/README.md index 11d15c577..f8c31d20c 100644 --- a/README.md +++ b/README.md @@ -93,7 +93,7 @@ aware-aeco/ **Substrate: content-complete. Runtime: shipped.** -The `aware` CLI is live at **v0.80.0** (Rust), published to npm as **`@aware-aeco/cli`**, with curl + PowerShell installers in [`scripts/`](./scripts/). What began as 7 reference agents is now a working substrate: +The `aware` CLI is live at **v0.81.0** (Rust), published to npm as **`@aware-aeco/cli`**, with curl + PowerShell installers in [`scripts/`](./scripts/). What began as 7 reference agents is now a working substrate: - **75 agents** — 26 hand-written + 49 auto-generated from vendor SDKs — **all registered** in [`registry-index.json`](./registry-index.json) and installable today. - **`aware build agent`** generators: `--from-nuget`, `--from-npm`, `--from-yard`, `--from-openapi`, `--from-csharp` (Roslyn source reader). 
diff --git a/cli/src/install/registry.rs b/cli/src/install/registry.rs index fd62feb4a..3ad863b8c 100644 --- a/cli/src/install/registry.rs +++ b/cli/src/install/registry.rs @@ -3,6 +3,7 @@ //! off to `local::install_agent_from_path`. use std::path::{Path, PathBuf}; +use std::time::SystemTime; use flate2::read::GzDecoder; use sha2::{Digest, Sha256}; @@ -13,6 +14,7 @@ use crate::install::local::{copy_dir_recursive, install_agent_from_path}; use crate::manifest::loader::load_agent; use crate::paths::Paths; use crate::registry::Index; +use crate::registry::fetch::CACHE_TTL; use crate::validate::{Severity, has_errors, validate_agent_on_disk}; pub fn install_agent_from_registry( @@ -55,40 +57,78 @@ fn stage_agent_from_registry( // replaces keying on `updated-at` ALONE here (#254): that field is hand-maintained // and went stale, so a newly-added agent's subdir stayed absent from the cached // archive forever — hashing the content too busts the cache regardless. + // + // The fingerprint busts the cache the instant `registry-index.json` changes, but a + // manifest edit made INSIDE the rolling `main.tar.gz` (a `status:` flip, a keyword + // fix) that leaves the index byte-identical never rotates it — so the cache also + // carries the same 1h TTL the index/catalog use (`fetch::CACHE_TTL`), bounding that + // residual staleness to one self-healing re-download per snapshot (#270). let cache_file = cache_dir.join(tarball_cache_name( &entry.tarball, &index.snapshot_fingerprint(), )); - if cache_file.is_file() { + // Obtain the archive into `tarball_path`. Track whether it came from a live (re)download + // (vs the warm/stale cache) so we only commit a download to the shared cache AFTER it has + // served this agent — never before extraction. 
+ let mut downloaded = false; + if cache_file.is_file() && cache_is_fresh(&cache_file) { std::fs::copy(&cache_file, &tarball_path)?; - } else if let Some(path) = entry.tarball.strip_prefix("file://") { - std::fs::copy(path, &tarball_path)?; - let _ = std::fs::copy(&tarball_path, &cache_file); } else { - let resp = ureq::get(&entry.tarball) - .timeout(std::time::Duration::from_secs(60)) - .call() - .map_err(|e| AwareError::Network(format!("GET {}: {e}", entry.tarball)))?; - let mut reader = resp.into_reader(); - let mut file = std::fs::File::create(&tarball_path)?; - std::io::copy(&mut reader, &mut file)?; - let _ = std::fs::copy(&tarball_path, &cache_file); + match download_tarball(&entry.tarball, &tarball_path) { + Ok(()) => downloaded = true, + // Refresh failed (offline, timeout, a vanished `file://` source). A TTL means + // "prefer fresh", not "refuse stale when fresh is unreachable": a stale-but-present + // cache still satisfies the install, so fall back to it rather than failing — as + // `fetch_index` falls back to a stale index on a network error. We do NOT re-arm + // the TTL here, so the next install retries the source. A COLD cache propagates. + Err(refresh_err) if cache_file.is_file() => { + eprintln!("warning: tarball refresh failed, using stale cache: {refresh_err}"); + std::fs::copy(&cache_file, &tarball_path)?; + } + Err(refresh_err) => return Err(refresh_err), + } } - // Extract ONLY the agent's subtree, not the whole archive: the substrate tarball is - // the entire monorepo, so unpacking all of it (tens of thousands of files) to reach - // one agent dominated install time — #243. We still stream through the gzip (it isn't - // seekable) but write only the matching entries. + // Extract the agent's subtree and confirm it landed. 
A refreshed archive can be unusable + // two ways: truncated/corrupt (a transient bad body, or a local archive caught mid-write) + // so extraction ERRORS, or advanced PAST our cached (1h-TTL) index to a snapshot where the + // subdir MOVED so it is absent. Both are handled identically — a prior cache was consistent + // with the cached index, so fall back to it rather than failing the install (#270 / Codex + // review). A cold cache (nothing to fall back to) propagates the error. let extract_root = scratch.path().join("extract"); - extract_subdir(&tarball_path, &extract_root, &entry.subdir)?; + let subdir = match extract_agent_subdir( + &tarball_path, + &extract_root, + &entry.subdir, + key, + resolved_version, + ) { + Ok(dir) => dir, + Err(_) if downloaded && cache_file.is_file() => { + eprintln!( + "warning: refreshed archive unusable for {}; using prior cache", + entry.subdir + ); + downloaded = false; + std::fs::copy(&cache_file, &tarball_path)?; + let retry_root = scratch.path().join("extract-cached"); + extract_agent_subdir( + &tarball_path, + &retry_root, + &entry.subdir, + key, + resolved_version, + )? + } + Err(err) => return Err(err), + }; - let subdir = extract_root.join(&entry.subdir); - if !subdir.is_dir() { - return Err(AwareError::Validation(format!( - "registry entry {key}@{resolved_version}: subdir {} not in tarball", - entry.subdir, - ))); + // Commit a freshly-downloaded archive to the shared cache ONLY now that it has served this + // agent (re-arming the TTL). Caching post-extraction means a download that was corrupt or + // raced past our index can never poison the snapshot's cache file. + if downloaded { + let _ = std::fs::copy(&tarball_path, &cache_file); } Ok((scratch, subdir)) } @@ -106,6 +146,41 @@ fn tarball_cache_name(tarball: &str, snapshot: &str) -> String { format!("tarball-{:x}.tar.gz", h.finalize()) } +/// The cached tarball is fresh if it was (re)written within `CACHE_TTL`. 
The snapshot +/// fingerprint busts the cache the instant `registry-index.json` changes, but a manifest +/// edit made INSIDE the rolling `main.tar.gz` that leaves the index byte-identical never +/// rotates the fingerprint — so without a TTL that change would be served stale forever +/// (#270). This mirrors the index/catalog 1h TTL (`fetch::CACHE_TTL`): a warm cache +/// self-refreshes within an hour, bounded to one re-download per snapshot (not per agent). +/// Any failure to read the mtime falls to `false` (re-download) — prefer fresh over stale. +fn cache_is_fresh(cache_file: &Path) -> bool { + std::fs::metadata(cache_file) + .and_then(|m| m.modified()) + .ok() + .and_then(|modified| SystemTime::now().duration_since(modified).ok()) + .is_some_and(|age| age < CACHE_TTL) +} + +/// (Re)download the registry tarball into `dest`. `tarball` is a `file://` path or an +/// HTTP(S) URL. Any failure (offline, timeout, a missing `file://` source) is returned so +/// the caller can fall back to a stale cache rather than failing the install (#270). The +/// caller commits `dest` to the shared cache only after a successful extraction, so a +/// download that raced past the cached index can't poison the snapshot's cache file. +fn download_tarball(tarball: &str, dest: &Path) -> Result<(), AwareError> { + if let Some(path) = tarball.strip_prefix("file://") { + std::fs::copy(path, dest)?; + } else { + let resp = ureq::get(tarball) + .timeout(std::time::Duration::from_secs(60)) + .call() + .map_err(|e| AwareError::Network(format!("GET {tarball}: {e}")))?; + let mut reader = resp.into_reader(); + let mut file = std::fs::File::create(dest)?; + std::io::copy(&mut reader, &mut file)?; + } + Ok(()) +} + /// Atomically update an installed agent to the latest registry version. 
/// /// `id` is the agent as it is installed (its `manifest.agent` / folder name) or @@ -203,6 +278,29 @@ pub fn update_agent_from_registry( Ok(new_name) } +/// Extract `subdir` from `tarball` into a fresh dir under `extract_root` and return the +/// agent's source dir. Fails if the archive can't be read (truncated / corrupt gzip or tar) +/// OR does not carry `subdir`. Callers treat both failures identically: a refreshed archive +/// that is corrupt or has advanced past the cached index is replaced by a prior, +/// index-consistent cache when one exists (#270). +fn extract_agent_subdir( + tarball: &Path, + extract_root: &Path, + subdir: &str, + key: &str, + version: &str, +) -> Result<PathBuf, AwareError> { + extract_subdir(tarball, extract_root, subdir)?; + let dir = extract_root.join(subdir); + if dir.is_dir() { + Ok(dir) + } else { + Err(AwareError::Validation(format!( + "registry entry {key}@{version}: subdir {subdir} not in tarball" + ))) + } +} + /// Extract only the entries under `subdir` (the agent's own subtree) from the tarball /// into `dest`, preserving their archive-relative paths. The substrate tarball is the /// whole monorepo, so unpacking everything to reach one agent was the dominant install @@ -535,6 +633,327 @@ mod tests { ); } + /// Like `write_repo_tarball` but for a single `alpha` agent whose `display-name` + /// carries a caller-chosen marker, so a test can tell one archive *state* from + /// another while the registry index stays byte-identical (same fingerprint). 
+ fn write_alpha_archive(path: &Path, display_marker: &str) { + let enc = flate2::write::GzEncoder::new( + std::fs::File::create(path).unwrap(), + flate2::Compression::default(), + ); + let mut tar = tar::Builder::new(enc); + + let repo = std::path::PathBuf::from(env!("CARGO_MANIFEST_DIR")) + .parent() + .unwrap() + .to_path_buf(); + let tekla_manifest = + std::fs::read_to_string(repo.join("20-agents/aeco/engineering/tekla/manifest.yaml")) + .unwrap(); + let agent: crate::manifest::Agent = serde_yaml::from_str(&tekla_manifest).unwrap(); + let skills_src = repo.join("20-agents/aeco/engineering/tekla/skills"); + + let header = |len: usize| { + let mut h = tar::Header::new_gnu(); + h.set_size(len as u64); + h.set_mode(0o644); + h.set_cksum(); + h + }; + + // Rename `agent:` → alpha and stamp `display-name:` with the marker (both single lines). + let manifest = tekla_manifest + .lines() + .map(|l| { + if l.starts_with("agent:") { + "agent: alpha".to_string() + } else if l.starts_with("display-name:") { + format!("display-name: {display_marker}") + } else { + l.to_string() + } + }) + .collect::<Vec<_>>() + .join("\n") + + "\n"; + tar.append_data( + &mut header(manifest.len()), + "aware-main/20-agents/alpha/manifest.yaml", + manifest.as_bytes(), + ) + .unwrap(); + for skill in &agent.skills { + let body = std::fs::read_to_string(skills_src.join(skill)).unwrap(); + tar.append_data( + &mut header(body.len()), + format!("aware-main/20-agents/alpha/skills/{skill}"), + body.as_bytes(), + ) + .unwrap(); + } + let mut file = tar.into_inner().unwrap().finish().unwrap(); + file.flush().unwrap(); + } + + /// A registry index carrying a single `alpha` agent backed by `url`. Used to prove + /// cache behavior across archive states that leave the index byte-identical (#270). 
+ fn single_alpha_index(url: &str) -> Index { + let mut versions = BTreeMap::new(); + versions.insert( + "1".to_string(), + VersionEntry { + tarball: url.to_string(), + subdir: "aware-main/20-agents/alpha".to_string(), + }, + ); + let mut agents = BTreeMap::new(); + agents.insert( + "alpha".to_string(), + IndexEntry { + versions, + ..Default::default() + }, + ); + Index { + version: "1.0".into(), + updated_at: "2026-06-25T00:00:00Z".into(), + agents, + bundles: BTreeMap::new(), + } + } + + #[test] + fn stale_tarball_cache_self_refreshes_after_ttl_even_when_index_unchanged() { + // #270: a manifest change made INSIDE the rolling `main.tar.gz` that leaves + // `registry-index.json` byte-identical does not rotate the snapshot fingerprint, + // so the fingerprint key alone can never bust the tarball cache. The cache has no + // per-agent re-download budget (that is the #243 optimization), so without a TTL + // the stale archive is served forever. This proves the tarball cache self-refreshes + // once it ages past CACHE_TTL — the same 1h lever the index/catalog already pull. + use crate::registry::fetch::CACHE_TTL; + use std::time::{Duration, SystemTime}; + + let tmp = tempfile::tempdir().unwrap(); + let aware = tmp.path().join("aware"); + let paths = Paths { + aware_home: aware.clone(), + }; + + // One archive file = the single mutable `main` archive URL over time. + let archive = tmp.path().join("main.tar.gz"); + let url = format!("file://{}", archive.display()); + + // An index that does NOT change between the two archive states → identical fingerprint. + let index = single_alpha_index(&url); + + let cache_file = paths + .cache_dir() + .join("agents") + .join(tarball_cache_name(&url, &index.snapshot_fingerprint())); + + // v1 of the archive: alpha carries marker DISPLAY-V1. Stage it → caches v1. 
+ write_alpha_archive(&archive, "DISPLAY-V1"); + let (_g1, sub1) = stage_agent_from_registry("alpha", None, &paths, &index).unwrap(); + assert!( + std::fs::read_to_string(sub1.join("manifest.yaml")) + .unwrap() + .contains("DISPLAY-V1") + ); + assert!(cache_file.is_file(), "the tarball was cached"); + + // The manifest inside the SAME archive flips, but the index is byte-identical. + write_alpha_archive(&archive, "DISPLAY-V2"); + + // Within TTL the warm cache is still served (bounded staleness, by design — the + // fingerprint key cannot see a change that lives inside the archive). + let (_g2, sub2) = stage_agent_from_registry("alpha", None, &paths, &index).unwrap(); + assert!( + std::fs::read_to_string(sub2.join("manifest.yaml")) + .unwrap() + .contains("DISPLAY-V1"), + "a fresh cache is reused — the fingerprint key cannot bust an in-archive change" + ); + + // Age the cache past the TTL → the next stage must re-pull the now-current archive. + let stale = SystemTime::now() + .checked_sub(CACHE_TTL + Duration::from_secs(60)) + .unwrap(); + std::fs::File::options() + .write(true) + .open(&cache_file) + .unwrap() + .set_modified(stale) + .unwrap(); + + let (_g3, sub3) = stage_agent_from_registry("alpha", None, &paths, &index).unwrap(); + assert!( + std::fs::read_to_string(sub3.join("manifest.yaml")) + .unwrap() + .contains("DISPLAY-V2"), + "#270: once the tarball cache ages past CACHE_TTL it self-refreshes despite an unchanged index" + ); + } + + #[test] + fn stale_tarball_cache_is_reused_when_refresh_fails_offline() { + // Codex review of #270: a TTL that SKIPS a present cache must not turn a + // transient-network / offline install into a hard failure. When the tarball is + // stale AND the source is unreachable, fall back to the stale cache (as + // `fetch_index` does for the index) rather than erroring — a stale install beats + // no install. 
+ use crate::registry::fetch::CACHE_TTL; + use std::time::{Duration, SystemTime}; + + let tmp = tempfile::tempdir().unwrap(); + let aware = tmp.path().join("aware"); + let paths = Paths { + aware_home: aware.clone(), + }; + let archive = tmp.path().join("main.tar.gz"); + let url = format!("file://{}", archive.display()); + let index = single_alpha_index(&url); + + // Warm the cache from the live source. + write_alpha_archive(&archive, "DISPLAY-V1"); + let (_g1, _s1) = stage_agent_from_registry("alpha", None, &paths, &index).unwrap(); + + let cache_file = paths + .cache_dir() + .join("agents") + .join(tarball_cache_name(&url, &index.snapshot_fingerprint())); + + // Age the cache past the TTL AND make the source unreachable (offline). + let stale = SystemTime::now() + .checked_sub(CACHE_TTL + Duration::from_secs(60)) + .unwrap(); + std::fs::File::options() + .write(true) + .open(&cache_file) + .unwrap() + .set_modified(stale) + .unwrap(); + std::fs::remove_file(&archive).unwrap(); + + // The install still succeeds, served from the stale-but-present cache. + let (_g2, s2) = stage_agent_from_registry("alpha", None, &paths, &index).unwrap(); + assert!( + std::fs::read_to_string(s2.join("manifest.yaml")) + .unwrap() + .contains("DISPLAY-V1"), + "offline + stale cache: fall back to the cache instead of failing the install" + ); + } + + #[test] + fn ttl_refresh_falls_back_to_prior_cache_when_archive_outran_the_index() { + // Codex review of #270: installs resolve against a CACHED index (1h TTL). A TTL + // tarball refresh can pull the rolling `main.tar.gz` at a snapshot that advanced PAST + // that cached index — to one where this agent's subdir moved — so the fresh archive no + // longer carries `entry.subdir`. That must NOT fail the install (the prior cache was + // consistent with the cached index) and must NOT poison the snapshot's cache file. 
+ use crate::registry::fetch::CACHE_TTL; + use std::time::{Duration, SystemTime}; + + let tmp = tempfile::tempdir().unwrap(); + let aware = tmp.path().join("aware"); + let paths = Paths { + aware_home: aware.clone(), + }; + let archive = tmp.path().join("main.tar.gz"); + let url = format!("file://{}", archive.display()); + // The cached index keeps alpha at its original subdir (fingerprint frozen). + let index = single_alpha_index(&url); + + // Warm the cache from an archive that carries alpha. + write_alpha_archive(&archive, "DISPLAY-V1"); + let (_g1, _s1) = stage_agent_from_registry("alpha", None, &paths, &index).unwrap(); + + let cache_file = paths + .cache_dir() + .join("agents") + .join(tarball_cache_name(&url, &index.snapshot_fingerprint())); + + // The rolling archive advances PAST the cached index: alpha's subdir is gone (only an + // unrelated `beta` remains). Age the tarball cache so a refresh is triggered. + write_repo_tarball(&archive, &["beta"]); + let stale = SystemTime::now() + .checked_sub(CACHE_TTL + Duration::from_secs(60)) + .unwrap(); + std::fs::File::options() + .write(true) + .open(&cache_file) + .unwrap() + .set_modified(stale) + .unwrap(); + + // The install still succeeds, served from the prior (index-consistent) cache. + let (_g2, s2) = stage_agent_from_registry("alpha", None, &paths, &index).unwrap(); + assert!( + std::fs::read_to_string(s2.join("manifest.yaml")) + .unwrap() + .contains("DISPLAY-V1"), + "archive outran the index: fall back to the prior cache, don't fail the install" + ); + + // The cache was NOT poisoned with the alpha-less archive — alpha is still installable. 
+ let (_g3, s3) = stage_agent_from_registry("alpha", None, &paths, &index).unwrap(); + assert!( + std::fs::read_to_string(s3.join("manifest.yaml")) + .unwrap() + .contains("DISPLAY-V1"), + "the raced archive must not overwrite the snapshot's good cache file" + ); + } + + #[test] + fn ttl_refresh_falls_back_to_prior_cache_when_refresh_is_corrupt() { + // Codex review of #270: a TTL refresh whose source returns a truncated/garbage body + // (a transient bad response, or a local archive caught mid-write) must not fail an + // install a prior cache could satisfy — an extraction error falls back to the cache + // just like a moved subdir does. + use crate::registry::fetch::CACHE_TTL; + use std::time::{Duration, SystemTime}; + + let tmp = tempfile::tempdir().unwrap(); + let aware = tmp.path().join("aware"); + let paths = Paths { + aware_home: aware.clone(), + }; + let archive = tmp.path().join("main.tar.gz"); + let url = format!("file://{}", archive.display()); + let index = single_alpha_index(&url); + + // Warm the cache from a good archive. + write_alpha_archive(&archive, "DISPLAY-V1"); + let (_g1, _s1) = stage_agent_from_registry("alpha", None, &paths, &index).unwrap(); + + let cache_file = paths + .cache_dir() + .join("agents") + .join(tarball_cache_name(&url, &index.snapshot_fingerprint())); + + // The source is now garbage (not a valid gzip stream); age the cache to force a refresh. + std::fs::write(&archive, b"not a gzip stream").unwrap(); + let stale = SystemTime::now() + .checked_sub(CACHE_TTL + Duration::from_secs(60)) + .unwrap(); + std::fs::File::options() + .write(true) + .open(&cache_file) + .unwrap() + .set_modified(stale) + .unwrap(); + + // The install still succeeds, served from the prior cache instead of erroring. 
+ let (_g2, s2) = stage_agent_from_registry("alpha", None, &paths, &index).unwrap(); + assert!( + std::fs::read_to_string(s2.join("manifest.yaml")) + .unwrap() + .contains("DISPLAY-V1"), + "corrupt refresh: fall back to the prior cache, don't fail the install" + ); + } + #[test] fn install_busts_cache_when_index_grows_even_with_frozen_updated_at() { // The #254 end-to-end regression. The shared `main` archive is a MUTABLE ref: diff --git a/cli/src/registry/fetch.rs b/cli/src/registry/fetch.rs index 461b53119..be679b2f5 100644 --- a/cli/src/registry/fetch.rs +++ b/cli/src/registry/fetch.rs @@ -16,8 +16,10 @@ pub const DEFAULT_REGISTRY_URL: &str = pub const DEFAULT_CATALOG_URL: &str = "https://raw.githubusercontent.com/aware-aeco/aware/main/registry-catalog.json"; -/// Cache TTL — 1 hour. Re-fetch happens after this expires. -const CACHE_TTL: Duration = Duration::from_secs(60 * 60); +/// Cache TTL — 1 hour. Re-fetch happens after this expires. Shared with the +/// install tarball cache (`install::registry`) so the index, catalog, and the +/// rolling `main.tar.gz` all carry the same 1h freshness lever (#270). +pub(crate) const CACHE_TTL: Duration = Duration::from_secs(60 * 60); pub fn registry_source() -> String { std::env::var("AWARE_REGISTRY").unwrap_or_else(|_| DEFAULT_REGISTRY_URL.to_string())