From 8459d6a6ece54c171a961e6e15d0fc54941a0548 Mon Sep 17 00:00:00 2001 From: "Jean M. Sexton" Date: Tue, 17 Feb 2026 07:49:00 -0800 Subject: [PATCH 1/3] Add container runner for local and HPC --- README.md | 26 ++ docs/container_runs.md | 38 ++ docs/mcp.md | 4 + mcp_server.py | 8 + src/config.py | 41 ++ src/nodes/runner_node.py | 13 +- src/services/run_container.py | 524 ++++++++++++++++++++++++ src/services/run_superfacility_tools.py | 10 + 8 files changed, 662 insertions(+), 2 deletions(-) create mode 100644 docs/container_runs.md create mode 100644 src/services/run_container.py diff --git a/README.md b/README.md index f832a81..d577762 100644 --- a/README.md +++ b/README.md @@ -74,6 +74,32 @@ Config override example: python amrex_agent.py --prompt "Run AMReX Advection_AmrCore with a 64x64 grid" --config demo/amrex/config.yaml ``` +### Container runs (local/NERSC/ALCF) + +Container runs are enabled when `container_image` (or `container_runtime`) is set in config. +Default image (when unset): `registry.nersc.gov/amsc014/superfacility/pele:latest`. +ALCF defaults to `docker://registry.nersc.gov/amsc014/superfacility/pele:latest` for Apptainer. + +Minimal examples (YAML config override): + +```yaml +environment: local +container_image: registry.nersc.gov/amsc014/superfacility/pele:latest +container_runtime: docker +``` + +```yaml +environment: perlmutter +container_image: registry.nersc.gov/amsc014/superfacility/pele:latest +container_runtime: podman-hpc +``` + +```yaml +environment: alcf +container_image: docker://registry.nersc.gov/amsc014/superfacility/pele:latest +container_runtime: apptainer +``` + ## Configuration knobs (examples) - `--indexing-strategy`: diff --git a/docs/container_runs.md b/docs/container_runs.md new file mode 100644 index 0000000..b0d0ece --- /dev/null +++ b/docs/container_runs.md @@ -0,0 +1,38 @@ +# Container Runs + +Container runs activate when `container_image` (or `container_runtime`) is set. +Defaults: +- Image: `registry.nersc.gov/amsc014/superfacility/pele:latest` +- ALCF image: `docker://registry.nersc.gov/amsc014/superfacility/pele:latest` + +## Local (Docker) + +Example config override: + +```yaml +environment: local +container_image: registry.nersc.gov/amsc014/superfacility/pele:latest +container_runtime: docker +``` + +## NERSC Perlmutter (podman-hpc) + +Example config override: + +```yaml +environment: perlmutter +container_image: registry.nersc.gov/amsc014/superfacility/pele:latest +container_runtime: podman-hpc +``` + +Reference: https://docs.nersc.gov/development/containers/podman-hpc/overview/ + +## ALCF (Apptainer) + +Example config override: + +```yaml +environment: alcf +container_image: docker://registry.nersc.gov/amsc014/superfacility/pele:latest +container_runtime: apptainer +``` diff --git a/docs/mcp.md b/docs/mcp.md index 4b17e94..f9d0f6c 100644 --- a/docs/mcp.md +++ b/docs/mcp.md @@ -36,6 +36,10 @@ The MCP server exposes these tools: Use `tools/list` to discover schemas and required parameters. +## Container runs + +Container usage examples live in `docs/container_runs.md`. + ## Example: apply_plan Apply a previously generated plan to write inputs: diff --git a/mcp_server.py b/mcp_server.py index b67e63d..a2f431f 100755 --- a/mcp_server.py +++ b/mcp_server.py @@ -48,6 +48,7 @@ from src.services.input_writer import InputWriterService from src.services.knowledge import PeleKnowledgeService from src.services.plan import SimulationPlan + from src.services.run_container import ContainerRunner from src.services.run_local import LocalRunner from src.services.run_superfacility import SuperfacilityRunner from src.services.validation import ValidationService @@ -62,6 +63,7 @@ from src.services.input_writer import InputWriterService from src.services.knowledge import PeleKnowledgeService from src.services.plan import SimulationPlan + from src.services.run_container import ContainerRunner from src.services.run_local import LocalRunner from src.services.run_superfacility import SuperfacilityRunner from src.services.validation import ValidationService @@ -130,6 +132,12 @@ def _persist_session_context(session_id: str, context: dict[str, Any]) -> None: def _select_runner(active_config: AMReXAgentConfig): """Select execution runner based on config environment.""" environment = (active_config.environment or "").lower() + container_image = getattr(active_config, "container_image", None) + container_runtime = getattr(active_config, "container_runtime", None) + has_container_image = isinstance(container_image, (str, Path)) and str(container_image).strip() + has_container_runtime = isinstance(container_runtime, str) and container_runtime.strip() + if has_container_image or has_container_runtime: + return ContainerRunner(active_config) if environment == "local": return LocalRunner(active_config) if environment in {"perlmutter", "mcp"}: diff --git a/src/config.py b/src/config.py index b847a50..278dbad 100644 --- a/src/config.py +++ b/src/config.py @@ -533,6 +533,47 @@ class AMReXAgentConfig(BaseModel): "Auto-detected from PODMAN_HPC or SHIFTER env variables. " "When True, visualization uses extraction (headless) + rendering (local) workflow." ) + container_runtime: Optional[str] = Field( + default=None, + description="Container runtime override (docker, podman, podman-hpc, apptainer, singularity). " + "When unset, inferred from environment." + ) + container_image: Optional[str] = Field( + default=None, + description="Container image reference (e.g., docker image or apptainer .sif or docker:// ref)." + ) + container_entrypoint: Optional[str] = Field( + default=None, + description="Container entrypoint or executable path to run (default: /usr/local/bin/run_pelelmex)." + ) + container_workdir: str = Field( + default="/work", + description="Workdir inside the container for mounted run directory." + ) + container_inputs_name: str = Field( + default="inputs", + description="Inputs filename passed to the container entrypoint." + ) + container_extra_args: Optional[List[str]] = Field( + default_factory=list, + description="Extra runtime args passed to the container (e.g., --env, --network)." + ) + container_account: Optional[str] = Field( + default=None, + description="Slurm account for container jobs (used for non-NERSC slurm submission)." + ) + container_qos: Optional[str] = Field( + default=None, + description="Slurm QoS for container jobs (overrides default when set)." + ) + container_constraint: Optional[str] = Field( + default=None, + description="Slurm constraint for container jobs (overrides default when set)." + ) + container_walltime: Optional[str] = Field( + default=None, + description="Slurm walltime for container jobs (overrides default when set)." + ) analysis_always_enabled: bool = Field( default=True, diff --git a/src/nodes/runner_node.py b/src/nodes/runner_node.py index 97b8cde..f2dfc83 100644 --- a/src/nodes/runner_node.py +++ b/src/nodes/runner_node.py @@ -154,8 +154,17 @@ def runner_node(state: GraphState) -> dict[str, Any]: if compile_selection.get("value") == "compile_only": run_after_compile = False - # Select runner based on environment - if config.environment == "local": + # Select runner based on environment or container settings + container_image = getattr(config, "container_image", None) + container_runtime = getattr(config, "container_runtime", None) + has_container_image = isinstance(container_image, (str, Path)) and str(container_image).strip() + has_container_runtime = isinstance(container_runtime, str) and container_runtime.strip() + + if has_container_image or has_container_runtime: + from src.services.run_container import ContainerRunner + runner = ContainerRunner(config) + logger.info("Using ContainerRunner for containerized execution") + elif config.environment == "local": from src.services.run_local import LocalRunner runner = LocalRunner(config) logger.info("Using LocalRunner for local execution") diff --git a/src/services/run_container.py b/src/services/run_container.py new file mode 100644 index 0000000..30cf5a5 --- /dev/null +++ b/src/services/run_container.py @@ -0,0 +1,524 @@ +""" +Container execution service for local/HPC runs. + +Supports local Docker/Podman, NERSC podman-hpc, and ALCF Apptainer. +""" +import logging +import os +import shlex +import subprocess +from datetime import datetime +from pathlib import Path +from typing import Any + +from amrex_tools import copy_to_rundir, setup_run_directory + +from src.services.run_superfacility_tools import ( + _load_sfapi_key_file, + ensure_remote_directory_rest, + resolve_remote_output_dir, + stage_run_directory, + submit_job, +) + +logger = logging.getLogger(__name__) + + +class ContainerRunner: + """Run AMReX simulations inside a container runtime.""" + + def __init__(self, config): + self.config = config + logger.debug("ContainerRunner initialized") + + def setup_job( + self, + inputs_path: str | Path | None = None, + case_dir: str | Path | None = None, + executable_path: str | None = None, + base_name: str | None = None, + output_dir: str | Path | None = None, + ) -> dict[str, Any]: + """ + Set up run directory without compiling or copying executables. + + Container images are expected to include the executable/entrypoint. + """ + if base_name is None: + default_solver = self.config.default_solver + if not default_solver: + raise ValueError("No default solver configured for run directory naming") + base_name = default_solver.lower() + + if output_dir is None: + output_dir = self.config.output_dir + + output_path = Path(output_dir) + if output_path.name.startswith("run_"): + run_dir = str(output_path) + logger.debug("Using existing run directory: %s", run_dir) + else: + run_dir = setup_run_directory.invoke({ + "base_name": base_name, + "base_dir": str(output_dir) if output_dir else None, + }) + logger.debug("Run directory: %s", run_dir) + + run_dir_path = Path(run_dir) + + if inputs_path: + files = copy_to_rundir.invoke({ + "run_dir": run_dir, + "executable_path": "", + "inputs_path": str(inputs_path), + }) + elif case_dir: + files = copy_to_rundir.invoke({ + "run_dir": run_dir, + "executable_path": "", + "inputs_dir": str(case_dir), + }) + else: + raise ValueError("Must provide either inputs_path or case_dir") + + logger.debug("[ OK ] Copied %d files to run directory", len(files)) + + return { + "run_dir": str(run_dir_path), + "executable": self._resolve_container_entrypoint(), + "inputs": files.get("inputs"), + "files": files, + } + + def submit( + self, + run_directory: str | Path, + nodes: int = 1, + walltime: str = "00:10:00", + account: str | None = None, + qos: str = "regular", + constraint: str = "gpu&hbm40g", + system: str = "perlmutter", + dry_run: bool = False, + run_mode: str | None = None, + case_dir: str | Path | None = None, + ) -> dict[str, Any]: + """ + Submit or run a containerized job (local, NERSC, or ALCF). + """ + logger.debug("\n=== Submitting Container Job ===\n") + run_dir = Path(run_directory) + + effective_mode = run_mode or ("dry" if dry_run else "full") + runtime = self._resolve_runtime() + if account is None: + account = getattr(self.config, "container_account", None) + if getattr(self.config, "container_qos", None): + qos = getattr(self.config, "container_qos") + if getattr(self.config, "container_constraint", None): + constraint = getattr(self.config, "container_constraint") + if getattr(self.config, "container_walltime", None): + walltime = getattr(self.config, "container_walltime") + container_cmd = self._build_container_command(run_dir, runtime) + + if self._is_local_runtime(runtime): + return self._run_local_container(run_dir, container_cmd, effective_mode) + + if self._is_nersc_runtime(runtime): + return self._submit_nersc_container( + run_dir=run_dir, + container_cmd=container_cmd, + nodes=nodes, + walltime=walltime, + account=account, + qos=qos, + constraint=constraint, + system=system, + effective_mode=effective_mode, + ) + + return self._submit_slurm_container( + run_dir=run_dir, + container_cmd=container_cmd, + nodes=nodes, + walltime=walltime, + account=account, + qos=qos, + constraint=constraint, + effective_mode=effective_mode, + ) + + def _resolve_runtime(self) -> str: + runtime = getattr(self.config, "container_runtime", None) + if runtime: + return runtime.lower() + environment = (getattr(self.config, "environment", "") or "").lower() + if environment in {"perlmutter", "nersc"}: + return "podman-hpc" + if environment == "alcf": + return "apptainer" + return "docker" + + def _resolve_container_image(self) -> str: + image = getattr(self.config, "container_image", None) + if image: + return str(image) + environment = (getattr(self.config, "environment", "") or "").lower() + default_ref = "registry.nersc.gov/amsc014/superfacility/pele:latest" + if environment == "alcf": + return f"docker://{default_ref}" + return default_ref + + def _resolve_container_entrypoint(self) -> str: + entrypoint = getattr(self.config, "container_entrypoint", None) + if entrypoint: + return str(entrypoint) + return "/usr/local/bin/run_pelelmex" + + def _build_container_command(self, run_dir: Path, runtime: str) -> list[str]: + image = self._resolve_container_image() + workdir = getattr(self.config, "container_workdir", "/work") + entrypoint = self._resolve_container_entrypoint() + inputs_name = getattr(self.config, "container_inputs_name", "inputs") + extra_args = getattr(self.config, "container_extra_args", []) or [] + use_cuda = bool(getattr(self.config, "use_cuda", True)) + + if runtime in {"docker", "podman", "podman-hpc"}: + cmd = [runtime, "run", "--rm"] + if use_cuda: + if runtime == "podman-hpc": + cmd.append("--gpu") + else: + cmd += ["--gpus", "all"] + cmd += ["-v", f"{run_dir}:{workdir}", "-w", workdir] + cmd += list(extra_args) + cmd += [image, entrypoint, inputs_name] + return cmd + + if runtime in {"apptainer", "singularity"}: + cmd = [runtime, "exec"] + if use_cuda: + cmd.append("--nv") + cmd += ["--bind", f"{run_dir}:{workdir}", "--pwd", workdir] + cmd += list(extra_args) + cmd += [image, entrypoint, inputs_name] + return cmd + + raise ValueError(f"Unsupported container runtime: {runtime}") + + def _run_local_container( + self, + run_dir: Path, + container_cmd: list[str], + effective_mode: str, + ) -> dict[str, Any]: + script_path = run_dir / "run_container.sh" + script_path.write_text(self._format_shell_script(container_cmd, run_dir)) + script_path.chmod(0o755) + logger.debug("Generated local container script: %s", script_path.name) + + if effective_mode in {"dry", "stage"}: + return { + "script_path": str(script_path), + "run_dir": str(run_dir), + "method": "dry_run", + "submitted": False, + "job_status": "completed", + } + + logger.info("Executing container locally: %s", " ".join(container_cmd)) + with open(run_dir / "stdout.log", "w") as stdout_file, open( + run_dir / "stderr.log", "w" + ) as stderr_file: + proc = subprocess.Popen( + container_cmd, + cwd=run_dir, + stdout=stdout_file, + stderr=stderr_file, + ) + return_code = proc.wait() + + job_status = "completed" if return_code == 0 else "failed" + return { + "job_id": str(proc.pid), + "method": "local_container", + "run_dir": str(run_dir), + "script_path": str(script_path), + "params": {"nodes": 1, "walltime": "n/a"}, + "job_status": job_status, + "exit_code": return_code, + } + + def _submit_nersc_container( + self, + run_dir: Path, + container_cmd: list[str], + nodes: int, + walltime: str, + account: str | None, + qos: str, + constraint: str, + system: str, + effective_mode: str, + ) -> dict[str, Any]: + if account is None: + account = self.config.superfacility_account or "amsc014" + account = os.path.expandvars(str(account)) + + params = { + "nodes": nodes, + "walltime": walltime, + "account": account, + "qos": qos, + "constraint": constraint, + "use_srun": True, + } + + try: + if hasattr(self.config, "should_stage_run"): + remote_staging = self.config.should_stage_run() + else: + from src.config import detect_environment, should_stage_run + remote_staging = should_stage_run( + getattr(self.config, "environment", None), + detect_environment(), + ) + except Exception: + remote_staging = False + + remote_run_dir = None + if remote_staging: + preferred_output_dir = getattr(self.config, "remote_output_dir", None) + if preferred_output_dir is None: + preferred_output_dir = self.config.output_dir + logger.warning( + "[Config] remote_output_dir not set; defaulting staging target to %s", + preferred_output_dir, + ) + remote_output_dir = resolve_remote_output_dir( + preferred_output_dir=preferred_output_dir, + account=account, + user=os.getenv("USER"), + system=system, + ) + fixed_remote_run_dir = getattr(self.config, "remote_run_dir", None) + if fixed_remote_run_dir: + remote_run_dir = Path(os.path.expandvars(str(fixed_remote_run_dir))) + ensure_remote_directory_rest( + remote_run_dir=str(remote_run_dir), + upload_host=system, + ) + else: + remote_run_dir = Path(remote_output_dir) / run_dir.name + + run_dir_for_script = remote_run_dir if remote_run_dir else run_dir + script_content = self._generate_slurm_script( + params=params, + run_dir=str(run_dir_for_script), + container_cmd=container_cmd, + ) + + script_path = run_dir / "submit.sh" + script_path.write_text(script_content) + script_path.chmod(0o755) + logger.debug("Generated submit script: %s", script_path.name) + + if effective_mode == "dry": + return { + "script_path": str(script_path), + "run_dir": str(run_dir), + "method": "dry_run", + "submitted": False, + "job_status": "completed", + } + + if remote_staging and effective_mode in {"stage", "submit", "full"}: + cfg = self.config.model_dump() if hasattr(self.config, "model_dump") else {} + client_id = cfg.get("superfacility_client_id") + secret = cfg.get("superfacility_secret") + if not client_id or not secret: + parsed = _load_sfapi_key_file() + if parsed: + client_id, secret = parsed + staging_method = getattr(self.config, "remote_staging_method", "auto") + stage_run_directory( + local_run_dir=run_dir, + remote_run_dir=str(remote_run_dir), + client_id=client_id, + secret=secret, + method=staging_method, + ) + script_path = remote_run_dir / "submit.sh" + + if effective_mode == "stage": + return { + "script_path": str(script_path), + "run_dir": str(run_dir), + "method": "stage_only", + "submitted": False, + "job_status": "completed", + } + + job_id, method = submit_job( + script_path=str(script_path), + system=system, + ) + + return { + "job_id": job_id, + "method": method or "superfacility", + "run_dir": str(run_dir), + "remote_run_dir": str(remote_run_dir) if remote_run_dir else None, + "script_path": str(script_path), + "params": params, + "job_status": "submitted", + } + + def _submit_slurm_container( + self, + run_dir: Path, + container_cmd: list[str], + nodes: int, + walltime: str, + account: str | None, + qos: str, + constraint: str, + effective_mode: str, + ) -> dict[str, Any]: + params = { + "nodes": nodes, + "walltime": walltime, + "account": account, + "qos": qos, + "constraint": constraint, + "use_srun": True, + } + + script_content = self._generate_slurm_script( + params=params, + run_dir=str(run_dir), + container_cmd=container_cmd, + ) + + script_path = run_dir / "submit.sh" + script_path.write_text(script_content) + script_path.chmod(0o755) + + if effective_mode in {"dry", "stage"}: + return { + "script_path": str(script_path), + "run_dir": str(run_dir), + "method": "dry_run", + "submitted": False, + "job_status": "completed", + } + + try: + result = subprocess.run( + ["sbatch", str(script_path)], + cwd=run_dir, + capture_output=True, + text=True, + check=True, + ) + job_id = self._parse_sbatch_job_id(result.stdout) + return { + "job_id": job_id or "unknown", + "method": "sbatch", + "run_dir": str(run_dir), + "script_path": str(script_path), + "params": params, + "job_status": "submitted", + } + except subprocess.CalledProcessError as exc: + logger.error("sbatch submission failed: %s", exc.stderr) + return { + "job_id": None, + "method": "sbatch", + "run_dir": str(run_dir), + "script_path": str(script_path), + "params": params, + "job_status": "failed", + "error": exc.stderr, + } + + def _generate_slurm_script( + self, + params: dict[str, Any], + run_dir: str, + container_cmd: list[str], + ) -> str: + nodes = params.get("nodes", 1) + walltime = params.get("walltime", "00:10:00") + account = params.get("account") + qos = params.get("qos") + constraint = params.get("constraint") + use_srun = params.get("use_srun", True) + ntasks = max(1, int(nodes)) + run_dir_name = Path(run_dir).name + + sbatch_lines = [ + "#!/bin/bash", + f"#SBATCH --nodes={nodes}", + f"#SBATCH --time={walltime}", + f"#SBATCH --job-name=amrex_{run_dir_name}", + "#SBATCH --output=job_stdout.log", + "#SBATCH --error=job_stderr.log", + ] + if account: + sbatch_lines.append(f"#SBATCH --account={account}") + if qos: + sbatch_lines.append(f"#SBATCH --qos={qos}") + if constraint: + sbatch_lines.append(f"#SBATCH --constraint={constraint}") + if getattr(self.config, "use_cuda", True): + sbatch_lines.append("#SBATCH --gpus-per-task=1") + sbatch_lines.append("#SBATCH --gpu-bind=none") + + command_str = self._format_command(container_cmd) + if use_srun: + command_str = f"srun -n {ntasks} {command_str}" + + return "\n".join(sbatch_lines) + f""" + +echo "========================================" +echo "AMReX Container Run" +echo "Run: {run_dir_name}" +echo "Nodes: {nodes} | Tasks: {ntasks}" +echo "========================================" +echo "" + +cd {Path(run_dir).absolute()} +echo "Starting at $(date)" +{command_str} > stdout.log 2> stderr.log + +echo "" +echo "Finished at $(date)" +echo "" +ls -lh stdout.log stderr.log job_stdout.log job_stderr.log plt* 2>/dev/null || echo " (no plotfiles yet)" +""" + + def _format_shell_script(self, cmd: list[str], run_dir: Path) -> str: + return "\n".join([ + "#!/bin/bash", + "# Local container execution script", + f"# Generated: {datetime.utcnow().isoformat()}Z", + f"cd {run_dir}", + self._format_command(cmd), + "", + ]) + + def _format_command(self, cmd: list[str]) -> str: + return " ".join(shlex.quote(part) for part in cmd) + + def _is_local_runtime(self, runtime: str) -> bool: + return runtime in {"docker", "podman"} + + def _is_nersc_runtime(self, runtime: str) -> bool: + return runtime == "podman-hpc" + + def _parse_sbatch_job_id(self, stdout: str) -> str | None: + for token in stdout.split(): + if token.isdigit(): + return token + return None diff --git a/src/services/run_superfacility_tools.py b/src/services/run_superfacility_tools.py index dbb1ed7..64469f0 100644 --- a/src/services/run_superfacility_tools.py +++ b/src/services/run_superfacility_tools.py @@ -1103,6 +1103,16 @@ def resolve_remote_output_dir( try: list_remote_entries(str(candidate), nersc_session=nersc_session, system=system) except Exception as exc: + shared_root = Path(f"/global/cfs/cdirs/{account}/superfacility") + user_root = ( + Path(f"/global/cfs/cdirs/{account}/{user}/superfacility") if user else None + ) + is_shared_candidate = str(candidate).startswith(str(shared_root)) + if user_root and str(candidate).startswith(str(user_root)): + is_shared_candidate = False + if is_shared_candidate: + logger.debug("Shared output dir missing or unreadable: %s", candidate) + continue if create_budget > 0: try: ensure_remote_directory_rest( From f8a91c1627c2c535755b29a97b2016255874502e Mon Sep 17 00:00:00 2001 From: "Jean M. Sexton" Date: Tue, 17 Feb 2026 07:50:58 -0800 Subject: [PATCH 2/3] Fix container runner selection and staging --- src/services/run_superfacility_tools.py | 16 ++++++++-------- 1 file changed, 8 insertions(+), 8 deletions(-) diff --git a/src/services/run_superfacility_tools.py b/src/services/run_superfacility_tools.py index 64469f0..0183135 100644 --- a/src/services/run_superfacility_tools.py +++ b/src/services/run_superfacility_tools.py @@ -1098,18 +1098,18 @@ def resolve_remote_output_dir( if candidate not in candidates: candidates.append(candidate) + shared_root = Path(f"/global/cfs/cdirs/{account}/superfacility") + user_root = ( + Path(f"/global/cfs/cdirs/{account}/{user}/superfacility") if user else None + ) create_budget = 2 for candidate in candidates: + is_shared_candidate = str(candidate).startswith(str(shared_root)) + if user_root and str(candidate).startswith(str(user_root)): + is_shared_candidate = False try: list_remote_entries(str(candidate), nersc_session=nersc_session, system=system) except Exception as exc: - shared_root = Path(f"/global/cfs/cdirs/{account}/superfacility") - user_root = ( - Path(f"/global/cfs/cdirs/{account}/{user}/superfacility") if user else None - ) - is_shared_candidate = str(candidate).startswith(str(shared_root)) - if user_root and str(candidate).startswith(str(user_root)): - is_shared_candidate = False if is_shared_candidate: logger.debug("Shared output dir missing or unreadable: %s", candidate) continue @@ -1128,7 +1128,7 @@ def resolve_remote_output_dir( else: logger.debug("Remote output dir check failed for %s: %s", candidate, exc) continue - if sfapi_available and not nersc_session: + if sfapi_available and not nersc_session and is_shared_candidate: logger.debug("Using SFAPI credentials for %s; skipping REST mkdir check", candidate) logger.info("Using remote output dir: %s", candidate) return candidate From dac75bd1fcbca6549a9f337b656a21ec097e0b88 Mon Sep 17 00:00:00 2001 From: "Jean M. Sexton" Date: Tue, 17 Feb 2026 07:52:00 -0800 Subject: [PATCH 3/3] Drop README container snippet --- README.md | 26 -------------------------- 1 file changed, 26 deletions(-) diff --git a/README.md b/README.md index d577762..f832a81 100644 --- a/README.md +++ b/README.md @@ -74,32 +74,6 @@ Config override example: python amrex_agent.py --prompt "Run AMReX Advection_AmrCore with a 64x64 grid" --config demo/amrex/config.yaml ``` -### Container runs (local/NERSC/ALCF) - -Container runs are enabled when `container_image` (or `container_runtime`) is set in config. -Default image (when unset): `registry.nersc.gov/amsc014/superfacility/pele:latest`. -ALCF defaults to `docker://registry.nersc.gov/amsc014/superfacility/pele:latest` for Apptainer. - -Minimal examples (YAML config override): - -```yaml -environment: local -container_image: registry.nersc.gov/amsc014/superfacility/pele:latest -container_runtime: docker -``` - -```yaml -environment: perlmutter -container_image: registry.nersc.gov/amsc014/superfacility/pele:latest -container_runtime: podman-hpc -``` - -```yaml -environment: alcf -container_image: docker://registry.nersc.gov/amsc014/superfacility/pele:latest -container_runtime: apptainer -``` - ## Configuration knobs (examples) - `--indexing-strategy`: