From 1012aa0e83a1de7966cdddb77773d8f5486b2636 Mon Sep 17 00:00:00 2001 From: Alex Manley Date: Tue, 27 Jan 2026 13:46:40 -0800 Subject: [PATCH 1/3] agent hyperparam interface --- src/cloudai/cli/handlers.py | 51 +++++++++++++++++-- src/cloudai/models/agent_config.py | 79 ++++++++++++++++++++++++++++++ src/cloudai/models/workload.py | 3 +- 3 files changed, 129 insertions(+), 4 deletions(-) create mode 100644 src/cloudai/models/agent_config.py diff --git a/src/cloudai/cli/handlers.py b/src/cloudai/cli/handlers.py index d474ff421..df381c791 100644 --- a/src/cloudai/cli/handlers.py +++ b/src/cloudai/cli/handlers.py @@ -1,5 +1,5 @@ # SPDX-FileCopyrightText: NVIDIA CORPORATION & AFFILIATES -# Copyright (c) 2024-2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# Copyright (c) 2024-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved. # SPDX-License-Identifier: Apache-2.0 # # Licensed under the Apache License, Version 2.0 (the "License"); @@ -21,11 +21,12 @@ import signal from contextlib import contextmanager from pathlib import Path -from typing import Callable, List, Optional +from typing import Any, Callable, List, Optional from unittest.mock import Mock import toml import yaml +from pydantic import ValidationError from cloudai.core import ( BaseInstaller, @@ -40,6 +41,11 @@ TestParser, TestScenario, ) +from cloudai.models.agent_config import ( + BayesianOptimizationConfig, + GeneticAlgorithmConfig, + MultiArmedBanditConfig, +) from cloudai.models.scenario import ReportConfig from cloudai.models.workload import TestDefinition from cloudai.parser import HOOK_ROOT @@ -145,7 +151,19 @@ def handle_dse_job(runner: Runner, args: argparse.Namespace) -> int: continue env = CloudAIGymEnv(test_run=test_run, runner=runner.runner) - agent = agent_class(env) + + try: + agent_overrides = validate_agent_overrides(agent_type, test_run.test.agent_config) + except ValidationError as e: + logging.error(f"Invalid agent_config for agent '{agent_type}':") + for error in e.errors(): + field = ".".join(str(loc) for loc in error["loc"]) + logging.error(f" - {field}: {error['msg']}") + err = 1 + continue + + agent = agent_class(env, **agent_overrides) + for step in range(agent.max_steps): result = agent.select_action() if result is None: @@ -166,6 +184,33 @@ def handle_dse_job(runner: Runner, args: argparse.Namespace) -> int: return err +def validate_agent_overrides(agent_type: str, agent_config: Optional[dict[str, Any]]) -> dict[str, Any]: + """ + Validate and process agent configuration overrides. + """ + if not agent_config: + return {} + + config_class_map = { + "ga": GeneticAlgorithmConfig, + "bo": BayesianOptimizationConfig, + "mab": MultiArmedBanditConfig, + } + + config_class = config_class_map.get(agent_type) + if not config_class: + logging.debug(f"No config validation available for agent type '{agent_type}', using defaults.") + return {} + + validated_config = config_class.model_validate(agent_config) + agent_kwargs = validated_config.model_dump(exclude_none=True) + + if agent_kwargs: + logging.info(f"Applying agent config overrides for '{agent_type}': {agent_kwargs}") + + return agent_kwargs + + def generate_reports(system: System, test_scenario: TestScenario, result_dir: Path) -> None: registry = Registry() diff --git a/src/cloudai/models/agent_config.py b/src/cloudai/models/agent_config.py new file mode 100644 index 000000000..6688baaa2 --- /dev/null +++ b/src/cloudai/models/agent_config.py @@ -0,0 +1,79 @@ +# SPDX-FileCopyrightText: NVIDIA CORPORATION & AFFILIATES +# Copyright (c) 2024-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# SPDX-License-Identifier: Apache-2.0 +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from abc import ABC +from typing import Any, Optional + +from pydantic import BaseModel, ConfigDict, Field + + +class AgentConfig(BaseModel, ABC): + """ + Base configuration for agent overrides. + """ + + model_config = ConfigDict(extra="forbid") + random_seed: Optional[int] = Field(default=None, description="Random seed for reproducibility") + +class GeneticAlgorithmConfig(AgentConfig): + """ + Configuration overrides for Genetic Algorithm agent. + """ + + population_size: Optional[int] = Field(default=None, ge=2, description="Population size for the genetic algorithm") + n_offsprings: Optional[int] = Field(default=None, ge=1, description="Number of offsprings per generation") + crossover_prob: Optional[float] = Field(default=None, ge=0.0, le=1.0, description="Crossover probability") + mutation_prob: Optional[float] = Field(default=None, ge=0.0, le=1.0, description="Mutation probability") + + +class BayesianOptimizationConfig(AgentConfig): + """ + Configuration overrides for Bayesian Optimization agent. + """ + + sobol_num_trials: Optional[int] = Field(default=None, ge=1, description="Number of SOBOL initialization trials") + botorch_num_trials: Optional[int] = Field( + default=None, description="Number of BoTorch trials (-1 for unlimited until max_steps)" + ) + +class MultiArmedBanditConfig(AgentConfig): + """ + Configuration overrides for Multi-Armed Bandit agent. + """ + + algorithm: Optional[str] = Field( + default=None, + description="MAB algorithm: ucb1, ts (thompson_sampling), epsilon_greedy, softmax, or random", + ) + algorithm_params: Optional[dict[str, Any]] = Field( + default=None, description="Algorithm-specific parameters (e.g., alpha for UCB1, epsilon for epsilon_greedy)" + ) + seed_parameters: Optional[dict[str, Any]] = Field( + default=None, description="Initial seed configuration to evaluate first" + ) + max_arms: Optional[int] = Field(default=None, ge=1, description="Maximum number of arms in the action space") + warm_start_size: Optional[int] = Field( + default=None, ge=0, description="Number of arms to randomly explore initially" + ) + epsilon_override: Optional[float] = Field( + default=None, ge=0.0, le=1.0, description="Epsilon value for exploration (overrides algorithm epsilon)" + ) + max_explore_steps: Optional[int] = Field( + default=None, ge=0, description="Maximum steps for epsilon exploration (None for unlimited)" + ) + prefer_unseen_random: Optional[bool] = Field( + default=None, description="Prefer unseen arms during random exploration (epsilon)" + ) diff --git a/src/cloudai/models/workload.py b/src/cloudai/models/workload.py index 1745ae734..0a962cf59 100644 --- a/src/cloudai/models/workload.py +++ b/src/cloudai/models/workload.py @@ -1,5 +1,5 @@ # SPDX-FileCopyrightText: NVIDIA CORPORATION & AFFILIATES -# Copyright (c) 2024-2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# Copyright (c) 2024-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved. # SPDX-License-Identifier: Apache-2.0 # # Licensed under the Apache License, Version 2.0 (the "License"); @@ -107,6 +107,7 @@ class TestDefinition(BaseModel, ABC): agent_steps: int = 1 agent_metrics: list[str] = Field(default=["default"]) agent_reward_function: str = "inverse" + agent_config: Optional[dict[str, Any]] = None @property def cmd_args_dict(self) -> Dict[str, Union[str, List[str]]]: From bd551e49cf05d2ce4cb9ae712c45e28bea1e4abf Mon Sep 17 00:00:00 2001 From: Alex Manley Date: Tue, 27 Jan 2026 13:50:13 -0800 Subject: [PATCH 2/3] fix formatting --- src/cloudai/cli/handlers.py | 14 ++++++-------- src/cloudai/models/agent_config.py | 18 ++++++------------ 2 files changed, 12 insertions(+), 20 deletions(-) diff --git a/src/cloudai/cli/handlers.py b/src/cloudai/cli/handlers.py index df381c791..674013f5e 100644 --- a/src/cloudai/cli/handlers.py +++ b/src/cloudai/cli/handlers.py @@ -151,7 +151,7 @@ def handle_dse_job(runner: Runner, args: argparse.Namespace) -> int: continue env = CloudAIGymEnv(test_run=test_run, runner=runner.runner) - + try: agent_overrides = validate_agent_overrides(agent_type, test_run.test.agent_config) except ValidationError as e: @@ -161,7 +161,7 @@ def handle_dse_job(runner: Runner, args: argparse.Namespace) -> int: logging.error(f" - {field}: {error['msg']}") err = 1 continue - + agent = agent_class(env, **agent_overrides) for step in range(agent.max_steps): @@ -185,9 +185,7 @@ def handle_dse_job(runner: Runner, args: argparse.Namespace) -> int: def validate_agent_overrides(agent_type: str, agent_config: Optional[dict[str, Any]]) -> dict[str, Any]: - """ - Validate and process agent configuration overrides. - """ + """Validate and process agent configuration overrides.""" if not agent_config: return {} @@ -196,7 +194,7 @@ def validate_agent_overrides(agent_type: str, agent_config: Optional[dict[str, A "bo": BayesianOptimizationConfig, "mab": MultiArmedBanditConfig, } - + config_class = config_class_map.get(agent_type) if not config_class: logging.debug(f"No config validation available for agent type '{agent_type}', using defaults.") @@ -204,10 +202,10 @@ def validate_agent_overrides(agent_type: str, agent_config: Optional[dict[str, A validated_config = config_class.model_validate(agent_config) agent_kwargs = validated_config.model_dump(exclude_none=True) - + if agent_kwargs: logging.info(f"Applying agent config overrides for '{agent_type}': {agent_kwargs}") - + return agent_kwargs diff --git a/src/cloudai/models/agent_config.py b/src/cloudai/models/agent_config.py index 6688baaa2..3e090a622 100644 --- a/src/cloudai/models/agent_config.py +++ b/src/cloudai/models/agent_config.py @@ -21,17 +21,14 @@ class AgentConfig(BaseModel, ABC): - """ - Base configuration for agent overrides. - """ + """Base configuration for agent overrides.""" model_config = ConfigDict(extra="forbid") random_seed: Optional[int] = Field(default=None, description="Random seed for reproducibility") + class GeneticAlgorithmConfig(AgentConfig): - """ - Configuration overrides for Genetic Algorithm agent. - """ + """Configuration overrides for Genetic Algorithm agent.""" population_size: Optional[int] = Field(default=None, ge=2, description="Population size for the genetic algorithm") n_offsprings: Optional[int] = Field(default=None, ge=1, description="Number of offsprings per generation") @@ -40,19 +37,16 @@ class GeneticAlgorithmConfig(AgentConfig): class BayesianOptimizationConfig(AgentConfig): - """ - Configuration overrides for Bayesian Optimization agent. - """ + """Configuration overrides for Bayesian Optimization agent.""" sobol_num_trials: Optional[int] = Field(default=None, ge=1, description="Number of SOBOL initialization trials") botorch_num_trials: Optional[int] = Field( default=None, description="Number of BoTorch trials (-1 for unlimited until max_steps)" ) + class MultiArmedBanditConfig(AgentConfig): - """ - Configuration overrides for Multi-Armed Bandit agent. - """ + """Configuration overrides for Multi-Armed Bandit agent.""" algorithm: Optional[str] = Field( default=None, From c1ecf2563120941d83c53b9d96150b211b39cdeb Mon Sep 17 00:00:00 2001 From: Alex Manley Date: Tue, 27 Jan 2026 14:11:34 -0800 Subject: [PATCH 3/3] default pass no kwargs --- src/cloudai/cli/handlers.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/cloudai/cli/handlers.py b/src/cloudai/cli/handlers.py index 674013f5e..c8ff8a74f 100644 --- a/src/cloudai/cli/handlers.py +++ b/src/cloudai/cli/handlers.py @@ -162,7 +162,7 @@ def handle_dse_job(runner: Runner, args: argparse.Namespace) -> int: err = 1 continue - agent = agent_class(env, **agent_overrides) + agent = agent_class(env, **agent_overrides) if agent_overrides else agent_class(env) for step in range(agent.max_steps): result = agent.select_action()