diff --git a/docker/Dockerfile b/docker/Dockerfile
index 88b092b9a..1ff4d6596 100644
--- a/docker/Dockerfile
+++ b/docker/Dockerfile
@@ -83,8 +83,8 @@ RUN if [ "$framework" = "jax" ] ; then \
 RUN cd /algorithmic-efficiency && git fetch origin
 RUN cd /algorithmic-efficiency && git pull
 
-# Todo: remove this, this is temporary for developing
-COPY scripts/startup.sh /algorithmic-efficiency/docker/scripts/startup.sh
+# Uncomment this for development purposes
+# COPY scripts/startup.sh /algorithmic-efficiency/docker/scripts/startup.sh
 RUN chmod a+x /algorithmic-efficiency/docker/scripts/startup.sh
 ENTRYPOINT ["bash", "/algorithmic-efficiency/docker/scripts/startup.sh"]
diff --git a/scoring/performance_profile.py b/scoring/performance_profile.py
index 043a65791..eecb9799e 100644
--- a/scoring/performance_profile.py
+++ b/scoring/performance_profile.py
@@ -59,7 +59,7 @@
 # workloads and rules for the scoring to be correct.
 # We do not use the workload registry since it contains test and development
 # workloads as well.
-NUM_BASE_WORKLOADS = 8
+NUM_BASE_WORKLOADS = 9
 NUM_VARIANT_WORKLOADS = 0
 NUM_TRIALS = 5
 NUM_STUDIES = 3
diff --git a/scoring/score_submissions.py b/scoring/score_submissions.py
index efe276a33..10e624148 100644
--- a/scoring/score_submissions.py
+++ b/scoring/score_submissions.py
@@ -75,10 +75,10 @@
 FLAGS = flags.FLAGS
 
 
-def get_summary_df(workload, workload_df, include_test_split=False):
+def get_summary_df(workload, workload_df):
   print(f' WORKLOAD: {workload}')
   validation_metric, validation_target = (
-    scoring_utils.get_workload_metrics_and_targets(workload, split='validation')
+    scoring_utils.get_workload_metrics_and_targets(workload)
   )
 
   is_minimized = performance_profile.check_if_minimized(validation_metric)
@@ -127,7 +127,7 @@ def get_summary_df(workload, workload_df, include_test_split=False):
 
   # compute the step times
   def delta(series):
-    return series.shift(1, fill_value=0) - series
+    return series.apply(lambda x: np.diff(x, prepend=0))
 
   accumulated_time_intervals = delta(workload_df['accumulated_submission_time'])
   step_intervals = delta(workload_df['global_step'])
@@ -136,47 +136,19 @@ def delta(series):
     f'WARNING: The number of evals may be too low to calculate reliable step time for {workload}'
   )
 
-  summary_df['step_time (s)'] = np.median(
-    (accumulated_time_intervals / step_intervals).iloc[0]
-  )
-
-  summary_df['step_hint'] = scoring_utils.get_workload_stephint(workload)
-
-  # test metrics
-  if include_test_split:
-    test_metric, test_target = scoring_utils.get_workload_metrics_and_targets(
-      workload, split='test'
+  # Flatten all intervals from all trials and take the global median
+  with np.errstate(divide='ignore', invalid='ignore'):
+    all_ratios = np.concatenate(
+      (accumulated_time_intervals / step_intervals).values
     )
+    summary_df['step_time (s)'] = np.nanmedian(all_ratios)
 
-    summary_df['test target metric name'] = test_metric
-    summary_df['test target metric value'] = test_target
-
-    summary_df['test target reached'] = (
-      workload_df[test_metric]
-      .apply(lambda x: target_op(x, test_target))
-      .apply(np.any)
-    )
-    summary_df['best metric value on test'] = workload_df[test_metric].apply(
-      lambda x: best_op(x)
-    )
-    workload_df['index best eval on test'] = workload_df[test_metric].apply(
-      lambda x: idx_op(x)
-    )
-    summary_df['time to best eval on test (s)'] = workload_df.apply(
-      lambda x: x['accumulated_submission_time'][x['index best eval on test']],
-      axis=1,
-    )
-    summary_df['time to target on test (s)'] = summary_df.apply(
-      lambda x: x['time to best eval on test (s)']
-      if x['test target reached']
-      else np.inf,
-      axis=1,
-    )
+  summary_df['step_hint'] = scoring_utils.get_workload_stephint(workload)
 
   return summary_df
 
 
-def get_submission_summary(df, include_test_split=False):
+def get_submission_summary(df):
   """Summarizes the submission results into metric and time tables
   organized by workload.
   """
@@ -184,9 +156,7 @@
   dfs = []
   print(df)
   for workload, group in df.groupby('workload'):
-    summary_df = get_summary_df(
-      workload, group, include_test_split=include_test_split
-    )
+    summary_df = get_summary_df(workload, group)
     dfs.append(summary_df)
 
   df = pd.concat(dfs)
diff --git a/scoring/scoring_utils.py b/scoring/scoring_utils.py
index cb63eab4b..c2ab8aeec 100644
--- a/scoring/scoring_utils.py
+++ b/scoring/scoring_utils.py
@@ -214,7 +214,7 @@ def get_experiment_df(experiment_dir):
 
 ## Get workload properties
 
-def get_workload_metrics_and_targets(workload, split='validation'):
+def get_workload_metrics_and_targets(workload):
   """Returns workload target metric name and value."""
   workload_name = re.match(WORKLOAD_NAME_PATTERN, workload).group(1)
   framework = re.match(WORKLOAD_NAME_PATTERN, workload).group(2)
@@ -233,12 +233,8 @@
     workload_init_kwargs=workload_init_kwargs,
   )
   metric_name = workload_obj.target_metric_name
-  if split == 'validation':
-    metric = f'validation/{metric_name}'
-    target = workload_obj.validation_target_value
-  elif split == 'test':
-    metric = f'test/{metric_name}'
-    target = workload_obj.test_target_value
+  metric = f'validation/{metric_name}'
+  target = workload_obj.validation_target_value
 
   return metric, target
diff --git a/scoring/utils/slurm/README.md b/scoring/utils/slurm/README.md
index a8e41f04b..0aec296c8 100644
--- a/scoring/utils/slurm/README.md
+++ b/scoring/utils/slurm/README.md
@@ -48,6 +48,29 @@ LOGS_BUCKET="algoperf-runs-internal"
 sbatch run_jobs.sh
 ```
 
+## Convenience bash script to launch SLURM jobs
+
+The run_submission.sh script performs all of the steps above for you. It is intended to be run on a SLURM login node, but it expects a specific directory structure: run it from your $HOME directory, with the algorithmic-efficiency and submissions_algorithms git repositories checked out there.
+
+```
+$USER$@$USER$:~/$ tree -L 1
+.
+├── algorithmic-efficiency
+└── submissions_algorithms
+```
+
+Then run the script with a command like:
+
+```
+./algorithmic-efficiency/scoring/utils/slurm/run_submission.sh \
+  --submission_path submissions_algorithms/submissions/self_tuning/schedule_free_adamw_v2 \
+  --dry_run false
+```
+
+`--submission_path` points to the directory containing the submission (inside the submissions git repo). `dry_run` defaults to true (which limits each run to 10 global steps) to prevent accidental launches from wasting resources; explicitly set it to false for full runs.
+
+The script takes care of the rest: it creates the job config, saves it to a sensibly named path, and runs the sbatch script with the right flags.
+
 # Set up new SLURM cluster
 
 If you are setting up a new cluster, we recommend using the [HPC toolkit to set up a SLURM cluster](https://cloud.google.com/cluster-toolkit/docs/quickstarts/slurm-cluster).
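As the README above notes, `run_submission.sh` generates the job config via `make_job_config.py` (patched below) and renames the resulting `config.json` to `<submission_name>.json`, with one entry per SLURM array task. A minimal sketch for sanity-checking a generated config before launching; it assumes the config is written as a JSON array of job dicts (consistent with the `jq 'length'` call in `run_submission.sh`), and the file name is illustrative:

```
# Illustrative only: inspect a job config produced by make_job_config.py.
# Assumes the config is a JSON array with one job dict per SLURM array task.
import json

with open('schedule_free_adamw_v2.json') as f:  # hypothetical config name
  jobs = json.load(f)

print(f'sbatch array range: 0-{len(jobs) - 1}')
for task_id, job in enumerate(jobs):
  print(
    f'task {task_id}: workload={job["workload"]}, '
    f'seed={job["rng_seed"]}, '
    f'trial indices={job["hparam_start_index"]}-{job["hparam_end_index"]}, '
    f'experiment_dir={job["experiment_dir"]}'
  )
```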
diff --git a/scoring/utils/slurm/make_job_config.py b/scoring/utils/slurm/make_job_config.py
index 39f6f5eb0..91afc9196 100644
--- a/scoring/utils/slurm/make_job_config.py
+++ b/scoring/utils/slurm/make_job_config.py
@@ -9,6 +9,7 @@
 import json
 import os
+import struct
 
 import jax
 from absl import app, flags
@@ -17,8 +18,6 @@
 TUNING_SEARCH_SPACE = (
   'reference_algorithms/paper_baselines/adamw/tuning_search_space.json'
 )
-NUM_TUNING_TRIALS = 3  # For external tuning ruleset
-NUM_STUDIES = 3
 
 flags.DEFINE_string(
   'submission_path',
@@ -35,11 +34,6 @@
   'experiments',
   'Path to experiment dir where logs will be saved.',
 )
-flags.DEFINE_string(
-  'experiment_dir',
-  'experiments/',
-  'Path to experiment dir where logs will be saved.',
-)
 flags.DEFINE_enum(
   'framework',
   'jax',
@@ -56,14 +50,13 @@
 flags.DEFINE_string(
   'workloads', None, help='Comma seperated list of workloads to run.'
 )
-flags.DEFINE_integer('num_studies', NUM_STUDIES, help='Number of studies.')
+flags.DEFINE_integer('num_studies', None, help='Number of studies.')
+flags.DEFINE_integer('num_tuning_trials', None, help='Number of tuning trials.')
 FLAGS = flags.FLAGS
 
 MIN_INT = -(2 ** (31))
 MAX_INT = 2 ** (31) - 1
-NUM_TUNING_TRIALS = 5  # For external tuning ruleset
-NUM_STUDIES = 3
 
 WORKLOADS = {
   'imagenet_resnet': {'dataset': 'imagenet'},
@@ -74,6 +67,12 @@
   'librispeech_deepspeech': {'dataset': 'librispeech'},
   'criteo1tb': {'dataset': 'criteo1tb'},
   'librispeech_conformer': {'dataset': 'librispeech'},
+  'finewebedu_lm': {'dataset': 'fineweb_edu_10B'},
+}
+
+RULESET_CONFIGS = {
+  'self': {'num_studies': 3, 'num_tuning_trials': 1},
+  'external': {'num_studies': 3, 'num_tuning_trials': 5},
 }
 
 
@@ -83,17 +82,29 @@ def main(_):
   else:
     workloads = FLAGS.workloads.split(',')
 
-  key = jax.random.key(FLAGS.seed)
+  if not FLAGS.seed:
+    FLAGS.seed = struct.unpack('I', os.urandom(4))[0]
+
+  # Set defaults based on tuning_ruleset if not provided by user
+  num_studies = FLAGS.num_studies
+  if num_studies is None:
+    num_studies = RULESET_CONFIGS[FLAGS.tuning_ruleset]['num_studies']
+
+  num_tuning_trials = FLAGS.num_tuning_trials
+  if num_tuning_trials is None:
+    num_tuning_trials = RULESET_CONFIGS[FLAGS.tuning_ruleset]['num_tuning_trials']
+
+  key = jax.random.PRNGKey(FLAGS.seed)
   jobs = []
   for workload in workloads:
     # Fold in hash(workload) mod(max(uint32))
     workload_key = jax.random.fold_in(key, hash(workload) % (2**32 - 1))
-    for study_index in range(NUM_STUDIES):
+    for study_index in range(num_studies):
       study_key = jax.random.fold_in(workload_key, study_index)
       if FLAGS.tuning_ruleset == 'external':
-        for hparam_index in range(NUM_TUNING_TRIALS):
+        for hparam_index in range(num_tuning_trials):
           run_key = jax.random.fold_in(study_key, hparam_index)
           seed = jax.random.randint(run_key, (1,), MIN_INT, MAX_INT)[0].item()
           print(seed)
@@ -107,7 +118,7 @@
           job['experiment_dir'] = study_dir
           job['rng_seed'] = seed
           job['tuning_ruleset'] = FLAGS.tuning_ruleset
-          job['num_tuning_trials'] = NUM_TUNING_TRIALS
+          job['num_tuning_trials'] = num_tuning_trials
           job['hparam_start_index'] = hparam_index
           job['hparam_end_index'] = hparam_index + 1
           job['tuning_search_space'] = FLAGS.tuning_search_space
diff --git a/scoring/utils/slurm/run_jobs.sh b/scoring/utils/slurm/run_jobs.sh
index 5fcf8f69e..1047b31c0 100644
--- a/scoring/utils/slurm/run_jobs.sh
+++ b/scoring/utils/slurm/run_jobs.sh
@@ -2,31 +2,21 @@
 #SBATCH --nodes=1 # give it a full node
 #SBATCH --ntasks-per-node=1
-#SBATCH --array=
-#SBATCH --partition=v100
-#SBATCH --gpus-per-node=8
+#SBATCH --array=0-26
+#SBATCH --partition=a100
+#SBATCH --gpus-per-node=4
 #SBATCH --exclusive #this will not allow other jobs to run on this cluster
-#SBATCH --output=experiments/tests/jit_debug_deepspeech_old_stephint_nadamw/job_%A_%a.out
-#SBATCH --error=experiments/tests/jit_debug_deepspeech_old_stephint_nadamw/job_%A_%a.err
+#SBATCH --output=experiments/tests/updated_schedule_free/job_%A_%a.out
+#SBATCH --error=experiments/tests/updated_schedule_free/job_%A_%a.err
 
-# Usage: sbatch .sh
+# Usage: sbatch .sh [options]
 # This script reads config.json and launches a sbatch job using task
-# arrays where each job in the array corresponds to a training run 
+# arrays where each job in the array corresponds to a training run
 # for a workload given a random seed and tuning trial index.
 # To generate the config.json use make_job_config.py.
 
 set -x
 
-# Pull docker image (ATTENTION: you may want to modify this)
-REPO=""
-IMAGE=""
-y | gcloud auth configure-docker $REPO
-docker pull $IMAGE
-# Job config (ATTENTION: you may want to modify this)
-config_file="" # Replace with your config file path
-LOGS_BUCKET="" # replace with your bucket used for logging
-
-
 # Function to read a JSON file and extract a value by key
 read_json_value() {
   local json_file="$1"
@@ -43,41 +33,89 @@ then
   exit 1
 fi
 
-TASK="$SLURM_ARRAY_TASK_ID"
-FRAMEWORK=$(read_json_value "$config_file" "$TASK" "framework")
-DATASET=$(read_json_value "$config_file" "$TASK" "dataset")
-SUBMISSION_PATH=$(read_json_value "$config_file" "$TASK" "submission_path")
-FRAMEWORK=$(read_json_value "$config_file" "$TASK" "framework")
-TUNING_SEARCH_SPACE=$(read_json_value "$config_file" "$TASK" "tuning_search_space")
-EXPERIMENT_DIR=$(read_json_value "$config_file" "$TASK" "experiment_dir")
-MAX_STEPS=$(read_json_value "$config_file" "$TASK" "max_steps")
-RNG_SEED=$(read_json_value "$config_file" "$TASK" "rng_seed")
-WORKLOAD=$(read_json_value "$config_file" "$TASK" "workload")
-HPARAM_START_INDEX=$(read_json_value "$config_file" "$TASK" "hparam_start_index")
-HPARAM_END_INDEX=$(read_json_value "$config_file" "$TASK" "hparam_end_index")
-NUM_TUNING_TRIALS=$(read_json_value "$config_file" "$TASK" "num_tuning_trials")
-TUNING_RULESET=$(read_json_value "$config_file" "$TASK" "tuning_ruleset")
-MAX_GLOBAL_STEPS=$(read_json_value "$config_file" "$MAX_GLOBAL_STEPS" "max_global_steps")
+# Default values
+REPO="europe-west4-docker.pkg.dev"
+IMAGE="europe-west4-docker.pkg.dev/mlcommons-algoperf/algoperf-docker-repo/algoperf_pytorch_main:latest"
+CONFIG_FILE="$HOME/algorithmic-efficiency/config.json"
+LOGS_BUCKET="algoperf-runs"
+TASK_ID="${SLURM_ARRAY_TASK_ID:-0}"
+
+# Parse flags
+while [[ $# -gt 0 ]]; do
+  case $1 in
+    --repo)
+      REPO="$2"
+      shift 2
+      ;;
+    --image)
+      IMAGE="$2"
+      shift 2
+      ;;
+    --config_file)
+      CONFIG_FILE="$2"
+      shift 2
+      ;;
+    --logs_bucket)
+      LOGS_BUCKET="$2"
+      shift 2
+      ;;
+    --max_global_steps)
+      MAX_GLOBAL_STEPS="$2"
+      shift 2
+      ;;
+    *)
+      echo "Unknown option $1"
+      exit 1
+      ;;
+  esac
+done
+
+# Pull docker image
+yes | gcloud auth configure-docker "$REPO"
+docker pull "$IMAGE"
+
+# Set variables from config file
+FRAMEWORK=$(read_json_value "$CONFIG_FILE" "$TASK_ID" "framework")
+DATASET=$(read_json_value "$CONFIG_FILE" "$TASK_ID" "dataset")
+SUBMISSION_PATH=$(read_json_value "$CONFIG_FILE" "$TASK_ID" "submission_path")
+TUNING_SEARCH_SPACE=$(read_json_value "$CONFIG_FILE" "$TASK_ID" "tuning_search_space")
+EXPERIMENT_DIR=$(read_json_value "$CONFIG_FILE" "$TASK_ID" "experiment_dir")
+RNG_SEED=$(read_json_value "$CONFIG_FILE" "$TASK_ID" "rng_seed")
+WORKLOAD=$(read_json_value "$CONFIG_FILE" "$TASK_ID" "workload")
+HPARAM_START_INDEX=$(read_json_value "$CONFIG_FILE" "$TASK_ID" "hparam_start_index")
+HPARAM_END_INDEX=$(read_json_value "$CONFIG_FILE" "$TASK_ID" "hparam_end_index")
+NUM_TUNING_TRIALS=$(read_json_value "$CONFIG_FILE" "$TASK_ID" "num_tuning_trials")
+TUNING_RULESET=$(read_json_value "$CONFIG_FILE" "$TASK_ID" "tuning_ruleset")
+
+DOCKER_CMD=(
+  docker run
+  -v /opt/data/:/data/
+  -v "$HOME/experiment_runs:/experiment_runs"
+  -v "$HOME/submissions_algorithms/:/algorithmic-efficiency/submissions_algorithms"
+  -v "$HOME/algorithmic-efficiency/docker/scripts/startup.sh:/algorithmic-efficiency/docker/scripts/startup.sh"
+  --gpus all
+  --ipc=host
+  "$IMAGE"
+  -d "$DATASET"
+  -f "$FRAMEWORK"
+  -s "$SUBMISSION_PATH"
+  -w "$WORKLOAD"
+  -t "$TUNING_SEARCH_SPACE"
+  -e "$EXPERIMENT_DIR"
+  -c False
+  -o True
+  --rng_seed "$RNG_SEED"
+  --hparam_start_index "$HPARAM_START_INDEX"
+  --hparam_end_index "$HPARAM_END_INDEX"
+  --num_tuning_trials "$NUM_TUNING_TRIALS"
+  --tuning_ruleset "$TUNING_RULESET"
+  -i true
+  -r false
+  --logs_bucket "$LOGS_BUCKET"
+)
+
+if [ -n "$MAX_GLOBAL_STEPS" ]; then
+  DOCKER_CMD+=(-m "$MAX_GLOBAL_STEPS")
+fi
 
-docker run \
-  -v /opt/data/:/data/ \
-  -v $HOME/submissions_algorithms/:/algorithmic-efficiency/submissions_algorithms \
-  --gpus all \
-  --ipc=host \
-  $IMAGE \
-  -d $DATASET \
-  -f $FRAMEWORK \
-  -s $SUBMISSION_PATH \
-  -w $WORKLOAD \
-  -t $TUNING_SEARCH_SPACE \
-  -e $EXPERIMENT_DIR \
-  -c False \
-  -o True \
-  --rng_seed $RNG_SEED \
-  --hparam_start_index $HPARAM_START_INDEX \
-  --hparam_end_index $HPARAM_END_INDEX \
-  --num_tuning_trials $NUM_TUNING_TRIALS \
-  --tuning_ruleset $TUNING_RULESET \
-  --logs_bucket $LOGS_BUCKET \
-  -i true \
-  -r false
\ No newline at end of file
+"${DOCKER_CMD[@]}"
diff --git a/scoring/utils/slurm/run_submission.sh b/scoring/utils/slurm/run_submission.sh
new file mode 100644
index 000000000..ddef9f586
--- /dev/null
+++ b/scoring/utils/slurm/run_submission.sh
@@ -0,0 +1,204 @@
+#!/bin/bash
+
+# Usage:
+# ./algorithmic-efficiency/scoring/utils/slurm/run_submission.sh \
+#   --submission_path submissions_algorithms/submissions/self_tuning/schedule_free_adamw_v2
+#
+# Note: --dry_run is true by default (sets MAX_GLOBAL_STEPS=10).
+# To perform a full run, explicitly set --dry_run false.
+
+set -e
+set -x
+
+# --- Global Variables ---
+SUBMISSION_PATH=""
+DRY_RUN=true
+MAX_GLOBAL_STEPS=10
+SUBMISSION_NAME=""
+RULESET=""
+FRAMEWORK=""
+ARRAY_RANGE=""
+
+# --- Helper Functions ---
+
+install_yq() {
+  if ! command -v yq &> /dev/null; then
+    echo "yq not found. Attempting to install locally to $HOME/.local/bin..."
+    mkdir -p "$HOME/.local/bin"
+    local OS=$(uname | tr '[:upper:]' '[:lower:]')
+    local ARCH=$(uname -m)
+    case "$ARCH" in
+      x86_64) ARCH="amd64" ;;
+      aarch64) ARCH="arm64" ;;
+    esac
+
+    local YQ_URL="https://github.com/mikefarah/yq/releases/latest/download/yq_${OS}_${ARCH}"
+    if command -v curl &> /dev/null; then
+      curl -L "$YQ_URL" -o "$HOME/.local/bin/yq"
+    elif command -v wget &> /dev/null; then
+      wget "$YQ_URL" -O "$HOME/.local/bin/yq"
+    else
+      echo "Error: Neither curl nor wget found. Please install yq manually: https://github.com/mikefarah/yq"
+      exit 1
+    fi
+    chmod +x "$HOME/.local/bin/yq"
+    export PATH="$HOME/.local/bin:$PATH"
+    echo "yq installed successfully to $HOME/.local/bin"
+  fi
+}
+
+check_command() {
+  if ! command -v "$1" &> /dev/null; then
+    echo "Error: $1 could not be found. Please install it."
+    exit 1
+  fi
+}
+
+verify_environment() {
+  if [[ "$PWD" != "$HOME" ]]; then
+    echo "Error: This script must be run from your home directory ($HOME)."
+    echo "Expected directory structure:"
+    echo "  $HOME/"
+    echo "  ├── algorithmic-efficiency/"
+    echo "  └── submissions_algorithms/"
+    exit 1
+  fi
+
+  if [[ ! -d "algorithmic-efficiency" || ! -d "submissions_algorithms" ]]; then
+    echo "Error: Required repositories not found in the current directory."
+    echo "Please ensure both 'algorithmic-efficiency' and 'submissions_algorithms' are present in $HOME."
+    exit 1
+  fi
+
+  install_yq
+  check_command "jq"
+}
+
+parse_flags() {
+  while [[ $# -gt 0 ]]; do
+    case $1 in
+      --submission_path)
+        SUBMISSION_PATH="$2"
+        shift 2
+        ;;
+      --dry_run)
+        DRY_RUN="$2"
+        shift 2
+        ;;
+      *)
+        echo "Unknown option $1"
+        exit 1
+        ;;
+    esac
+  done
+
+  if [ -z "$SUBMISSION_PATH" ]; then
+    echo "Error: --submission_path is required."
+    exit 1
+  fi
+
+  if [ "$DRY_RUN" = false ]; then
+    MAX_GLOBAL_STEPS=""
+  fi
+}
+
+extract_submission_info() {
+  SUBMISSION_NAME=$(basename "$SUBMISSION_PATH")
+  local info_file="$SUBMISSION_PATH/submission_info.yml"
+
+  if [ ! -f "$info_file" ]; then
+    echo "Error: $info_file not found."
+    exit 1
+  fi
+
+  local raw_ruleset=$(yq eval '.ruleset' "$info_file" | tr '[:upper:]' '[:lower:]')
+  FRAMEWORK=$(yq eval '.framework' "$info_file" | tr '[:upper:]' '[:lower:]')
+
+  # Parse ruleset by checking for substrings "self" or "external"
+  if [[ "$raw_ruleset" == *"self"* ]]; then
+    RULESET="self"
+  elif [[ "$raw_ruleset" == *"external"* ]]; then
+    RULESET="external"
+  else
+    echo "Error: Expected 'ruleset' in $info_file to contain 'self' or 'external' (got '$raw_ruleset')."
+    exit 1
+  fi
+
+  # Verify framework
+  if [[ "$FRAMEWORK" != "jax" && "$FRAMEWORK" != "pytorch" ]]; then
+    echo "Error: 'framework' in $info_file must be either 'jax' or 'pytorch' (got '$FRAMEWORK')."
+    exit 1
+  fi
+
+  echo "Submission Name: $SUBMISSION_NAME"
+  echo "Ruleset: $RULESET"
+  echo "Framework: $FRAMEWORK"
+  echo "Dry Run: $DRY_RUN"
+  echo "Max Global Steps: $MAX_GLOBAL_STEPS"
+}
+
+generate_config() {
+  local exp_prefix="submissions_a100_dry_run"
+  if [ "$DRY_RUN" = false ]; then
+    exp_prefix="submissions_a100"
+  fi
+
+  docker run \
+    --rm \
+    -v "$(pwd)":/algorithmic-efficiency \
+    -w /algorithmic-efficiency \
+    --entrypoint python \
+    "europe-west4-docker.pkg.dev/mlcommons-algoperf/algoperf-docker-repo/algoperf_${FRAMEWORK}_main:latest" \
+    algorithmic-efficiency/scoring/utils/slurm/make_job_config.py \
+    --framework="$FRAMEWORK" \
+    --tuning_ruleset="$RULESET" \
+    --submission_path="$SUBMISSION_PATH/submission.py" \
+    --experiment_dir="${exp_prefix}/$SUBMISSION_NAME"
+
+  mv config.json "$SUBMISSION_NAME.json"
+}
+
+prepare_sbatch_array() {
+  local num_jobs=$(jq 'length' "$SUBMISSION_NAME.json")
+  if [[ "$num_jobs" -eq 0 ]]; then
+    echo "Error: No jobs found in $SUBMISSION_NAME.json."
+    exit 1
+  fi
+
+  ARRAY_RANGE="0-$((num_jobs - 1))"
+  echo "Number of jobs: $num_jobs"
+  echo "Sbatch array range: $ARRAY_RANGE"
+
+  mkdir -p "experiments/tests/$SUBMISSION_NAME"
+}
+
+run_sbatch() {
+  local sbatch_cmd=(
+    sbatch
+    --array="$ARRAY_RANGE"
+    --output="experiments/tests/$SUBMISSION_NAME/job_%A_%a.out"
+    --error="experiments/tests/$SUBMISSION_NAME/job_%A_%a.err"
+    "algorithmic-efficiency/scoring/utils/slurm/run_jobs.sh"
+    --config_file "$(pwd)/$SUBMISSION_NAME.json"
+    --image "europe-west4-docker.pkg.dev/mlcommons-algoperf/algoperf-docker-repo/algoperf_${FRAMEWORK}_main:latest"
+  )
+
+  if [ -n "$MAX_GLOBAL_STEPS" ]; then
+    sbatch_cmd+=(--max_global_steps "$MAX_GLOBAL_STEPS")
+  fi
+
+  "${sbatch_cmd[@]}"
+}
+
+# --- Main ---
+
+main() {
+  verify_environment
+  parse_flags "$@"
+  extract_submission_info
+  generate_config
+  prepare_sbatch_array
+  run_sbatch
+}
+
+main "$@"
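For context, a standalone sketch of the seed-derivation scheme used in `make_job_config.py` (as patched above): a base PRNG key is folded with a per-workload hash, then the study index, then the tuning-trial index, so each run gets a distinct but deterministic seed within one invocation. The base seed, workload name, and indices below are illustrative:

```
# Sketch of the per-run seed derivation in make_job_config.py (illustrative values).
import jax

MIN_INT = -(2 ** (31))
MAX_INT = 2 ** (31) - 1

key = jax.random.PRNGKey(1996)  # base seed (FLAGS.seed, or os.urandom if unset)
# Note: Python salts str hashes per process unless PYTHONHASHSEED is fixed,
# so the folded value is only reproducible within a single invocation.
workload_key = jax.random.fold_in(key, hash('finewebedu_lm') % (2**32 - 1))
study_key = jax.random.fold_in(workload_key, 0)  # study_index
run_key = jax.random.fold_in(study_key, 2)  # hparam_index
seed = jax.random.randint(run_key, (1,), MIN_INT, MAX_INT)[0].item()
print(seed)
```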