From 5f69eaf1ec280236af96aa5422726c0cf9b41a97 Mon Sep 17 00:00:00 2001 From: Aahlad C Date: Thu, 19 Mar 2026 11:06:48 -0700 Subject: [PATCH 1/6] Fix minor dev comment in Dockerfile to prevent crashes --- docker/Dockerfile | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/docker/Dockerfile b/docker/Dockerfile index 88b092b9a..1ff4d6596 100644 --- a/docker/Dockerfile +++ b/docker/Dockerfile @@ -83,8 +83,8 @@ RUN if [ "$framework" = "jax" ] ; then \ RUN cd /algorithmic-efficiency && git fetch origin RUN cd /algorithmic-efficiency && git pull -# Todo: remove this, this is temporary for developing -COPY scripts/startup.sh /algorithmic-efficiency/docker/scripts/startup.sh +# Uncomment this for developing purposes +# COPY scripts/startup.sh /algorithmic-efficiency/docker/scripts/startup.sh RUN chmod a+x /algorithmic-efficiency/docker/scripts/startup.sh ENTRYPOINT ["bash", "/algorithmic-efficiency/docker/scripts/startup.sh"] From 2d7d8c89d8dbf33df64102635d8c7a4721ad0a15 Mon Sep 17 00:00:00 2001 From: Aahlad C Date: Thu, 19 Mar 2026 11:11:09 -0700 Subject: [PATCH 2/6] Remove redifined flag in make_job_config --- scoring/utils/slurm/make_job_config.py | 5 ----- 1 file changed, 5 deletions(-) diff --git a/scoring/utils/slurm/make_job_config.py b/scoring/utils/slurm/make_job_config.py index 39f6f5eb0..22a412298 100644 --- a/scoring/utils/slurm/make_job_config.py +++ b/scoring/utils/slurm/make_job_config.py @@ -35,11 +35,6 @@ 'experiments', 'Path to experiment dir where logs will be saved.', ) -flags.DEFINE_string( - 'experiment_dir', - 'experiments/', - 'Path to experiment dir where logs will be saved.', -) flags.DEFINE_enum( 'framework', 'jax', From 8ed4856240949c269280a6bb91b8b6279e162328 Mon Sep 17 00:00:00 2001 From: Aahlad C Date: Thu, 19 Mar 2026 11:17:05 -0700 Subject: [PATCH 3/6] Update make_job_config for a100 submissions. 1. Add finewebedu workload. 2. 
Update num_trials and num_studies to be flag defined (since they can vary between self and external tuning rulesets) 3. Have every run use a different seed for more variability --- scoring/utils/slurm/make_job_config.py | 34 +++++++++++++++++++------- 1 file changed, 25 insertions(+), 9 deletions(-) diff --git a/scoring/utils/slurm/make_job_config.py b/scoring/utils/slurm/make_job_config.py index 22a412298..91afc9196 100644 --- a/scoring/utils/slurm/make_job_config.py +++ b/scoring/utils/slurm/make_job_config.py @@ -9,6 +9,7 @@ import json import os +import struct import jax from absl import app, flags @@ -17,8 +18,6 @@ TUNING_SEARCH_SPACE = ( 'reference_algorithms/paper_baselines/adamw/tuning_search_space.json' ) -NUM_TUNING_TRIALS = 3 # For external tuning ruleset -NUM_STUDIES = 3 flags.DEFINE_string( 'submission_path', @@ -51,14 +50,13 @@ flags.DEFINE_string( 'workloads', None, help='Comma seperated list of workloads to run.' ) -flags.DEFINE_integer('num_studies', NUM_STUDIES, help='Number of studies.') +flags.DEFINE_integer('num_studies', None, help='Number of studies.') +flags.DEFINE_integer('num_tuning_trials', None, help='Number of tuning trials.') FLAGS = flags.FLAGS MIN_INT = -(2 ** (31)) MAX_INT = 2 ** (31) - 1 -NUM_TUNING_TRIALS = 5 # For external tuning ruleset -NUM_STUDIES = 3 WORKLOADS = { 'imagenet_resnet': {'dataset': 'imagenet'}, @@ -69,6 +67,12 @@ 'librispeech_deepspeech': {'dataset': 'librispeech'}, 'criteo1tb': {'dataset': 'criteo1tb'}, 'librispeech_conformer': {'dataset': 'librispeech'}, + 'finewebedu_lm': {'dataset': 'fineweb_edu_10B'} +} + +RULESET_CONFIGS = { + 'self': {'num_studies': 3, 'num_tuning_trials': 1}, + 'external': {'num_studies': 3, 'num_tuning_trials': 5}, } @@ -78,17 +82,29 @@ def main(_): else: workloads = FLAGS.workloads.split(',') - key = jax.random.key(FLAGS.seed) + if not FLAGS.seed: + FLAGS.seed = struct.unpack('I', os.urandom(4))[0] + + # Set defaults based on tuning_ruleset if not provided by user + num_studies = 
FLAGS.num_studies + if num_studies is None: + num_studies = RULESET_CONFIGS[FLAGS.tuning_ruleset]['num_studies'] + + num_tuning_trials = FLAGS.num_tuning_trials + if num_tuning_trials is None: + num_tuning_trials = RULESET_CONFIGS[FLAGS.tuning_ruleset]['num_tuning_trials'] + + key = jax.random.PRNGKey(FLAGS.seed) jobs = [] for workload in workloads: # Fold in hash(workload) mod(max(uint32)) workload_key = jax.random.fold_in(key, hash(workload) % (2**32 - 1)) - for study_index in range(NUM_STUDIES): + for study_index in range(num_studies): study_key = jax.random.fold_in(workload_key, study_index) if FLAGS.tuning_ruleset == 'external': - for hparam_index in range(NUM_TUNING_TRIALS): + for hparam_index in range(num_tuning_trials): run_key = jax.random.fold_in(study_key, hparam_index) seed = jax.random.randint(run_key, (1,), MIN_INT, MAX_INT)[0].item() print(seed) @@ -102,7 +118,7 @@ def main(_): job['experiment_dir'] = study_dir job['rng_seed'] = seed job['tuning_ruleset'] = FLAGS.tuning_ruleset - job['num_tuning_trials'] = NUM_TUNING_TRIALS + job['num_tuning_trials'] = num_tuning_trials job['hparam_start_index'] = hparam_index job['hparam_end_index'] = hparam_index + 1 job['tuning_search_space'] = FLAGS.tuning_search_space From 8a9d6598e6047539d27cc1dde569c373190710c4 Mon Sep 17 00:00:00 2001 From: Aahlad C Date: Thu, 19 Mar 2026 11:30:58 -0700 Subject: [PATCH 4/6] Update run_jobs.sh script for a100 submissions. 1. Ensure any variable can be passed in via flags. Folks shouldn't have to edit the file and hardcode variables for any reason. 2. Pass max global steps via a flag. 3. 
Update some default values for the new submission (repo/image/config file/logs bucket) --- scoring/utils/slurm/run_jobs.sh | 146 ++++++++++++++++++++------------ 1 file changed, 92 insertions(+), 54 deletions(-) diff --git a/scoring/utils/slurm/run_jobs.sh b/scoring/utils/slurm/run_jobs.sh index 5fcf8f69e..1047b31c0 100644 --- a/scoring/utils/slurm/run_jobs.sh +++ b/scoring/utils/slurm/run_jobs.sh @@ -2,31 +2,21 @@ #SBATCH --nodes=1 # give it a full node #SBATCH --ntasks-per-node=1 -#SBATCH --array= -#SBATCH --partition=v100 -#SBATCH --gpus-per-node=8 +#SBATCH --array=0-26 +#SBATCH --partition=a100 +#SBATCH --gpus-per-node=4 #SBATCH --exclusive #this will not allow other jobs to run on this cluster -#SBATCH --output=experiments/tests/jit_debug_deepspeech_old_stephint_nadamw/job_%A_%a.out -#SBATCH --error=experiments/tests/jit_debug_deepspeech_old_stephint_nadamw/job_%A_%a.err +#SBATCH --output=experiments/tests/updated_schedule_free/job_%A_%a.out +#SBATCH --error=experiments/tests/updated_schedule_free/job_%A_%a.err -# Usage: sbatch .sh +# Usage: sbatch .sh [options] # This script reads config.json and launches a sbatch job using task -# arrays where each job in the array corresponds to a training run +# arrays where each job in the array corresponds to a training run # for a workload given a random seed and tuning trial index. # To generate the config.json use make_job_config.py. 
set -x -# Pull docker image (ATTENTION: you may want to modify this) -REPO="" -IMAGE="" -y | gcloud auth configure-docker $REPO -docker pull $IMAGE -# Job config (ATTENTION: you may want to modify this) -config_file="" # Replace with your config file path -LOGS_BUCKET="" # replace with your bucket used for logging - - # Function to read a JSON file and extract a value by key read_json_value() { local json_file="$1" @@ -43,41 +33,89 @@ then exit 1 fi -TASK="$SLURM_ARRAY_TASK_ID" -FRAMEWORK=$(read_json_value "$config_file" "$TASK" "framework") -DATASET=$(read_json_value "$config_file" "$TASK" "dataset") -SUBMISSION_PATH=$(read_json_value "$config_file" "$TASK" "submission_path") -FRAMEWORK=$(read_json_value "$config_file" "$TASK" "framework") -TUNING_SEARCH_SPACE=$(read_json_value "$config_file" "$TASK" "tuning_search_space") -EXPERIMENT_DIR=$(read_json_value "$config_file" "$TASK" "experiment_dir") -MAX_STEPS=$(read_json_value "$config_file" "$TASK" "max_steps") -RNG_SEED=$(read_json_value "$config_file" "$TASK" "rng_seed") -WORKLOAD=$(read_json_value "$config_file" "$TASK" "workload") -HPARAM_START_INDEX=$(read_json_value "$config_file" "$TASK" "hparam_start_index") -HPARAM_END_INDEX=$(read_json_value "$config_file" "$TASK" "hparam_end_index") -NUM_TUNING_TRIALS=$(read_json_value "$config_file" "$TASK" "num_tuning_trials") -TUNING_RULESET=$(read_json_value "$config_file" "$TASK" "tuning_ruleset") -MAX_GLOBAL_STEPS=$(read_json_value "$config_file" "$MAX_GLOBAL_STEPS" "max_global_steps") +# Default values +REPO="europe-west4-docker.pkg.dev" +IMAGE="europe-west4-docker.pkg.dev/mlcommons-algoperf/algoperf-docker-repo/algoperf_pytorch_main:latest" +CONFIG_FILE="$HOME/algorithmic-efficiency/config.json" +LOGS_BUCKET="algoperf-runs" +TASK_ID="${SLURM_ARRAY_TASK_ID:-0}" + +# Parse flags +while [[ $# -gt 0 ]]; do + case $1 in + --repo) + REPO="$2" + shift 2 + ;; + --image) + IMAGE="$2" + shift 2 + ;; + --config_file) + CONFIG_FILE="$2" + shift 2 + ;; + --logs_bucket) + 
LOGS_BUCKET="$2" + shift 2 + ;; + --max_global_steps) + MAX_GLOBAL_STEPS="$2" + shift 2 + ;; + *) + echo "Unknown option $1" + exit 1 + ;; + esac +done + +# Pull docker image +yes | gcloud auth configure-docker "$REPO" +docker pull "$IMAGE" + +# Set variables from config file +FRAMEWORK=$(read_json_value "$CONFIG_FILE" "$TASK_ID" "framework") +DATASET=$(read_json_value "$CONFIG_FILE" "$TASK_ID" "dataset") +SUBMISSION_PATH=$(read_json_value "$CONFIG_FILE" "$TASK_ID" "submission_path") +TUNING_SEARCH_SPACE=$(read_json_value "$CONFIG_FILE" "$TASK_ID" "tuning_search_space") +EXPERIMENT_DIR=$(read_json_value "$CONFIG_FILE" "$TASK_ID" "experiment_dir") +RNG_SEED=$(read_json_value "$CONFIG_FILE" "$TASK_ID" "rng_seed") +WORKLOAD=$(read_json_value "$CONFIG_FILE" "$TASK_ID" "workload") +HPARAM_START_INDEX=$(read_json_value "$CONFIG_FILE" "$TASK_ID" "hparam_start_index") +HPARAM_END_INDEX=$(read_json_value "$CONFIG_FILE" "$TASK_ID" "hparam_end_index") +NUM_TUNING_TRIALS=$(read_json_value "$CONFIG_FILE" "$TASK_ID" "num_tuning_trials") +TUNING_RULESET=$(read_json_value "$CONFIG_FILE" "$TASK_ID" "tuning_ruleset") + +DOCKER_CMD=( + docker run + -v /opt/data/:/data/ + -v "$HOME/experiment_runs:/experiment_runs" + -v "$HOME/submissions_algorithms/:/algorithmic-efficiency/submissions_algorithms" + -v "$HOME/algorithmic-efficiency/docker/scripts/startup.sh:/algorithmic-efficiency/docker/scripts/startup.sh" + --gpus all + --ipc=host + "$IMAGE" + -d "$DATASET" + -f "$FRAMEWORK" + -s "$SUBMISSION_PATH" + -w "$WORKLOAD" + -t "$TUNING_SEARCH_SPACE" + -e "$EXPERIMENT_DIR" + -c False + -o True + --rng_seed "$RNG_SEED" + --hparam_start_index "$HPARAM_START_INDEX" + --hparam_end_index "$HPARAM_END_INDEX" + --num_tuning_trials "$NUM_TUNING_TRIALS" + --tuning_ruleset "$TUNING_RULESET" + -i true + -r false + --logs_bucket "$LOGS_BUCKET" +) + +if [ -n "$MAX_GLOBAL_STEPS" ]; then + DOCKER_CMD+=(-m "$MAX_GLOBAL_STEPS") +fi -docker run \ - -v /opt/data/:/data/ \ - -v 
$HOME/submissions_algorithms/:/algorithmic-efficiency/submissions_algorithms \ - --gpus all \ - --ipc=host \ - $IMAGE \ - -d $DATASET \ - -f $FRAMEWORK \ - -s $SUBMISSION_PATH \ - -w $WORKLOAD \ - -t $TUNING_SEARCH_SPACE \ - -e $EXPERIMENT_DIR \ - -c False \ - -o True \ - --rng_seed $RNG_SEED \ - --hparam_start_index $HPARAM_START_INDEX \ - --hparam_end_index $HPARAM_END_INDEX \ - --num_tuning_trials $NUM_TUNING_TRIALS \ - --tuning_ruleset $TUNING_RULESET \ - --logs_bucket $LOGS_BUCKET \ - -i true \ - -r false \ No newline at end of file +"${DOCKER_CMD[@]}" From b4734a4a01662634e5164e7fa89c897050701c3a Mon Sep 17 00:00:00 2001 From: Aahlad C Date: Thu, 19 Mar 2026 11:35:24 -0700 Subject: [PATCH 5/6] Add a script that makes it easy to score submissions. The script is meant to be used only in the slurm cluster. It forces a specific directory structure, and checks for it right in the beginning. If the dir structure is not as expected, it throws an error and explains the structure it expects. It also includes a dry run flag which runs the job for 10 steps, and includes a command on how to use it at the top of the file. Also update the readme file to explain this script. --- scoring/utils/slurm/README.md | 23 +++ scoring/utils/slurm/run_submission.sh | 204 ++++++++++++++++++++++++++ 2 files changed, 227 insertions(+) create mode 100644 scoring/utils/slurm/run_submission.sh diff --git a/scoring/utils/slurm/README.md b/scoring/utils/slurm/README.md index a8e41f04b..0aec296c8 100644 --- a/scoring/utils/slurm/README.md +++ b/scoring/utils/slurm/README.md @@ -48,6 +48,29 @@ LOGS_BUCKET="algoperf-runs-internal" sbatch run_jobs.sh ``` +## Convenient bash script to launch SLURM jobs + +The `run_submission.sh` script does all the steps above for you. It is intended to be used on a SLURM login node. However, it expects a very specific directory structure: you must run it from your $HOME directory, with the algorithmic-efficiency and submissions_algorithms git repos checked out directly inside it. 
+ +``` +$USER$@$USER$:~/$ tree -L 1 +. +├── algorithmic-efficiency +└── submissions_algorithms +``` + +And you run the script with a command like so: + +``` +./algorithmic-efficiency/scoring/utils/slurm/run_submission.sh \ + --submission_path submissions_algorithms/submissions/self_tuning/schedule_free_adamw_v2 \ + --dry_run false +``` + +The submission path points to the dir where the submission exists (in the submissions git repo). `dry_run` is set to true by default (which limits max global steps to 10) to prevent accidental commands from wasting resources. Explicitly set it to false for full runs. + +The script will figure out the rest and run the jobs for you (creating the config, saving it to a path with a reasonable name, and running the sbatch script with the right flags). + # Set up new SLURM cluster If you are setting up a new cluster, we recommend using the [HPC toolkit to set up a SLURM cluster](https://cloud.google.com/cluster-toolkit/docs/quickstarts/slurm-cluster). diff --git a/scoring/utils/slurm/run_submission.sh b/scoring/utils/slurm/run_submission.sh new file mode 100644 index 000000000..ddef9f586 --- /dev/null +++ b/scoring/utils/slurm/run_submission.sh @@ -0,0 +1,204 @@ +#!/bin/bash + +# Usage: +# ./algorithmic-efficiency/scoring/utils/slurm/run_submission.sh \ +# --submission_path submissions_algorithms/submissions/self_tuning/schedule_free_adamw_v2 +# +# Note: --dry_run is true by default (sets MAX_GLOBAL_STEPS=10). +# To perform a full run, explicitly set --dry_run false. + +set -e +set -x + +# --- Global Variables --- +SUBMISSION_PATH="" +DRY_RUN=true +MAX_GLOBAL_STEPS=10 +SUBMISSION_NAME="" +RULESET="" +FRAMEWORK="" +ARRAY_RANGE="" + +# --- Helper Functions --- + +install_yq() { + if ! command -v yq &> /dev/null; then + echo "yq not found. Attempting to install locally to $HOME/.local/bin..." 
+ mkdir -p "$HOME/.local/bin" + local OS=$(uname | tr '[:upper:]' '[:lower:]') + local ARCH=$(uname -m) + case "$ARCH" in + x86_64) ARCH="amd64" ;; + aarch64) ARCH="arm64" ;; + esac + + local YQ_URL="https://github.com/mikefarah/yq/releases/latest/download/yq_${OS}_${ARCH}" + if command -v curl &> /dev/null; then + curl -L "$YQ_URL" -o "$HOME/.local/bin/yq" + elif command -v wget &> /dev/null; then + wget "$YQ_URL" -O "$HOME/.local/bin/yq" + else + echo "Error: Neither curl nor wget found. Please install yq manually: https://github.com/mikefarah/yq" + exit 1 + fi + chmod +x "$HOME/.local/bin/yq" + export PATH="$HOME/.local/bin:$PATH" + echo "yq installed successfully to $HOME/.local/bin" + fi +} + +check_command() { + if ! command -v "$1" &> /dev/null; then + echo "Error: $1 could not be found. Please install it." + exit 1 + fi +} + +verify_environment() { + if [[ "$PWD" != "$HOME" ]]; then + echo "Error: This script must be run from your home directory ($HOME)." + echo "Expected directory structure:" + echo " $HOME/" + echo " ├── algorithmic-efficiency/" + echo " └── submissions_algorithms/" + exit 1 + fi + + if [[ ! -d "algorithmic-efficiency" || ! -d "submissions_algorithms" ]]; then + echo "Error: Required repositories not found in the current directory." + echo "Please ensure both 'algorithmic-efficiency' and 'submissions_algorithms' are present in $HOME." + exit 1 + fi + + install_yq + check_command "jq" +} + +parse_flags() { + while [[ $# -gt 0 ]]; do + case $1 in + --submission_path) + SUBMISSION_PATH="$2" + shift 2 + ;; + --dry_run) + DRY_RUN="$2" + shift 2 + ;; + *) + echo "Unknown option $1" + exit 1 + ;; + esac + done + + if [ -z "$SUBMISSION_PATH" ]; then + echo "Error: --submission_path is required." + exit 1 + fi + + if [ "$DRY_RUN" = false ]; then + MAX_GLOBAL_STEPS="" + fi +} + +extract_submission_info() { + SUBMISSION_NAME=$(basename "$SUBMISSION_PATH") + local info_file="$SUBMISSION_PATH/submission_info.yml" + + if [ ! 
-f "$info_file" ]; then + echo "Error: $info_file not found." + exit 1 + fi + + local raw_ruleset=$(yq eval '.ruleset' "$info_file" | tr '[:upper:]' '[:lower:]') + FRAMEWORK=$(yq eval '.framework' "$info_file" | tr '[:upper:]' '[:lower:]') + + # Parse ruleset by checking for substrings "self" or "external" + if [[ "$raw_ruleset" == *"self"* ]]; then + RULESET="self" + elif [[ "$raw_ruleset" == *"external"* ]]; then + RULESET="external" + else + echo "Error: Expected 'ruleset' in $info_file to contain 'self' or 'external' (got '$raw_ruleset')." + exit 1 + fi + + # Verify framework + if [[ "$FRAMEWORK" != "jax" && "$FRAMEWORK" != "pytorch" ]]; then + echo "Error: 'framework' in $info_file must be either 'jax' or 'pytorch' (got '$FRAMEWORK')." + exit 1 + fi + + echo "Submission Name: $SUBMISSION_NAME" + echo "Ruleset: $RULESET" + echo "Framework: $FRAMEWORK" + echo "Dry Run: $DRY_RUN" + echo "Max Global Steps: $MAX_GLOBAL_STEPS" +} + +generate_config() { + local exp_prefix="submissions_a100_dry_run" + if [ "$DRY_RUN" = false ]; then + exp_prefix="submissions_a100" + fi + + docker run \ + --rm \ + -v "$(pwd)":/algorithmic-efficiency \ + -w /algorithmic-efficiency \ + --entrypoint python \ + "europe-west4-docker.pkg.dev/mlcommons-algoperf/algoperf-docker-repo/algoperf_${FRAMEWORK}_main:latest" \ + algorithmic-efficiency/scoring/utils/slurm/make_job_config.py \ + --framework="$FRAMEWORK" \ + --tuning_ruleset="$RULESET" \ + --submission_path="$SUBMISSION_PATH/submission.py" \ + --experiment_dir="${exp_prefix}/$SUBMISSION_NAME" + + mv config.json "$SUBMISSION_NAME.json" +} + +prepare_sbatch_array() { + local num_jobs=$(jq 'length' "$SUBMISSION_NAME.json") + if [[ "$num_jobs" -eq 0 ]]; then + echo "Error: No jobs found in $SUBMISSION_NAME.json." 
+ exit 1 + fi + + ARRAY_RANGE="0-$((num_jobs - 1))" + echo "Number of jobs: $num_jobs" + echo "Sbatch array range: $ARRAY_RANGE" + + mkdir -p "experiments/tests/$SUBMISSION_NAME" +} + +run_sbatch() { + local sbatch_cmd=( + sbatch + --array="$ARRAY_RANGE" + --output="experiments/tests/$SUBMISSION_NAME/job_%A_%a.out" + --error="experiments/tests/$SUBMISSION_NAME/job_%A_%a.err" + "algorithmic-efficiency/scoring/utils/slurm/run_jobs.sh" + --config_file "$(pwd)/$SUBMISSION_NAME.json" + --image "europe-west4-docker.pkg.dev/mlcommons-algoperf/algoperf-docker-repo/algoperf_${FRAMEWORK}_main:latest" + ) + + if [ -n "$MAX_GLOBAL_STEPS" ]; then + sbatch_cmd+=(--max_global_steps "$MAX_GLOBAL_STEPS") + fi + + "${sbatch_cmd[@]}" +} + +# --- Main --- + +main() { + verify_environment + parse_flags "$@" + extract_submission_info + generate_config + prepare_sbatch_array + run_sbatch +} + +main "$@" From c0d4087ea2e1e1d2ec899ec5f2e439dc87e039e6 Mon Sep 17 00:00:00 2001 From: Aahlad C Date: Thu, 19 Mar 2026 12:17:27 -0700 Subject: [PATCH 6/6] Prepare scoring scripts for a100 runs. 1. Update base workloads to 9 (with finewebedu). 2. Remove all logic related to test targets, since they are no longer used. Work only with validation targets. 3. Fix step time computation. --- scoring/performance_profile.py | 2 +- scoring/score_submissions.py | 52 +++++++--------------------------- scoring/scoring_utils.py | 10 ++----- 3 files changed, 15 insertions(+), 49 deletions(-) diff --git a/scoring/performance_profile.py b/scoring/performance_profile.py index 043a65791..eecb9799e 100644 --- a/scoring/performance_profile.py +++ b/scoring/performance_profile.py @@ -59,7 +59,7 @@ # workloads and rules for the scoring to be correct. # We do not use the workload registry since it contains test and development # workloads as well. 
-NUM_BASE_WORKLOADS = 8 +NUM_BASE_WORKLOADS = 9 NUM_VARIANT_WORKLOADS = 0 NUM_TRIALS = 5 NUM_STUDIES = 3 diff --git a/scoring/score_submissions.py b/scoring/score_submissions.py index efe276a33..10e624148 100644 --- a/scoring/score_submissions.py +++ b/scoring/score_submissions.py @@ -75,10 +75,10 @@ FLAGS = flags.FLAGS -def get_summary_df(workload, workload_df, include_test_split=False): +def get_summary_df(workload, workload_df): print(f' WORKLOAD: {workload}') validation_metric, validation_target = ( - scoring_utils.get_workload_metrics_and_targets(workload, split='validation') + scoring_utils.get_workload_metrics_and_targets(workload) ) is_minimized = performance_profile.check_if_minimized(validation_metric) @@ -127,7 +127,7 @@ def get_summary_df(workload, workload_df, include_test_split=False): # compute the step times def delta(series): - return series.shift(1, fill_value=0) - series + return series.apply(lambda x: np.diff(x, prepend=0)) accumulated_time_intervals = delta(workload_df['accumulated_submission_time']) step_intervals = delta(workload_df['global_step']) @@ -136,47 +136,19 @@ def delta(series): f'WARNING: The number of evals may be too low to calculate reliable step time for {workload}' ) - summary_df['step_time (s)'] = np.median( - (accumulated_time_intervals / step_intervals).iloc[0] - ) - - summary_df['step_hint'] = scoring_utils.get_workload_stephint(workload) - - # test metrics - if include_test_split: - test_metric, test_target = scoring_utils.get_workload_metrics_and_targets( - workload, split='test' + # Flatten all intervals from all trials and take the global median + with np.errstate(divide='ignore', invalid='ignore'): + all_ratios = np.concatenate( + (accumulated_time_intervals / step_intervals).values ) + summary_df['step_time (s)'] = np.nanmedian(all_ratios) - summary_df['test target metric name'] = test_metric - summary_df['test target metric value'] = test_target - - summary_df['test target reached'] = ( - workload_df[test_metric] - 
.apply(lambda x: target_op(x, test_target)) - .apply(np.any) - ) - summary_df['best metric value on test'] = workload_df[test_metric].apply( - lambda x: best_op(x) - ) - workload_df['index best eval on test'] = workload_df[test_metric].apply( - lambda x: idx_op(x) - ) - summary_df['time to best eval on test (s)'] = workload_df.apply( - lambda x: x['accumulated_submission_time'][x['index best eval on test']], - axis=1, - ) - summary_df['time to target on test (s)'] = summary_df.apply( - lambda x: x['time to best eval on test (s)'] - if x['test target reached'] - else np.inf, - axis=1, - ) + summary_df['step_hint'] = scoring_utils.get_workload_stephint(workload) return summary_df -def get_submission_summary(df, include_test_split=False): +def get_submission_summary(df): """Summarizes the submission results into metric and time tables organized by workload. """ @@ -184,9 +156,7 @@ def get_submission_summary(df, include_test_split=False): dfs = [] print(df) for workload, group in df.groupby('workload'): - summary_df = get_summary_df( - workload, group, include_test_split=include_test_split - ) + summary_df = get_summary_df(workload, group) dfs.append(summary_df) df = pd.concat(dfs) diff --git a/scoring/scoring_utils.py b/scoring/scoring_utils.py index cb63eab4b..c2ab8aeec 100644 --- a/scoring/scoring_utils.py +++ b/scoring/scoring_utils.py @@ -214,7 +214,7 @@ def get_experiment_df(experiment_dir): ## Get workload properties -def get_workload_metrics_and_targets(workload, split='validation'): +def get_workload_metrics_and_targets(workload): """Returns workload target metric name and value.""" workload_name = re.match(WORKLOAD_NAME_PATTERN, workload).group(1) framework = re.match(WORKLOAD_NAME_PATTERN, workload).group(2) @@ -233,12 +233,8 @@ def get_workload_metrics_and_targets(workload, split='validation'): workload_init_kwargs=workload_init_kwargs, ) metric_name = workload_obj.target_metric_name - if split == 'validation': - metric = f'validation/{metric_name}' - 
target = workload_obj.validation_target_value - elif split == 'test': - metric = f'test/{metric_name}' - target = workload_obj.test_target_value + metric = f'validation/{metric_name}' + target = workload_obj.validation_target_value return metric, target