diff --git a/docker/Dockerfile b/docker/Dockerfile index 88b092b9a..f31ff9e1f 100644 --- a/docker/Dockerfile +++ b/docker/Dockerfile @@ -84,7 +84,7 @@ RUN cd /algorithmic-efficiency && git fetch origin RUN cd /algorithmic-efficiency && git pull # Todo: remove this, this is temporary for developing -COPY scripts/startup.sh /algorithmic-efficiency/docker/scripts/startup.sh +# COPY scripts/startup.sh /algorithmic-efficiency/docker/scripts/startup.sh RUN chmod a+x /algorithmic-efficiency/docker/scripts/startup.sh ENTRYPOINT ["bash", "/algorithmic-efficiency/docker/scripts/startup.sh"] diff --git a/docs/DOCUMENTATION.md b/docs/DOCUMENTATION.md index 15ffc7e65..b32b494c4 100644 --- a/docs/DOCUMENTATION.md +++ b/docs/DOCUMENTATION.md @@ -797,9 +797,9 @@ a rough guideline, the entire set of workloads was designed to have a combined runtime of very roughly $100$ hours on the [**benchmarking hardware**](#benchmarking-hardware). -The eight *AlgoPerf Workloads* are: +The nine *AlgoPerf Workloads* are: - | **Task** | **Dataset** | **Model** | **Loss** | **Metric** | Validation
**Target** | Test
**Target** | Max
**Runtime**
*(in seconds)* | Default
**Dropout**
Value +| | **Task** | **Dataset** | **Model** | **Loss** | **Metric** | Validation
**Target** | Test
**Target** | Max
**Runtime**
*(in seconds)* | Default
**Dropout**
Value ----- | ----------------------------- | ----------- | ----------- | -------- | ---------- | ------------------------ | ------------------ | ------------------------------------- | ------------------------------- **1** | Clickthrough rate prediction | Criteo 1TB | DLRMsmall | CE | CE (↓) | 0.123735 | 0.126041 | 8,915 | 0 **2** | MRI reconstruction | fastMRI | U-Net | L1 | SSIM (↑) | 0.723653 | 0.740633 | 2,745 | 0 diff --git a/pytorch_scoring_config_1.json b/pytorch_scoring_config_1.json new file mode 100644 index 000000000..fcc0e86f2 --- /dev/null +++ b/pytorch_scoring_config_1.json @@ -0,0 +1,299 @@ +{ + "0": { + "framework": "pytorch", + "workload": "imagenet_resnet", + "dataset": "imagenet", + "submission_path": "submissions_algorithms/submissions/self_tuning/schedule_free_adamw_v2/submission.py", + "experiment_dir": "submissions_a100/schedule_free_adamw_v2/study_0", + "rng_seed": -1447200680, + "tuning_ruleset": "self", + "num_tuning_trials": 1, + "max_global_steps": 10 + }, + "1": { + "framework": "pytorch", + "workload": "imagenet_resnet", + "dataset": "imagenet", + "submission_path": "submissions_algorithms/submissions/self_tuning/schedule_free_adamw_v2/submission.py", + "experiment_dir": "submissions_a100/schedule_free_adamw_v2/study_1", + "rng_seed": -1977906563, + "tuning_ruleset": "self", + "num_tuning_trials": 1, + "max_global_steps": 10 + }, + "2": { + "framework": "pytorch", + "workload": "imagenet_resnet", + "dataset": "imagenet", + "submission_path": "submissions_algorithms/submissions/self_tuning/schedule_free_adamw_v2/submission.py", + "experiment_dir": "submissions_a100/schedule_free_adamw_v2/study_2", + "rng_seed": 666869491, + "tuning_ruleset": "self", + "num_tuning_trials": 1, + "max_global_steps": 10 + }, + "3": { + "framework": "pytorch", + "workload": "imagenet_vit", + "dataset": "imagenet", + "submission_path": "submissions_algorithms/submissions/self_tuning/schedule_free_adamw_v2/submission.py", + "experiment_dir": 
"submissions_a100/schedule_free_adamw_v2/study_0", + "rng_seed": -796448826, + "tuning_ruleset": "self", + "num_tuning_trials": 1, + "max_global_steps": 10 + }, + "4": { + "framework": "pytorch", + "workload": "imagenet_vit", + "dataset": "imagenet", + "submission_path": "submissions_algorithms/submissions/self_tuning/schedule_free_adamw_v2/submission.py", + "experiment_dir": "submissions_a100/schedule_free_adamw_v2/study_1", + "rng_seed": -557820510, + "tuning_ruleset": "self", + "num_tuning_trials": 1, + "max_global_steps": 10 + }, + "5": { + "framework": "pytorch", + "workload": "imagenet_vit", + "dataset": "imagenet", + "submission_path": "submissions_algorithms/submissions/self_tuning/schedule_free_adamw_v2/submission.py", + "experiment_dir": "submissions_a100/schedule_free_adamw_v2/study_2", + "rng_seed": -1307522002, + "tuning_ruleset": "self", + "num_tuning_trials": 1, + "max_global_steps": 10 + }, + "6": { + "framework": "pytorch", + "workload": "fastmri", + "dataset": "fastmri", + "submission_path": "submissions_algorithms/submissions/self_tuning/schedule_free_adamw_v2/submission.py", + "experiment_dir": "submissions_a100/schedule_free_adamw_v2/study_0", + "rng_seed": 1083014187, + "tuning_ruleset": "self", + "num_tuning_trials": 1, + "max_global_steps": 10 + }, + "7": { + "framework": "pytorch", + "workload": "fastmri", + "dataset": "fastmri", + "submission_path": "submissions_algorithms/submissions/self_tuning/schedule_free_adamw_v2/submission.py", + "experiment_dir": "submissions_a100/schedule_free_adamw_v2/study_1", + "rng_seed": -1077277636, + "tuning_ruleset": "self", + "num_tuning_trials": 1, + "max_global_steps": 10 + }, + "8": { + "framework": "pytorch", + "workload": "fastmri", + "dataset": "fastmri", + "submission_path": "submissions_algorithms/submissions/self_tuning/schedule_free_adamw_v2/submission.py", + "experiment_dir": "submissions_a100/schedule_free_adamw_v2/study_2", + "rng_seed": -397959160, + "tuning_ruleset": "self", + 
"num_tuning_trials": 1, + "max_global_steps": 10 + }, + "9": { + "framework": "pytorch", + "workload": "ogbg", + "dataset": "ogbg", + "submission_path": "submissions_algorithms/submissions/self_tuning/schedule_free_adamw_v2/submission.py", + "experiment_dir": "submissions_a100/schedule_free_adamw_v2/study_0", + "rng_seed": 1662399765, + "tuning_ruleset": "self", + "num_tuning_trials": 1, + "max_global_steps": 10 + }, + "10": { + "framework": "pytorch", + "workload": "ogbg", + "dataset": "ogbg", + "submission_path": "submissions_algorithms/submissions/self_tuning/schedule_free_adamw_v2/submission.py", + "experiment_dir": "submissions_a100/schedule_free_adamw_v2/study_1", + "rng_seed": 486196682, + "tuning_ruleset": "self", + "num_tuning_trials": 1, + "max_global_steps": 10 + }, + "11": { + "framework": "pytorch", + "workload": "ogbg", + "dataset": "ogbg", + "submission_path": "submissions_algorithms/submissions/self_tuning/schedule_free_adamw_v2/submission.py", + "experiment_dir": "submissions_a100/schedule_free_adamw_v2/study_2", + "rng_seed": 1039483369, + "tuning_ruleset": "self", + "num_tuning_trials": 1, + "max_global_steps": 10 + }, + "12": { + "framework": "pytorch", + "workload": "wmt", + "dataset": "wmt", + "submission_path": "submissions_algorithms/submissions/self_tuning/schedule_free_adamw_v2/submission.py", + "experiment_dir": "submissions_a100/schedule_free_adamw_v2/study_0", + "rng_seed": -811149048, + "tuning_ruleset": "self", + "num_tuning_trials": 1, + "max_global_steps": 10 + }, + "13": { + "framework": "pytorch", + "workload": "wmt", + "dataset": "wmt", + "submission_path": "submissions_algorithms/submissions/self_tuning/schedule_free_adamw_v2/submission.py", + "experiment_dir": "submissions_a100/schedule_free_adamw_v2/study_1", + "rng_seed": -1485236731, + "tuning_ruleset": "self", + "num_tuning_trials": 1, + "max_global_steps": 10 + }, + "14": { + "framework": "pytorch", + "workload": "wmt", + "dataset": "wmt", + "submission_path": 
"submissions_algorithms/submissions/self_tuning/schedule_free_adamw_v2/submission.py", + "experiment_dir": "submissions_a100/schedule_free_adamw_v2/study_2", + "rng_seed": -439753961, + "tuning_ruleset": "self", + "num_tuning_trials": 1, + "max_global_steps": 10 + }, + "15": { + "framework": "pytorch", + "workload": "librispeech_deepspeech", + "dataset": "librispeech", + "submission_path": "submissions_algorithms/submissions/self_tuning/schedule_free_adamw_v2/submission.py", + "experiment_dir": "submissions_a100/schedule_free_adamw_v2/study_0", + "rng_seed": -1459326687, + "tuning_ruleset": "self", + "num_tuning_trials": 1, + "max_global_steps": 10 + }, + "16": { + "framework": "pytorch", + "workload": "librispeech_deepspeech", + "dataset": "librispeech", + "submission_path": "submissions_algorithms/submissions/self_tuning/schedule_free_adamw_v2/submission.py", + "experiment_dir": "submissions_a100/schedule_free_adamw_v2/study_1", + "rng_seed": 1889675898, + "tuning_ruleset": "self", + "num_tuning_trials": 1, + "max_global_steps": 10 + }, + "17": { + "framework": "pytorch", + "workload": "librispeech_deepspeech", + "dataset": "librispeech", + "submission_path": "submissions_algorithms/submissions/self_tuning/schedule_free_adamw_v2/submission.py", + "experiment_dir": "submissions_a100/schedule_free_adamw_v2/study_2", + "rng_seed": -1297403039, + "tuning_ruleset": "self", + "num_tuning_trials": 1, + "max_global_steps": 10 + }, + "18": { + "framework": "pytorch", + "workload": "criteo1tb", + "dataset": "criteo1tb", + "submission_path": "submissions_algorithms/submissions/self_tuning/schedule_free_adamw_v2/submission.py", + "experiment_dir": "submissions_a100/schedule_free_adamw_v2/study_0", + "rng_seed": -1790695410, + "tuning_ruleset": "self", + "num_tuning_trials": 1, + "max_global_steps": 10 + }, + "19": { + "framework": "pytorch", + "workload": "criteo1tb", + "dataset": "criteo1tb", + "submission_path": 
"submissions_algorithms/submissions/self_tuning/schedule_free_adamw_v2/submission.py", + "experiment_dir": "submissions_a100/schedule_free_adamw_v2/study_1", + "rng_seed": -816806699, + "tuning_ruleset": "self", + "num_tuning_trials": 1, + "max_global_steps": 10 + }, + "20": { + "framework": "pytorch", + "workload": "criteo1tb", + "dataset": "criteo1tb", + "submission_path": "submissions_algorithms/submissions/self_tuning/schedule_free_adamw_v2/submission.py", + "experiment_dir": "submissions_a100/schedule_free_adamw_v2/study_2", + "rng_seed": 1704852417, + "tuning_ruleset": "self", + "num_tuning_trials": 1, + "max_global_steps": 10 + }, + "21": { + "framework": "pytorch", + "workload": "librispeech_conformer", + "dataset": "librispeech", + "submission_path": "submissions_algorithms/submissions/self_tuning/schedule_free_adamw_v2/submission.py", + "experiment_dir": "submissions_a100/schedule_free_adamw_v2/study_0", + "rng_seed": 1605670948, + "tuning_ruleset": "self", + "num_tuning_trials": 1, + "max_global_steps": 10 + }, + "22": { + "framework": "pytorch", + "workload": "librispeech_conformer", + "dataset": "librispeech", + "submission_path": "submissions_algorithms/submissions/self_tuning/schedule_free_adamw_v2/submission.py", + "experiment_dir": "submissions_a100/schedule_free_adamw_v2/study_1", + "rng_seed": -1323816683, + "tuning_ruleset": "self", + "num_tuning_trials": 1, + "max_global_steps": 10 + }, + "23": { + "framework": "pytorch", + "workload": "librispeech_conformer", + "dataset": "librispeech", + "submission_path": "submissions_algorithms/submissions/self_tuning/schedule_free_adamw_v2/submission.py", + "experiment_dir": "submissions_a100/schedule_free_adamw_v2/study_2", + "rng_seed": -1881486829, + "tuning_ruleset": "self", + "num_tuning_trials": 1, + "max_global_steps": 10 + }, + "24": { + "framework": "pytorch", + "workload": "finewebedu_lm", + "dataset": "fineweb_edu_10B", + "submission_path": 
"submissions_algorithms/submissions/self_tuning/schedule_free_adamw_v2/submission.py", + "experiment_dir": "submissions_a100/schedule_free_adamw_v2/study_0", + "rng_seed": -304430747, + "tuning_ruleset": "self", + "num_tuning_trials": 1, + "max_global_steps": 10 + }, + "25": { + "framework": "pytorch", + "workload": "finewebedu_lm", + "dataset": "fineweb_edu_10B", + "submission_path": "submissions_algorithms/submissions/self_tuning/schedule_free_adamw_v2/submission.py", + "experiment_dir": "submissions_a100/schedule_free_adamw_v2/study_1", + "rng_seed": -912336586, + "tuning_ruleset": "self", + "num_tuning_trials": 1, + "max_global_steps": 10 + }, + "26": { + "framework": "pytorch", + "workload": "finewebedu_lm", + "dataset": "fineweb_edu_10B", + "submission_path": "submissions_algorithms/submissions/self_tuning/schedule_free_adamw_v2/submission.py", + "experiment_dir": "submissions_a100/schedule_free_adamw_v2/study_2", + "rng_seed": 1970089239, + "tuning_ruleset": "self", + "num_tuning_trials": 1, + "max_global_steps": 10 + } +} \ No newline at end of file diff --git a/run_pytorch_scoring.sh b/run_pytorch_scoring.sh new file mode 100644 index 000000000..4f0794416 --- /dev/null +++ b/run_pytorch_scoring.sh @@ -0,0 +1,87 @@ +#!/bin/bash + +#SBATCH --nodes=1 # give it a full node +#SBATCH --ntasks-per-node=1 +#SBATCH --array=0-26 +#SBATCH --partition=a100 +#SBATCH --gpus-per-node=4 +#SBATCH --exclusive #this will not allow other jobs to run on this cluster +#SBATCH --output=experiments/tests/updated_schedule_free/job_%A_%a.out +#SBATCH --error=experiments/tests/updated_schedule_free/job_%A_%a.err + +# Usage: sbatch .sh +# This script reads config.json and launches a sbatch job using task +# arrays where each job in the array corresponds to a training run +# for a workload given a random seed and tuning trial index. +# To generate the config.json use make_job_config.py. 
+ +set -x + +# Pull docker image (ATTENTION: you may want to modify this) +REPO="europe-west4-docker.pkg.dev" +IMAGE="europe-west4-docker.pkg.dev/mlcommons-algoperf/algoperf-docker-repo/algoperf_pytorch_main:latest" +yes | gcloud auth configure-docker $REPO +docker pull $IMAGE +# Job config (ATTENTION: you may want to modify this) +config_file="$HOME/algorithmic-efficiency/pytorch_scoring_config_1.json" # Replace with your config file path +LOGS_BUCKET="algoperf-runs" # replace with your bucket used for logging + + +# Function to read a JSON file and extract a value by key +read_json_value() { + local json_file="$1" + local index="$2" + local key="$3" + local value=$(jq -r ".[\"$index\"].$key" "$json_file") + echo "$value" +} + +# Check if jq is installed +if ! command -v jq &> /dev/null +then + echo "jq could not be found. Please install it." + exit 1 +fi + +TASK="$SLURM_ARRAY_TASK_ID" +FRAMEWORK=$(read_json_value "$config_file" "$TASK" "framework") +DATASET=$(read_json_value "$config_file" "$TASK" "dataset") +SUBMISSION_PATH=$(read_json_value "$config_file" "$TASK" "submission_path") +FRAMEWORK=$(read_json_value "$config_file" "$TASK" "framework") +TUNING_SEARCH_SPACE=$(read_json_value "$config_file" "$TASK" "tuning_search_space") +EXPERIMENT_DIR=$(read_json_value "$config_file" "$TASK" "experiment_dir") +MAX_STEPS=$(read_json_value "$config_file" "$TASK" "max_steps") +RNG_SEED=$(read_json_value "$config_file" "$TASK" "rng_seed") +WORKLOAD=$(read_json_value "$config_file" "$TASK" "workload") +HPARAM_START_INDEX=$(read_json_value "$config_file" "$TASK" "hparam_start_index") +HPARAM_END_INDEX=$(read_json_value "$config_file" "$TASK" "hparam_end_index") +NUM_TUNING_TRIALS=$(read_json_value "$config_file" "$TASK" "num_tuning_trials") +TUNING_RULESET=$(read_json_value "$config_file" "$TASK" "tuning_ruleset") +MAX_GLOBAL_STEPS=$(read_json_value "$config_file" "$TASK" "max_global_steps") + +docker run \ + -v /opt/data/:/data/ \ + -v 
$HOME/experiment_runs:/experiment_runs \ + -v $HOME/submissions_algorithms/:/algorithmic-efficiency/submissions_algorithms \ + -v $HOME/algorithmic-efficiency/docker/scripts/startup.sh:/algorithmic-efficiency/docker/scripts/startup.sh \ + --gpus all \ + --ipc=host \ + $IMAGE \ + -d $DATASET \ + -f $FRAMEWORK \ + -s $SUBMISSION_PATH \ + -w $WORKLOAD \ + -t $TUNING_SEARCH_SPACE \ + -e $EXPERIMENT_DIR \ + -c False \ + -o True \ + --rng_seed $RNG_SEED \ + --hparam_start_index $HPARAM_START_INDEX \ + --hparam_end_index $HPARAM_END_INDEX \ + --num_tuning_trials $NUM_TUNING_TRIALS \ + --tuning_ruleset $TUNING_RULESET \ + -i true \ + -r false \ + --logs_bucket $LOGS_BUCKET \ + -m $MAX_GLOBAL_STEPS + diff --git a/scoring/performance_profile.py b/scoring/performance_profile.py index 043a65791..eecb9799e 100644 --- a/scoring/performance_profile.py +++ b/scoring/performance_profile.py @@ -59,7 +59,7 @@ # workloads and rules for the scoring to be correct. # We do not use the workload registry since it contains test and development # workloads as well. 
-NUM_BASE_WORKLOADS = 8 +NUM_BASE_WORKLOADS = 9 NUM_VARIANT_WORKLOADS = 0 NUM_TRIALS = 5 NUM_STUDIES = 3 diff --git a/scoring/score_submissions.py b/scoring/score_submissions.py index efe276a33..10e624148 100644 --- a/scoring/score_submissions.py +++ b/scoring/score_submissions.py @@ -75,10 +75,10 @@ FLAGS = flags.FLAGS -def get_summary_df(workload, workload_df, include_test_split=False): +def get_summary_df(workload, workload_df): print(f' WORKLOAD: {workload}') validation_metric, validation_target = ( - scoring_utils.get_workload_metrics_and_targets(workload, split='validation') + scoring_utils.get_workload_metrics_and_targets(workload) ) is_minimized = performance_profile.check_if_minimized(validation_metric) @@ -127,7 +127,7 @@ def get_summary_df(workload, workload_df, include_test_split=False): # compute the step times def delta(series): - return series.shift(1, fill_value=0) - series + return series.apply(lambda x: np.diff(x, prepend=0)) accumulated_time_intervals = delta(workload_df['accumulated_submission_time']) step_intervals = delta(workload_df['global_step']) @@ -136,47 +136,19 @@ def delta(series): f'WARNING: The number of evals may be too low to calculate reliable step time for {workload}' ) - summary_df['step_time (s)'] = np.median( - (accumulated_time_intervals / step_intervals).iloc[0] - ) - - summary_df['step_hint'] = scoring_utils.get_workload_stephint(workload) - - # test metrics - if include_test_split: - test_metric, test_target = scoring_utils.get_workload_metrics_and_targets( - workload, split='test' + # Flatten all intervals from all trials and take the global median + with np.errstate(divide='ignore', invalid='ignore'): + all_ratios = np.concatenate( + (accumulated_time_intervals / step_intervals).values ) + summary_df['step_time (s)'] = np.nanmedian(all_ratios) - summary_df['test target metric name'] = test_metric - summary_df['test target metric value'] = test_target - - summary_df['test target reached'] = ( - workload_df[test_metric] - 
.apply(lambda x: target_op(x, test_target)) - .apply(np.any) - ) - summary_df['best metric value on test'] = workload_df[test_metric].apply( - lambda x: best_op(x) - ) - workload_df['index best eval on test'] = workload_df[test_metric].apply( - lambda x: idx_op(x) - ) - summary_df['time to best eval on test (s)'] = workload_df.apply( - lambda x: x['accumulated_submission_time'][x['index best eval on test']], - axis=1, - ) - summary_df['time to target on test (s)'] = summary_df.apply( - lambda x: x['time to best eval on test (s)'] - if x['test target reached'] - else np.inf, - axis=1, - ) + summary_df['step_hint'] = scoring_utils.get_workload_stephint(workload) return summary_df -def get_submission_summary(df, include_test_split=False): +def get_submission_summary(df): """Summarizes the submission results into metric and time tables organized by workload. """ @@ -184,9 +156,7 @@ def get_submission_summary(df, include_test_split=False): dfs = [] print(df) for workload, group in df.groupby('workload'): - summary_df = get_summary_df( - workload, group, include_test_split=include_test_split - ) + summary_df = get_summary_df(workload, group) dfs.append(summary_df) df = pd.concat(dfs) diff --git a/scoring/scoring_utils.py b/scoring/scoring_utils.py index cb63eab4b..c2ab8aeec 100644 --- a/scoring/scoring_utils.py +++ b/scoring/scoring_utils.py @@ -214,7 +214,7 @@ def get_experiment_df(experiment_dir): ## Get workload properties -def get_workload_metrics_and_targets(workload, split='validation'): +def get_workload_metrics_and_targets(workload): """Returns workload target metric name and value.""" workload_name = re.match(WORKLOAD_NAME_PATTERN, workload).group(1) framework = re.match(WORKLOAD_NAME_PATTERN, workload).group(2) @@ -233,12 +233,8 @@ def get_workload_metrics_and_targets(workload, split='validation'): workload_init_kwargs=workload_init_kwargs, ) metric_name = workload_obj.target_metric_name - if split == 'validation': - metric = f'validation/{metric_name}' - 
target = workload_obj.validation_target_value - elif split == 'test': - metric = f'test/{metric_name}' - target = workload_obj.test_target_value + metric = f'validation/{metric_name}' + target = workload_obj.validation_target_value return metric, target diff --git a/scoring/utils/slurm/make_job_config.py b/scoring/utils/slurm/make_job_config.py index 39f6f5eb0..91afc9196 100644 --- a/scoring/utils/slurm/make_job_config.py +++ b/scoring/utils/slurm/make_job_config.py @@ -9,6 +9,7 @@ import json import os +import struct import jax from absl import app, flags @@ -17,8 +18,6 @@ TUNING_SEARCH_SPACE = ( 'reference_algorithms/paper_baselines/adamw/tuning_search_space.json' ) -NUM_TUNING_TRIALS = 3 # For external tuning ruleset -NUM_STUDIES = 3 flags.DEFINE_string( 'submission_path', @@ -35,11 +34,6 @@ 'experiments', 'Path to experiment dir where logs will be saved.', ) -flags.DEFINE_string( - 'experiment_dir', - 'experiments/', - 'Path to experiment dir where logs will be saved.', -) flags.DEFINE_enum( 'framework', 'jax', @@ -56,14 +50,13 @@ flags.DEFINE_string( 'workloads', None, help='Comma seperated list of workloads to run.' 
) -flags.DEFINE_integer('num_studies', NUM_STUDIES, help='Number of studies.') +flags.DEFINE_integer('num_studies', None, help='Number of studies.') +flags.DEFINE_integer('num_tuning_trials', None, help='Number of tuning trials.') FLAGS = flags.FLAGS MIN_INT = -(2 ** (31)) MAX_INT = 2 ** (31) - 1 -NUM_TUNING_TRIALS = 5 # For external tuning ruleset -NUM_STUDIES = 3 WORKLOADS = { 'imagenet_resnet': {'dataset': 'imagenet'}, @@ -74,6 +67,12 @@ 'librispeech_deepspeech': {'dataset': 'librispeech'}, 'criteo1tb': {'dataset': 'criteo1tb'}, 'librispeech_conformer': {'dataset': 'librispeech'}, + 'finewebedu_lm': {'dataset': 'fineweb_edu_10B'} +} + +RULESET_CONFIGS = { + 'self': {'num_studies': 3, 'num_tuning_trials': 1}, + 'external': {'num_studies': 3, 'num_tuning_trials': 5}, } @@ -83,17 +82,29 @@ def main(_): else: workloads = FLAGS.workloads.split(',') - key = jax.random.key(FLAGS.seed) + if not FLAGS.seed: + FLAGS.seed = struct.unpack('I', os.urandom(4))[0] + + # Set defaults based on tuning_ruleset if not provided by user + num_studies = FLAGS.num_studies + if num_studies is None: + num_studies = RULESET_CONFIGS[FLAGS.tuning_ruleset]['num_studies'] + + num_tuning_trials = FLAGS.num_tuning_trials + if num_tuning_trials is None: + num_tuning_trials = RULESET_CONFIGS[FLAGS.tuning_ruleset]['num_tuning_trials'] + + key = jax.random.PRNGKey(FLAGS.seed) jobs = [] for workload in workloads: # Fold in hash(workload) mod(max(uint32)) workload_key = jax.random.fold_in(key, hash(workload) % (2**32 - 1)) - for study_index in range(NUM_STUDIES): + for study_index in range(num_studies): study_key = jax.random.fold_in(workload_key, study_index) if FLAGS.tuning_ruleset == 'external': - for hparam_index in range(NUM_TUNING_TRIALS): + for hparam_index in range(num_tuning_trials): run_key = jax.random.fold_in(study_key, hparam_index) seed = jax.random.randint(run_key, (1,), MIN_INT, MAX_INT)[0].item() print(seed) @@ -107,7 +118,7 @@ def main(_): job['experiment_dir'] = study_dir 
job['rng_seed'] = seed job['tuning_ruleset'] = FLAGS.tuning_ruleset - job['num_tuning_trials'] = NUM_TUNING_TRIALS + job['num_tuning_trials'] = num_tuning_trials job['hparam_start_index'] = hparam_index job['hparam_end_index'] = hparam_index + 1 job['tuning_search_space'] = FLAGS.tuning_search_space diff --git a/scoring/utils/slurm/run_jobs.sh b/scoring/utils/slurm/run_jobs.sh index 5fcf8f69e..5232387be 100644 --- a/scoring/utils/slurm/run_jobs.sh +++ b/scoring/utils/slurm/run_jobs.sh @@ -2,31 +2,21 @@ #SBATCH --nodes=1 # give it a full node #SBATCH --ntasks-per-node=1 -#SBATCH --array= -#SBATCH --partition=v100 -#SBATCH --gpus-per-node=8 +#SBATCH --array=0-26 +#SBATCH --partition=a100 +#SBATCH --gpus-per-node=4 #SBATCH --exclusive #this will not allow other jobs to run on this cluster -#SBATCH --output=experiments/tests/jit_debug_deepspeech_old_stephint_nadamw/job_%A_%a.out -#SBATCH --error=experiments/tests/jit_debug_deepspeech_old_stephint_nadamw/job_%A_%a.err +#SBATCH --output=experiments/tests/updated_schedule_free/job_%A_%a.out +#SBATCH --error=experiments/tests/updated_schedule_free/job_%A_%a.err -# Usage: sbatch .sh +# Usage: sbatch .sh [options] # This script reads config.json and launches a sbatch job using task -# arrays where each job in the array corresponds to a training run +# arrays where each job in the array corresponds to a training run # for a workload given a random seed and tuning trial index. # To generate the config.json use make_job_config.py. 
set -x -# Pull docker image (ATTENTION: you may want to modify this) -REPO="" -IMAGE="" -y | gcloud auth configure-docker $REPO -docker pull $IMAGE -# Job config (ATTENTION: you may want to modify this) -config_file="" # Replace with your config file path -LOGS_BUCKET="" # replace with your bucket used for logging - - # Function to read a JSON file and extract a value by key read_json_value() { local json_file="$1" @@ -43,41 +33,137 @@ then exit 1 fi -TASK="$SLURM_ARRAY_TASK_ID" -FRAMEWORK=$(read_json_value "$config_file" "$TASK" "framework") -DATASET=$(read_json_value "$config_file" "$TASK" "dataset") -SUBMISSION_PATH=$(read_json_value "$config_file" "$TASK" "submission_path") -FRAMEWORK=$(read_json_value "$config_file" "$TASK" "framework") -TUNING_SEARCH_SPACE=$(read_json_value "$config_file" "$TASK" "tuning_search_space") -EXPERIMENT_DIR=$(read_json_value "$config_file" "$TASK" "experiment_dir") -MAX_STEPS=$(read_json_value "$config_file" "$TASK" "max_steps") -RNG_SEED=$(read_json_value "$config_file" "$TASK" "rng_seed") -WORKLOAD=$(read_json_value "$config_file" "$TASK" "workload") -HPARAM_START_INDEX=$(read_json_value "$config_file" "$TASK" "hparam_start_index") -HPARAM_END_INDEX=$(read_json_value "$config_file" "$TASK" "hparam_end_index") -NUM_TUNING_TRIALS=$(read_json_value "$config_file" "$TASK" "num_tuning_trials") -TUNING_RULESET=$(read_json_value "$config_file" "$TASK" "tuning_ruleset") -MAX_GLOBAL_STEPS=$(read_json_value "$config_file" "$MAX_GLOBAL_STEPS" "max_global_steps") +# Default values +REPO="europe-west4-docker.pkg.dev" +IMAGE="europe-west4-docker.pkg.dev/mlcommons-algoperf/algoperf-docker-repo/algoperf_pytorch_main:latest" +CONFIG_FILE="$HOME/algorithmic-efficiency/pytorch_scoring_config_1.json" +LOGS_BUCKET="algoperf-runs" +TASK_ID="${SLURM_ARRAY_TASK_ID:-0}" + +# Parse flags +while [[ $# -gt 0 ]]; do + case $1 in + --repo) + REPO="$2" + shift 2 + ;; + --image) + IMAGE="$2" + shift 2 + ;; + --config_file) + CONFIG_FILE="$2" + shift 2 + ;; + 
--logs_bucket) + LOGS_BUCKET="$2" + shift 2 + ;; + --task_id) + TASK_ID="$2" + shift 2 + ;; + --framework) + FRAMEWORK="$2" + shift 2 + ;; + --dataset) + DATASET="$2" + shift 2 + ;; + --submission_path) + SUBMISSION_PATH="$2" + shift 2 + ;; + --tuning_search_space) + TUNING_SEARCH_SPACE="$2" + shift 2 + ;; + --experiment_dir) + EXPERIMENT_DIR="$2" + shift 2 + ;; + --rng_seed) + RNG_SEED="$2" + shift 2 + ;; + --workload) + WORKLOAD="$2" + shift 2 + ;; + --hparam_start_index) + HPARAM_START_INDEX="$2" + shift 2 + ;; + --hparam_end_index) + HPARAM_END_INDEX="$2" + shift 2 + ;; + --num_tuning_trials) + NUM_TUNING_TRIALS="$2" + shift 2 + ;; + --tuning_ruleset) + TUNING_RULESET="$2" + shift 2 + ;; + --max_global_steps) + MAX_GLOBAL_STEPS="$2" + shift 2 + ;; + *) + echo "Unknown option $1" + exit 1 + ;; + esac +done + +# Pull docker image +yes | gcloud auth configure-docker "$REPO" +docker pull "$IMAGE" + +# Set variables from config file if not already set by flags +FRAMEWORK="${FRAMEWORK:-$(read_json_value "$CONFIG_FILE" "$TASK_ID" "framework")}" +DATASET="${DATASET:-$(read_json_value "$CONFIG_FILE" "$TASK_ID" "dataset")}" +SUBMISSION_PATH="${SUBMISSION_PATH:-$(read_json_value "$CONFIG_FILE" "$TASK_ID" "submission_path")}" +TUNING_SEARCH_SPACE="${TUNING_SEARCH_SPACE:-$(read_json_value "$CONFIG_FILE" "$TASK_ID" "tuning_search_space")}" +EXPERIMENT_DIR="${EXPERIMENT_DIR:-$(read_json_value "$CONFIG_FILE" "$TASK_ID" "experiment_dir")}" +RNG_SEED="${RNG_SEED:-$(read_json_value "$CONFIG_FILE" "$TASK_ID" "rng_seed")}" +WORKLOAD="${WORKLOAD:-$(read_json_value "$CONFIG_FILE" "$TASK_ID" "workload")}" +HPARAM_START_INDEX="${HPARAM_START_INDEX:-$(read_json_value "$CONFIG_FILE" "$TASK_ID" "hparam_start_index")}" +HPARAM_END_INDEX="${HPARAM_END_INDEX:-$(read_json_value "$CONFIG_FILE" "$TASK_ID" "hparam_end_index")}" +NUM_TUNING_TRIALS="${NUM_TUNING_TRIALS:-$(read_json_value "$CONFIG_FILE" "$TASK_ID" "num_tuning_trials")}" +TUNING_RULESET="${TUNING_RULESET:-$(read_json_value 
"$CONFIG_FILE" "$TASK_ID" "tuning_ruleset")}" + +DOCKER_CMD=( + docker run + -v /opt/data/:/data/ + -v "$HOME/experiment_runs:/experiment_runs" + -v "$HOME/submissions_algorithms/:/algorithmic-efficiency/submissions_algorithms" + -v "$HOME/algorithmic-efficiency/docker/scripts/startup.sh:/algorithmic-efficiency/docker/scripts/startup.sh" + --gpus all + --ipc=host + "$IMAGE" + -d "$DATASET" + -f "$FRAMEWORK" + -s "$SUBMISSION_PATH" + -w "$WORKLOAD" + -t "$TUNING_SEARCH_SPACE" + -e "$EXPERIMENT_DIR" + -c False + -o True + --rng_seed "$RNG_SEED" + --hparam_start_index "$HPARAM_START_INDEX" + --hparam_end_index "$HPARAM_END_INDEX" + --num_tuning_trials "$NUM_TUNING_TRIALS" + --tuning_ruleset "$TUNING_RULESET" + -i true + -r false + --logs_bucket "$LOGS_BUCKET" +) + +if [ -n "$MAX_GLOBAL_STEPS" ]; then + DOCKER_CMD+=(-m "$MAX_GLOBAL_STEPS") +fi -docker run \ - -v /opt/data/:/data/ \ - -v $HOME/submissions_algorithms/:/algorithmic-efficiency/submissions_algorithms \ - --gpus all \ - --ipc=host \ - $IMAGE \ - -d $DATASET \ - -f $FRAMEWORK \ - -s $SUBMISSION_PATH \ - -w $WORKLOAD \ - -t $TUNING_SEARCH_SPACE \ - -e $EXPERIMENT_DIR \ - -c False \ - -o True \ - --rng_seed $RNG_SEED \ - --hparam_start_index $HPARAM_START_INDEX \ - --hparam_end_index $HPARAM_END_INDEX \ - --num_tuning_trials $NUM_TUNING_TRIALS \ - --tuning_ruleset $TUNING_RULESET \ - --logs_bucket $LOGS_BUCKET \ - -i true \ - -r false \ No newline at end of file +"${DOCKER_CMD[@]}" diff --git a/scoring/utils/slurm/run_submission.sh b/scoring/utils/slurm/run_submission.sh new file mode 100644 index 000000000..ddef9f586 --- /dev/null +++ b/scoring/utils/slurm/run_submission.sh @@ -0,0 +1,204 @@ +#!/bin/bash + +# Usage: +# ./algorithmic-efficiency/scoring/utils/slurm/run_submission.sh \ +# --submission_path submissions_algorithms/submissions/self_tuning/schedule_free_adamw_v2 +# +# Note: --dry_run is true by default (sets MAX_GLOBAL_STEPS=10). +# To perform a full run, explicitly set --dry_run false. 
# Abort on the first failing command, on use of an unset variable, and when any
# stage of a pipeline fails (the `yq ... | tr ...` pipelines below would
# otherwise hide a failing yq behind tr's zero exit status).
set -euo pipefail
set -x

# --- Global Variables ---
# Registry path shared by every AlgoPerf image; the framework name is spliced
# into the image name by image_for_framework.
readonly IMAGE_REPO="europe-west4-docker.pkg.dev/mlcommons-algoperf/algoperf-docker-repo"

SUBMISSION_PATH=""   # set by --submission_path (required)
DRY_RUN=true         # set by --dry_run; "true" or "false"
MAX_GLOBAL_STEPS=10  # cleared by parse_flags when DRY_RUN is false
SUBMISSION_NAME=""   # basename of SUBMISSION_PATH
RULESET=""           # "self" or "external", read from submission_info.yml
FRAMEWORK=""         # "jax" or "pytorch", read from submission_info.yml
ARRAY_RANGE=""       # sbatch --array value, e.g. "0-26"

# --- Helper Functions ---

# Prints each argument on its own line to stderr and exits with status 1.
die() {
  printf '%s\n' "$@" >&2
  exit 1
}

# Prints the docker image reference for the framework given as $1.
image_for_framework() {
  printf '%s/algoperf_%s_main:latest\n' "$IMAGE_REPO" "$1"
}

# Makes sure a `yq` executable is reachable through PATH. If none is found, the
# latest mikefarah/yq release binary for this OS/architecture is downloaded
# into $HOME/.local/bin and that directory is prepended to PATH.
install_yq() {
  local bin_dir="$HOME/.local/bin"

  # Reuse a copy installed by an earlier run even when bin_dir is not on PATH,
  # instead of downloading it again on every invocation.
  if ! command -v yq &> /dev/null && [[ -x "$bin_dir/yq" ]]; then
    export PATH="$bin_dir:$PATH"
  fi
  if command -v yq &> /dev/null; then
    return 0
  fi

  echo "yq not found. Attempting to install locally to $HOME/.local/bin..."
  mkdir -p "$bin_dir"

  # Declaration and assignment are split so a failing uname is not masked by
  # the exit status of `local`.
  local os arch
  os=$(uname | tr '[:upper:]' '[:lower:]')
  arch=$(uname -m)
  case "$arch" in
    x86_64) arch="amd64" ;;
    aarch64) arch="arm64" ;;
  esac

  local yq_url="https://github.com/mikefarah/yq/releases/latest/download/yq_${os}_${arch}"
  # Download to a scratch name first so an interrupted or failed transfer never
  # leaves a broken file under the final name.
  local download="$bin_dir/yq.download"
  if command -v curl &> /dev/null; then
    # -f makes curl exit non-zero on an HTTP error (e.g. no release asset for
    # this OS/arch) instead of saving the error page as the "binary".
    curl -fL "$yq_url" -o "$download"
  elif command -v wget &> /dev/null; then
    wget "$yq_url" -O "$download"
  else
    die "Error: Neither curl nor wget found. Please install yq manually: https://github.com/mikefarah/yq"
  fi
  chmod +x "$download"
  mv -- "$download" "$bin_dir/yq"
  export PATH="$bin_dir:$PATH"
  echo "yq installed successfully to $HOME/.local/bin"
}

# Exits with an error unless the command named by $1 is available.
check_command() {
  if ! command -v "$1" &> /dev/null; then
    die "Error: $1 could not be found. Please install it."
  fi
}

# Checks the working directory layout and the external tools the script needs.
# The script uses relative paths throughout, so it must be started from $HOME
# with both repositories checked out directly underneath it.
verify_environment() {
  if [[ "$PWD" != "$HOME" ]]; then
    die \
      "Error: This script must be run from your home directory ($HOME)." \
      "Expected directory structure:" \
      "  $HOME/" \
      "  ├── algorithmic-efficiency/" \
      "  └── submissions_algorithms/"
  fi

  if [[ ! -d "algorithmic-efficiency" || ! -d "submissions_algorithms" ]]; then
    die \
      "Error: Required repositories not found in the current directory." \
      "Please ensure both 'algorithmic-efficiency' and 'submissions_algorithms' are present in $HOME."
  fi

  install_yq
  check_command "jq"
  # docker (config generation) and sbatch (job submission) are both invoked
  # later; fail here rather than halfway through the run.
  check_command "docker"
  check_command "sbatch"
}

# Parses the command line into SUBMISSION_PATH and DRY_RUN, and clears
# MAX_GLOBAL_STEPS for a full (non-dry) run.
#   --submission_path PATH   submission directory (required)
#   --dry_run true|false     defaults to true; matched case-insensitively
parse_flags() {
  while [[ $# -gt 0 ]]; do
    case "$1" in
      --submission_path)
        [[ $# -ge 2 ]] || die "Error: $1 requires a value."
        # A trailing slash would otherwise end up inside the paths built from
        # SUBMISSION_PATH below.
        SUBMISSION_PATH="${2%/}"
        shift 2
        ;;
      --dry_run)
        [[ $# -ge 2 ]] || die "Error: $1 requires a value."
        DRY_RUN=$(printf '%s' "$2" | tr '[:upper:]' '[:lower:]')
        shift 2
        ;;
      *)
        die "Unknown option $1"
        ;;
    esac
  done

  if [[ -z "$SUBMISSION_PATH" ]]; then
    die "Error: --submission_path is required."
  fi

  # Reject anything but true/false: silently treating a typo such as "flase"
  # as a dry run would cut an intended full run down to 10 steps.
  case "$DRY_RUN" in
    true) ;;
    false) MAX_GLOBAL_STEPS="" ;;
    *) die "Error: --dry_run must be 'true' or 'false' (got '$DRY_RUN')." ;;
  esac
}

# Reads $SUBMISSION_PATH/submission_info.yml and sets SUBMISSION_NAME, RULESET
# and FRAMEWORK. Exits if the file is missing, if its 'ruleset' value contains
# neither "self" nor "external", or if 'framework' is not jax/pytorch.
extract_submission_info() {
  SUBMISSION_NAME=$(basename "$SUBMISSION_PATH")
  local info_file="$SUBMISSION_PATH/submission_info.yml"

  if [[ ! -f "$info_file" ]]; then
    die "Error: $info_file not found."
  fi

  local raw_ruleset
  raw_ruleset=$(yq eval '.ruleset' "$info_file" | tr '[:upper:]' '[:lower:]')
  FRAMEWORK=$(yq eval '.framework' "$info_file" | tr '[:upper:]' '[:lower:]')

  # Parse ruleset by checking for substrings "self" or "external"
  if [[ "$raw_ruleset" == *"self"* ]]; then
    RULESET="self"
  elif [[ "$raw_ruleset" == *"external"* ]]; then
    RULESET="external"
  else
    die "Error: Expected 'ruleset' in $info_file to contain 'self' or 'external' (got '$raw_ruleset')."
  fi

  # Verify framework
  if [[ "$FRAMEWORK" != "jax" && "$FRAMEWORK" != "pytorch" ]]; then
    die "Error: 'framework' in $info_file must be either 'jax' or 'pytorch' (got '$FRAMEWORK')."
  fi

  echo "Submission Name: $SUBMISSION_NAME"
  echo "Ruleset: $RULESET"
  echo "Framework: $FRAMEWORK"
  echo "Dry Run: $DRY_RUN"
  echo "Max Global Steps: $MAX_GLOBAL_STEPS"
}

# Runs make_job_config.py inside the framework's docker image, with the current
# directory mounted as the container's working directory, then renames the
# resulting config.json to <SUBMISSION_NAME>.json.
# NOTE(review): relies on make_job_config.py writing config.json into its
# working directory - confirm against that script.
generate_config() {
  # Dry runs log under a separate experiment prefix from real runs.
  local exp_prefix="submissions_a100_dry_run"
  if [[ "$DRY_RUN" == false ]]; then
    exp_prefix="submissions_a100"
  fi

  local image
  image=$(image_for_framework "$FRAMEWORK")

  docker run \
    --rm \
    -v "$(pwd)":/algorithmic-efficiency \
    -w /algorithmic-efficiency \
    --entrypoint python \
    "$image" \
    algorithmic-efficiency/scoring/utils/slurm/make_job_config.py \
    --framework="$FRAMEWORK" \
    --tuning_ruleset="$RULESET" \
    --submission_path="$SUBMISSION_PATH/submission.py" \
    --experiment_dir="${exp_prefix}/$SUBMISSION_NAME"

  mv -- config.json "$SUBMISSION_NAME.json"
}

# Derives the sbatch array range (0..N-1) from the number of entries in the
# generated job config and creates the directory for the job log files.
prepare_sbatch_array() {
  local num_jobs
  num_jobs=$(jq 'length' "$SUBMISSION_NAME.json")
  if ! [[ "$num_jobs" =~ ^[0-9]+$ ]] || ((num_jobs == 0)); then
    die "Error: No jobs found in $SUBMISSION_NAME.json."
  fi

  ARRAY_RANGE="0-$((num_jobs - 1))"
  echo "Number of jobs: $num_jobs"
  echo "Sbatch array range: $ARRAY_RANGE"

  mkdir -p "experiments/tests/$SUBMISSION_NAME"
}

# Submits run_jobs.sh as an sbatch array job. The array range and log paths
# given here override the #SBATCH defaults inside run_jobs.sh;
# --max_global_steps is forwarded only for dry runs.
run_sbatch() {
  local image
  image=$(image_for_framework "$FRAMEWORK")

  # Built as an array so that every argument stays a single word.
  local sbatch_cmd=(
    sbatch
    --array="$ARRAY_RANGE"
    --output="experiments/tests/$SUBMISSION_NAME/job_%A_%a.out"
    --error="experiments/tests/$SUBMISSION_NAME/job_%A_%a.err"
    "algorithmic-efficiency/scoring/utils/slurm/run_jobs.sh"
    --config_file "$(pwd)/$SUBMISSION_NAME.json"
    --image "$image"
  )

  if [[ -n "$MAX_GLOBAL_STEPS" ]]; then
    sbatch_cmd+=(--max_global_steps "$MAX_GLOBAL_STEPS")
  fi

  "${sbatch_cmd[@]}"
}

# --- Main ---

main() {
  # Validate the command line first so a usage mistake is reported before any
  # tool is downloaded or any container is started.
  parse_flags "$@"
  verify_environment
  extract_submission_info
  generate_config
  prepare_sbatch_array
  run_sbatch
}

main "$@"