diff --git a/.dstack/run-gpu-train.yml b/.dstack/run-gpu-train.yml index ac3e34865..02e70d925 100644 --- a/.dstack/run-gpu-train.yml +++ b/.dstack/run-gpu-train.yml @@ -16,10 +16,13 @@ env: - PROFILE - PROF_SKIP - PROF_ACTIVE + - XLA_PYTHON_CLIENT_PREALLOCATE=false + - UV_HTTP_TIMEOUT=300 commands: - apt-get update && apt-get install -y swig libgl1-mesa-glx libglib2.0-0 libsm6 libxext6 libxrender-dev libgomp1 - - cd /workflow && uv sync + - cd /workflow && uv sync --group playground + - cd /workflow && uv run python -c "from mujoco_playground._src.mjx_env import ensure_menagerie_exists; ensure_menagerie_exists()" - cd /workflow && uv run slm-lab run ${SPEC_VARS} ${SPEC_FILE} ${SPEC_NAME} ${LAB_MODE} --upload-hf resources: @@ -29,7 +32,7 @@ resources: memory: 32GB.. spot_policy: auto -max_duration: 8h +max_duration: 6h max_price: 0.50 retry: on_events: [no-capacity] diff --git a/.githooks/commit-msg b/.githooks/commit-msg index bdbccfec0..d8d6a30b9 100755 --- a/.githooks/commit-msg +++ b/.githooks/commit-msg @@ -1,22 +1,78 @@ #!/usr/bin/env bash -# Validate conventional commit format: type: message | type(scope): message +set -euo pipefail -commit_msg_file="$1" -commit_msg=$(head -1 "$commit_msg_file") +# Conventional Commits validation + idempotent semantic version bump. +# +# Bumps pyproject.toml version based on commit type, always relative +# to the base branch (master) version so repeated commits on a +# feature branch converge to the same result. +# +# Rules (semver): +# breaking (!) → major (X.0.0) +# feat → minor (_.X.0) +# everything else → patch (_._.X) -# Skip merge commits and fixup/squash -if echo "$commit_msg" | grep -qE '^(Merge |fixup! |squash! 
)'; then - exit 0 -fi +readonly COMMIT_MSG_FILE="$1" +readonly COMMIT_MSG="$(head -1 "$COMMIT_MSG_FILE")" +readonly TYPES="feat|fix|docs|chore|refactor|test|perf|ci|style|build" +readonly PYPROJECT="pyproject.toml" +readonly BASE_BRANCH="master" + +# --- Validation --- + +# Skip non-standard commits +[[ "$COMMIT_MSG" =~ ^(Merge\ |fixup!\ |squash!\ ) ]] && exit 0 + +# Enforce conventional commit format +if ! [[ "$COMMIT_MSG" =~ ^($TYPES)(\(.+\))?!?:\ .+ ]]; then + cat >&2 </dev/null \ + | grep '^version = ' | head -1 | sed 's/version = "//;s/"//' || true + fi +} + +base_version="$(read_version "$BASE_BRANCH")" +base_version="${base_version:-$(read_version file)}" +[[ -z "$base_version" ]] && exit 0 + +IFS='.' read -r major minor patch <<< "$base_version" + +# Classify commit and bump. Plain assignments are used because ((x++)) returns status 1 when x is 0, which would abort the hook under set -e. +if [[ "$COMMIT_MSG" =~ ^($TYPES)(\(.+\))?!: ]]; then + major=$((major+1)); minor=0; patch=0 +elif [[ "$COMMIT_MSG" =~ ^feat(\(.+\))?: ]]; then + minor=$((minor+1)); patch=0 +else + patch=$((patch+1)) +fi + +new_version="${major}.${minor}.${patch}" +current="$(read_version file)" + +# Only touch the file if the version actually changed (-i.bak followed by rm works with both GNU and BSD sed) +if [[ "$current" != "$new_version" ]]; then + sed -i.bak "s/^version = \"${current}\"/version = \"${new_version}\"/" "$PYPROJECT"; rm -f "${PYPROJECT}.bak" + git add "$PYPROJECT" + echo "Version: ${base_version} → ${new_version}" +fi diff --git a/docs/BENCHMARKS.md b/docs/BENCHMARKS.md index 7c15e5a08..75179502a 100644 --- a/docs/BENCHMARKS.md +++ b/docs/BENCHMARKS.md @@ -110,11 +110,12 @@ Search budget: ~3-4 trials per dimension (8 trials = 2-3 dims, 16 = 3-4 dims, 20 | Phase | Category | Envs | REINFORCE | SARSA | DQN | DDQN+PER | A2C | PPO | SAC | CrossQ | Overall | |-------|----------|------|-----------|-------|-----|----------|-----|-----|-----|--------|---------| | 1 | Classic Control | 3 | ✅ | ✅ | ⚠️ | ✅ | ✅ | ✅ | ✅ | ⚠️ | Done | -| 2 | Box2D | 2 | N/A | N/A | ⚠️ | ✅ | ❌ | ⚠️ | ⚠️ | ⚠️ | Done | +| 2 | Box2D | 2 | N/A | N/A | ⚠️ | ✅ | | ⚠️ | ⚠️ | ⚠️ | Done | | 3 | MuJoCo | 11 | N/A | N/A | N/A | N/A | 
N/A | ⚠️ | ⚠️ | ⚠️ | Done | -| 4 | Atari | 57 | N/A | N/A | N/A | Skip | Done | Done | Done | ❌ | Done | +| 4 | Atari | 57 | N/A | N/A | N/A | Skip | Done | Done | Done | | Done | +| 5 | Playground | 54 | N/A | N/A | N/A | N/A | N/A | 🔄 | 🔄 | N/A | In progress | -**Legend**: ✅ Solved | ⚠️ Close (>80%) | 📊 Acceptable | ❌ Failed | 🔄 In progress/Pending | Skip Not started | N/A Not applicable +**Legend**: ✅ Solved | ⚠️ Close (>80%) | 📊 Acceptable | Failed | 🔄 In progress/Pending | Skip Not started | N/A Not applicable --- @@ -166,7 +167,7 @@ Search budget: ~3-4 trials per dimension (8 trials = 2-3 dims, 16 = 3-4 dims, 20 | Algorithm | Status | MA | SPEC_FILE | SPEC_NAME | HF Data | |-----------|--------|-----|-----------|-----------|---------| -| A2C | ❌ | -820.74 | [slm_lab/spec/benchmark_arc/a2c/a2c_classic_arc.yaml](../slm_lab/spec/benchmark_arc/a2c/a2c_classic_arc.yaml) | a2c_gae_pendulum_arc | [a2c_gae_pendulum_arc_2026_02_11_162217](https://huggingface.co/datasets/SLM-Lab/benchmark/tree/main/data/a2c_gae_pendulum_arc_2026_02_11_162217) | +| A2C | | -820.74 | [slm_lab/spec/benchmark_arc/a2c/a2c_classic_arc.yaml](../slm_lab/spec/benchmark_arc/a2c/a2c_classic_arc.yaml) | a2c_gae_pendulum_arc | [a2c_gae_pendulum_arc_2026_02_11_162217](https://huggingface.co/datasets/SLM-Lab/benchmark/tree/main/data/a2c_gae_pendulum_arc_2026_02_11_162217) | | PPO | ✅ | -174.87 | [slm_lab/spec/benchmark_arc/ppo/ppo_classic_arc.yaml](../slm_lab/spec/benchmark_arc/ppo/ppo_classic_arc.yaml) | ppo_pendulum_arc | [ppo_pendulum_arc_2026_02_11_162156](https://huggingface.co/datasets/SLM-Lab/benchmark/tree/main/data/ppo_pendulum_arc_2026_02_11_162156) | | SAC | ✅ | -150.97 | [slm_lab/spec/benchmark_arc/sac/sac_classic_arc.yaml](../slm_lab/spec/benchmark_arc/sac/sac_classic_arc.yaml) | sac_pendulum_arc | [sac_pendulum_arc_2026_02_11_162240](https://huggingface.co/datasets/SLM-Lab/benchmark/tree/main/data/sac_pendulum_arc_2026_02_11_162240) | | CrossQ | ✅ | -145.66 | 
[slm_lab/spec/benchmark/crossq/crossq_classic.yaml](../slm_lab/spec/benchmark/crossq/crossq_classic.yaml) | crossq_pendulum | [crossq_pendulum_2026_02_28_130648](https://huggingface.co/datasets/SLM-Lab/benchmark/tree/main/data/crossq_pendulum_2026_02_28_130648) | @@ -185,10 +186,10 @@ Search budget: ~3-4 trials per dimension (8 trials = 2-3 dims, 16 = 3-4 dims, 20 |-----------|--------|-----|-----------|-----------|---------| | DQN | ⚠️ | 195.21 | [slm_lab/spec/benchmark_arc/dqn/dqn_box2d_arc.yaml](../slm_lab/spec/benchmark_arc/dqn/dqn_box2d_arc.yaml) | dqn_concat_lunar_arc | [dqn_concat_lunar_arc_2026_02_11_201407](https://huggingface.co/datasets/SLM-Lab/benchmark/tree/main/data/dqn_concat_lunar_arc_2026_02_11_201407) | | DDQN+PER | ✅ | 265.90 | [slm_lab/spec/benchmark_arc/dqn/dqn_box2d_arc.yaml](../slm_lab/spec/benchmark_arc/dqn/dqn_box2d_arc.yaml) | ddqn_per_concat_lunar_arc | [ddqn_per_concat_lunar_arc_2026_02_13_105115](https://huggingface.co/datasets/SLM-Lab/benchmark/tree/main/data/ddqn_per_concat_lunar_arc_2026_02_13_105115) | -| A2C | ❌ | 27.38 | [slm_lab/spec/benchmark_arc/a2c/a2c_classic_arc.yaml](../slm_lab/spec/benchmark_arc/a2c/a2c_classic_arc.yaml) | a2c_gae_lunar_arc | [a2c_gae_lunar_arc_2026_02_11_224304](https://huggingface.co/datasets/SLM-Lab/benchmark/tree/main/data/a2c_gae_lunar_arc_2026_02_11_224304) | +| A2C | | 27.38 | [slm_lab/spec/benchmark_arc/a2c/a2c_classic_arc.yaml](../slm_lab/spec/benchmark_arc/a2c/a2c_classic_arc.yaml) | a2c_gae_lunar_arc | [a2c_gae_lunar_arc_2026_02_11_224304](https://huggingface.co/datasets/SLM-Lab/benchmark/tree/main/data/a2c_gae_lunar_arc_2026_02_11_224304) | | PPO | ⚠️ | 183.30 | [slm_lab/spec/benchmark_arc/ppo/ppo_box2d_arc.yaml](../slm_lab/spec/benchmark_arc/ppo/ppo_box2d_arc.yaml) | ppo_lunar_arc | [ppo_lunar_arc_2026_02_11_201303](https://huggingface.co/datasets/SLM-Lab/benchmark/tree/main/data/ppo_lunar_arc_2026_02_11_201303) | | SAC | ⚠️ | 106.17 | 
[slm_lab/spec/benchmark_arc/sac/sac_box2d_arc.yaml](../slm_lab/spec/benchmark_arc/sac/sac_box2d_arc.yaml) | sac_lunar_arc | [sac_lunar_arc_2026_02_11_201417](https://huggingface.co/datasets/SLM-Lab/benchmark/tree/main/data/sac_lunar_arc_2026_02_11_201417) | -| CrossQ | ❌ | 139.21 | [slm_lab/spec/benchmark/crossq/crossq_box2d.yaml](../slm_lab/spec/benchmark/crossq/crossq_box2d.yaml) | crossq_lunar | [crossq_lunar_2026_02_28_130733](https://huggingface.co/datasets/SLM-Lab/benchmark/tree/main/data/crossq_lunar_2026_02_28_130733) | +| CrossQ | | 139.21 | [slm_lab/spec/benchmark/crossq/crossq_box2d.yaml](../slm_lab/spec/benchmark/crossq/crossq_box2d.yaml) | crossq_lunar | [crossq_lunar_2026_02_28_130733](https://huggingface.co/datasets/SLM-Lab/benchmark/tree/main/data/crossq_lunar_2026_02_28_130733) | ![LunarLander-v3](plots/LunarLander-v3_multi_trial_graph_mean_returns_ma_vs_frames.png) @@ -200,7 +201,7 @@ Search budget: ~3-4 trials per dimension (8 trials = 2-3 dims, 16 = 3-4 dims, 20 | Algorithm | Status | MA | SPEC_FILE | SPEC_NAME | HF Data | |-----------|--------|-----|-----------|-----------|---------| -| A2C | ❌ | -76.81 | [slm_lab/spec/benchmark_arc/a2c/a2c_classic_arc.yaml](../slm_lab/spec/benchmark_arc/a2c/a2c_classic_arc.yaml) | a2c_gae_lunar_continuous_arc | [a2c_gae_lunar_continuous_arc_2026_02_11_224301](https://huggingface.co/datasets/SLM-Lab/benchmark/tree/main/data/a2c_gae_lunar_continuous_arc_2026_02_11_224301) | +| A2C | | -76.81 | [slm_lab/spec/benchmark_arc/a2c/a2c_classic_arc.yaml](../slm_lab/spec/benchmark_arc/a2c/a2c_classic_arc.yaml) | a2c_gae_lunar_continuous_arc | [a2c_gae_lunar_continuous_arc_2026_02_11_224301](https://huggingface.co/datasets/SLM-Lab/benchmark/tree/main/data/a2c_gae_lunar_continuous_arc_2026_02_11_224301) | | PPO | ⚠️ | 132.58 | [slm_lab/spec/benchmark_arc/ppo/ppo_box2d_arc.yaml](../slm_lab/spec/benchmark_arc/ppo/ppo_box2d_arc.yaml) | ppo_lunar_continuous_arc | 
[ppo_lunar_continuous_arc_2026_02_11_224229](https://huggingface.co/datasets/SLM-Lab/benchmark/tree/main/data/ppo_lunar_continuous_arc_2026_02_11_224229) | | SAC | ⚠️ | 125.00 | [slm_lab/spec/benchmark_arc/sac/sac_box2d_arc.yaml](../slm_lab/spec/benchmark_arc/sac/sac_box2d_arc.yaml) | sac_lunar_continuous_arc | [sac_lunar_continuous_arc_2026_02_12_222203](https://huggingface.co/datasets/SLM-Lab/benchmark/tree/main/data/sac_lunar_continuous_arc_2026_02_12_222203) | | CrossQ | ✅ | 268.91 | [slm_lab/spec/benchmark/crossq/crossq_box2d.yaml](../slm_lab/spec/benchmark/crossq/crossq_box2d.yaml) | crossq_lunar_continuous | [crossq_lunar_continuous_2026_03_01_140517](https://huggingface.co/datasets/SLM-Lab/benchmark/tree/main/data/crossq_lunar_continuous_2026_03_01_140517) | @@ -455,7 +456,7 @@ source .env && slm-lab run-remote --gpu \ - **A2C**: [a2c_atari_arc.yaml](../slm_lab/spec/benchmark_arc/a2c/a2c_atari_arc.yaml) - RMSprop (lr=7e-4), training_frequency=32 - **PPO**: [ppo_atari_arc.yaml](../slm_lab/spec/benchmark_arc/ppo/ppo_atari_arc.yaml) - AdamW (lr=2.5e-4), minibatch=256, horizon=128, epochs=4, max_frame=10e6 - **SAC**: [sac_atari_arc.yaml](../slm_lab/spec/benchmark_arc/sac/sac_atari_arc.yaml) - Categorical SAC, AdamW (lr=3e-4), training_iter=3, training_frequency=4, max_frame=2e6 -- **CrossQ**: [crossq_atari.yaml](../slm_lab/spec/benchmark/crossq/crossq_atari.yaml) - Categorical CrossQ, AdamW (lr=1e-3), training_iter=3, training_frequency=4, max_frame=2e6 (experimental — limited results on 6 games) +- **CrossQ**: [crossq_atari.yaml](../slm_lab/spec/benchmark/crossq/crossq_atari.yaml) - Categorical CrossQ, Adam (lr=1e-3), training_iter=1, training_frequency=4, max_frame=2e6 (experimental — limited results on 6 games) **PPO Lambda Variants** (table shows best result per game): @@ -486,7 +487,7 @@ source .env && slm-lab run-remote --gpu -s env=ENV \ > **Note**: HF Data links marked "-" indicate runs completed but not yet uploaded to HuggingFace. 
Scores are extracted from local trial_metrics. -| ENV | Score | SPEC_NAME | HF Data | +| ENV | MA | SPEC_NAME | HF Data | |-----|-------|-----------|---------| | ALE/AirRaid-v5 | 7042.84 | ppo_atari_arc | [ppo_atari_arc_airraid_2026_02_13_124015](https://huggingface.co/datasets/SLM-Lab/benchmark/tree/main/data/ppo_atari_arc_airraid_2026_02_13_124015) | | | 1832.54 | sac_atari_arc | [sac_atari_arc_airraid_2026_02_17_104002](https://huggingface.co/datasets/SLM-Lab/benchmark/tree/main/data/sac_atari_arc_airraid_2026_02_17_104002) | @@ -530,7 +531,7 @@ source .env && slm-lab run-remote --gpu -s env=ENV \ | ALE/Breakout-v5 | 326.47 | ppo_atari_lam70_arc | [ppo_atari_lam70_arc_breakout_2026_02_13_230455](https://huggingface.co/datasets/SLM-Lab/benchmark/tree/main/data/ppo_atari_lam70_arc_breakout_2026_02_13_230455) | | | 20.23 | sac_atari_arc | [sac_atari_arc_breakout_2026_02_15_201235](https://huggingface.co/datasets/SLM-Lab/benchmark/tree/main/data/sac_atari_arc_breakout_2026_02_15_201235) | | | 273 | a2c_gae_atari_arc | [a2c_gae_atari_breakout_2026_01_31_213610](https://huggingface.co/datasets/SLM-Lab/benchmark/tree/main/data/a2c_gae_atari_breakout_2026_01_31_213610) | -| | ❌ 4.40 | crossq_atari | [crossq_atari_breakout_2026_02_25_030241](https://huggingface.co/datasets/SLM-Lab/benchmark/tree/main/data/crossq_atari_breakout_2026_02_25_030241) | +| | 4.40 | crossq_atari | [crossq_atari_breakout_2026_02_25_030241](https://huggingface.co/datasets/SLM-Lab/benchmark/tree/main/data/crossq_atari_breakout_2026_02_25_030241) | | ALE/Carnival-v5 | 3912.59 | ppo_atari_lam70_arc | [ppo_atari_lam70_arc_carnival_2026_02_13_230438](https://huggingface.co/datasets/SLM-Lab/benchmark/tree/main/data/ppo_atari_lam70_arc_carnival_2026_02_13_230438) | | | 3501.37 | sac_atari_arc | [sac_atari_arc_carnival_2026_02_17_105834](https://huggingface.co/datasets/SLM-Lab/benchmark/tree/main/data/sac_atari_arc_carnival_2026_02_17_105834) | | | 2170 | a2c_gae_atari_arc | 
[a2c_gae_atari_carnival_2026_02_01_082726](https://huggingface.co/datasets/SLM-Lab/benchmark/tree/main/data/a2c_gae_atari_carnival_2026_02_01_082726) | @@ -594,7 +595,7 @@ source .env && slm-lab run-remote --gpu -s env=ENV \ | ALE/MsPacman-v5 | 2330.74 | ppo_atari_lam85_arc | [ppo_atari_lam85_arc_mspacman_2026_02_14_102435](https://huggingface.co/datasets/SLM-Lab/benchmark/tree/main/data/ppo_atari_lam85_arc_mspacman_2026_02_14_102435) | | | 1336.96 | sac_atari_arc | [sac_atari_arc_mspacman_2026_02_17_221523](https://huggingface.co/datasets/SLM-Lab/benchmark/tree/main/data/sac_atari_arc_mspacman_2026_02_17_221523) | | | 2110 | a2c_gae_atari_arc | [a2c_gae_atari_mspacman_2026_02_01_001100](https://huggingface.co/datasets/SLM-Lab/benchmark/tree/main/data/a2c_gae_atari_mspacman_2026_02_01_001100) | -| | ❌ 327.79 | crossq_atari | [crossq_atari_mspacman_2026_02_23_171317](https://huggingface.co/datasets/SLM-Lab/benchmark/tree/main/data/crossq_atari_mspacman_2026_02_23_171317) | +| | 327.79 | crossq_atari | [crossq_atari_mspacman_2026_02_23_171317](https://huggingface.co/datasets/SLM-Lab/benchmark/tree/main/data/crossq_atari_mspacman_2026_02_23_171317) | | ALE/NameThisGame-v5 | 6879.23 | ppo_atari_arc | [ppo_atari_arc_namethisgame_2026_02_14_103319](https://huggingface.co/datasets/SLM-Lab/benchmark/tree/main/data/ppo_atari_arc_namethisgame_2026_02_14_103319) | | | 3992.71 | sac_atari_arc | [sac_atari_arc_namethisgame_2026_02_17_220905](https://huggingface.co/datasets/SLM-Lab/benchmark/tree/main/data/sac_atari_arc_namethisgame_2026_02_17_220905) | | | 5412 | a2c_gae_atari_arc | [a2c_gae_atari_namethisgame_2026_02_01_132733](https://huggingface.co/datasets/SLM-Lab/benchmark/tree/main/data/a2c_gae_atari_namethisgame_2026_02_01_132733) | @@ -604,14 +605,14 @@ source .env && slm-lab run-remote --gpu -s env=ENV \ | ALE/Pong-v5 | 16.69 | ppo_atari_lam85_arc | 
[ppo_atari_lam85_arc_pong_2026_02_14_103722](https://huggingface.co/datasets/SLM-Lab/benchmark/tree/main/data/ppo_atari_lam85_arc_pong_2026_02_14_103722) | | | 10.89 | sac_atari_arc | [sac_atari_arc_pong_2026_02_17_160429](https://huggingface.co/datasets/SLM-Lab/benchmark/tree/main/data/sac_atari_arc_pong_2026_02_17_160429) | | | 10.17 | a2c_gae_atari_arc | [a2c_gae_atari_pong_2026_01_31_213635](https://huggingface.co/datasets/SLM-Lab/benchmark/tree/main/data/a2c_gae_atari_pong_2026_01_31_213635) | -| | ❌ -20.59 | crossq_atari | [crossq_atari_pong_2026_02_23_171158](https://huggingface.co/datasets/SLM-Lab/benchmark/tree/main/data/crossq_atari_pong_2026_02_23_171158) | +| | -20.59 | crossq_atari | [crossq_atari_pong_2026_02_23_171158](https://huggingface.co/datasets/SLM-Lab/benchmark/tree/main/data/crossq_atari_pong_2026_02_23_171158) | | ALE/Pooyan-v5 | 5308.66 | ppo_atari_lam70_arc | [ppo_atari_lam70_arc_pooyan_2026_02_14_114730](https://huggingface.co/datasets/SLM-Lab/benchmark/tree/main/data/ppo_atari_lam70_arc_pooyan_2026_02_14_114730) | | | 2530.78 | sac_atari_arc | [sac_atari_arc_pooyan_2026_02_17_220346](https://huggingface.co/datasets/SLM-Lab/benchmark/tree/main/data/sac_atari_arc_pooyan_2026_02_17_220346) | | | 2997 | a2c_gae_atari_arc | [a2c_gae_atari_pooyan_2026_02_01_132748](https://huggingface.co/datasets/SLM-Lab/benchmark/tree/main/data/a2c_gae_atari_pooyan_2026_02_01_132748) | | ALE/Qbert-v5 | 15460.48 | ppo_atari_arc | [ppo_atari_arc_qbert_2026_02_14_120409](https://huggingface.co/datasets/SLM-Lab/benchmark/tree/main/data/ppo_atari_arc_qbert_2026_02_14_120409) | | | 3331.98 | sac_atari_arc | [sac_atari_arc_qbert_2026_02_17_223117](https://huggingface.co/datasets/SLM-Lab/benchmark/tree/main/data/sac_atari_arc_qbert_2026_02_17_223117) | | | 12619 | a2c_gae_atari_arc | [a2c_gae_atari_qbert_2026_01_31_213720](https://huggingface.co/datasets/SLM-Lab/benchmark/tree/main/data/a2c_gae_atari_qbert_2026_01_31_213720) | -| | ❌ 3189.73 | crossq_atari | 
[crossq_atari_qbert_2026_02_25_030458](https://huggingface.co/datasets/SLM-Lab/benchmark/tree/main/data/crossq_atari_qbert_2026_02_25_030458) | +| | 3189.73 | crossq_atari | [crossq_atari_qbert_2026_02_25_030458](https://huggingface.co/datasets/SLM-Lab/benchmark/tree/main/data/crossq_atari_qbert_2026_02_25_030458) | | ALE/Riverraid-v5 | 9599.75 | ppo_atari_lam85_arc | [ppo_atari_lam85_arc_riverraid_2026_02_14_124700](https://huggingface.co/datasets/SLM-Lab/benchmark/tree/main/data/ppo_atari_lam85_arc_riverraid_2026_02_14_124700) | | | 4744.95 | sac_atari_arc | [sac_atari_arc_riverraid_2026_02_18_014310](https://huggingface.co/datasets/SLM-Lab/benchmark/tree/main/data/sac_atari_arc_riverraid_2026_02_18_014310) | | | 6558 | a2c_gae_atari_arc | [a2c_gae_atari_riverraid_2026_02_01_132507](https://huggingface.co/datasets/SLM-Lab/benchmark/tree/main/data/a2c_gae_atari_riverraid_2026_02_01_132507) | @@ -624,7 +625,7 @@ source .env && slm-lab run-remote --gpu -s env=ENV \ | ALE/Seaquest-v5 | 1775.14 | ppo_atari_arc | [ppo_atari_arc_seaquest_2026_02_11_095444](https://huggingface.co/datasets/SLM-Lab/benchmark/tree/main/data/ppo_atari_arc_seaquest_2026_02_11_095444) | | | 1565.44 | sac_atari_arc | [sac_atari_arc_seaquest_2026_02_18_020822](https://huggingface.co/datasets/SLM-Lab/benchmark/tree/main/data/sac_atari_arc_seaquest_2026_02_18_020822) | | | 850 | a2c_gae_atari_arc | [a2c_gae_atari_seaquest_2026_02_01_001001](https://huggingface.co/datasets/SLM-Lab/benchmark/tree/main/data/a2c_gae_atari_seaquest_2026_02_01_001001) | -| | ❌ 234.63 | crossq_atari | [crossq_atari_seaquest_2026_02_25_030441](https://huggingface.co/datasets/SLM-Lab/benchmark/tree/main/data/crossq_atari_seaquest_2026_02_25_030441) | +| | 234.63 | crossq_atari | [crossq_atari_seaquest_2026_02_25_030441](https://huggingface.co/datasets/SLM-Lab/benchmark/tree/main/data/crossq_atari_seaquest_2026_02_25_030441) | | ALE/Skiing-v5 | -28217.28 | ppo_atari_arc | 
[ppo_atari_arc_skiing_2026_02_14_174807](https://huggingface.co/datasets/SLM-Lab/benchmark/tree/main/data/ppo_atari_arc_skiing_2026_02_14_174807) | | | -17464.22 | sac_atari_arc | [sac_atari_arc_skiing_2026_02_18_024444](https://huggingface.co/datasets/SLM-Lab/benchmark/tree/main/data/sac_atari_arc_skiing_2026_02_18_024444) | | | -14235 | a2c_gae_atari_arc | [a2c_gae_atari_skiing_2026_02_01_132451](https://huggingface.co/datasets/SLM-Lab/benchmark/tree/main/data/a2c_gae_atari_skiing_2026_02_01_132451) | @@ -634,7 +635,7 @@ source .env && slm-lab run-remote --gpu -s env=ENV \ | ALE/SpaceInvaders-v5 | 892.49 | ppo_atari_arc | [ppo_atari_arc_spaceinvaders_2026_02_14_131114](https://huggingface.co/datasets/SLM-Lab/benchmark/tree/main/data/ppo_atari_arc_spaceinvaders_2026_02_14_131114) | | | 507.33 | sac_atari_arc | [sac_atari_arc_spaceinvaders_2026_02_18_033139](https://huggingface.co/datasets/SLM-Lab/benchmark/tree/main/data/sac_atari_arc_spaceinvaders_2026_02_18_033139) | | | 784 | a2c_gae_atari_arc | [a2c_gae_atari_spaceinvaders_2026_02_01_000950](https://huggingface.co/datasets/SLM-Lab/benchmark/tree/main/data/a2c_gae_atari_spaceinvaders_2026_02_01_000950) | -| | ❌ 404.50 | crossq_atari | [crossq_atari_spaceinvaders_2026_02_25_030410](https://huggingface.co/datasets/SLM-Lab/benchmark/tree/main/data/crossq_atari_spaceinvaders_2026_02_25_030410) | +| | 404.50 | crossq_atari | [crossq_atari_spaceinvaders_2026_02_25_030410](https://huggingface.co/datasets/SLM-Lab/benchmark/tree/main/data/crossq_atari_spaceinvaders_2026_02_25_030410) | | ALE/StarGunner-v5 | 49328.73 | ppo_atari_lam70_arc | [ppo_atari_lam70_arc_stargunner_2026_02_14_131149](https://huggingface.co/datasets/SLM-Lab/benchmark/tree/main/data/ppo_atari_lam70_arc_stargunner_2026_02_14_131149) | | | 4295.97 | sac_atari_arc | [sac_atari_arc_stargunner_2026_02_18_033151](https://huggingface.co/datasets/SLM-Lab/benchmark/tree/main/data/sac_atari_arc_stargunner_2026_02_18_033151) | | | 8665 | a2c_gae_atari_arc | 
[a2c_gae_atari_stargunner_2026_02_01_132406](https://huggingface.co/datasets/SLM-Lab/benchmark/tree/main/data/a2c_gae_atari_stargunner_2026_02_01_132406) | @@ -760,3 +761,123 @@ source .env && slm-lab run-remote --gpu -s env=ENV \ +--- + +### Phase 5: MuJoCo Playground (JAX/MJX GPU-Accelerated) + +[MuJoCo Playground](https://google-deepmind.github.io/mujoco_playground/) | Continuous state/action | MJWarp GPU backend + +**Settings**: max_frame 100M | num_envs 2048 | max_session 4 + +**Spec file**: [ppo_playground.yaml](../slm_lab/spec/benchmark_arc/ppo/ppo_playground.yaml) — all envs via `-s env=playground/ENV` + +**Reproduce**: +```bash +source .env && slm-lab run-remote --gpu \ + slm_lab/spec/benchmark_arc/ppo/ppo_playground.yaml SPEC_NAME train \ + -s env=playground/ENV -s max_frame=100000000 -n NAME +``` + +#### Phase 5.1: DM Control Suite (25 envs) + +Classic control and locomotion tasks from the DeepMind Control Suite, ported to MJWarp GPU simulation. + +| ENV | MA | SPEC_NAME | HF Data | +|-----|-----|-----------|---------| +| playground/AcrobotSwingup | 253.24 | ppo_playground_vnorm | [ppo_playground_acrobotswingup_2026_03_12_175809](https://huggingface.co/datasets/SLM-Lab/benchmark/tree/main/data/ppo_playground_acrobotswingup_2026_03_12_175809) | +| playground/AcrobotSwingupSparse | 146.98 | ppo_playground_vnorm | [ppo_playground_vnorm_acrobotswingupsparse_2026_03_14_161212](https://huggingface.co/datasets/SLM-Lab/benchmark/tree/main/data/ppo_playground_vnorm_acrobotswingupsparse_2026_03_14_161212) | +| playground/BallInCup | 942.44 | ppo_playground_vnorm | [ppo_playground_ballincup_2026_03_12_105443](https://huggingface.co/datasets/SLM-Lab/benchmark/tree/main/data/ppo_playground_ballincup_2026_03_12_105443) | +| playground/CartpoleBalance | 968.23 | ppo_playground_vnorm | [ppo_playground_cartpolebalance_2026_03_12_141924](https://huggingface.co/datasets/SLM-Lab/benchmark/tree/main/data/ppo_playground_cartpolebalance_2026_03_12_141924) | +| 
playground/CartpoleBalanceSparse | 995.34 | ppo_playground_constlr | [ppo_playground_constlr_cartpolebalancesparse_2026_03_14_000352](https://huggingface.co/datasets/SLM-Lab/benchmark/tree/main/data/ppo_playground_constlr_cartpolebalancesparse_2026_03_14_000352) | +| playground/CartpoleSwingup | 729.09 | ppo_playground_constlr | [ppo_playground_constlr_cartpoleswingup_2026_03_17_041102](https://huggingface.co/datasets/SLM-Lab/benchmark/tree/main/data/ppo_playground_constlr_cartpoleswingup_2026_03_17_041102) | +| playground/CartpoleSwingupSparse | 521.98 | ppo_playground_constlr | [ppo_playground_constlr_cartpoleswingupsparse_2026_03_13_233449](https://huggingface.co/datasets/SLM-Lab/benchmark/tree/main/data/ppo_playground_constlr_cartpoleswingupsparse_2026_03_13_233449) | +| playground/CheetahRun | 883.44 | ppo_playground_vnorm | [ppo_playground_vnorm_cheetahrun_2026_03_14_161211](https://huggingface.co/datasets/SLM-Lab/benchmark/tree/main/data/ppo_playground_vnorm_cheetahrun_2026_03_14_161211) | +| playground/FingerSpin | 713.35 | ppo_playground_fingerspin | [ppo_playground_fingerspin_fingerspin_2026_03_13_033911](https://huggingface.co/datasets/SLM-Lab/benchmark/tree/main/data/ppo_playground_fingerspin_fingerspin_2026_03_13_033911) | +| playground/FingerTurnEasy | 663.58 | ppo_playground_vnorm | [ppo_playground_fingerturneasy_2026_03_12_175835](https://huggingface.co/datasets/SLM-Lab/benchmark/tree/main/data/ppo_playground_fingerturneasy_2026_03_12_175835) | +| playground/FingerTurnHard | 590.43 | ppo_playground_vnorm_constlr | [ppo_playground_vnorm_constlr_fingerturnhard_2026_03_16_234509](https://huggingface.co/datasets/SLM-Lab/benchmark/tree/main/data/ppo_playground_vnorm_constlr_fingerturnhard_2026_03_16_234509) | +| playground/FishSwim | 580.57 | ppo_playground_vnorm_constlr_clip03 | 
[ppo_playground_vnorm_constlr_clip03_fishswim_2026_03_14_002112](https://huggingface.co/datasets/SLM-Lab/benchmark/tree/main/data/ppo_playground_vnorm_constlr_clip03_fishswim_2026_03_14_002112) | +| playground/HopperHop | 22.00 | ppo_playground_vnorm | [ppo_playground_hopperhop_2026_03_12_110855](https://huggingface.co/datasets/SLM-Lab/benchmark/tree/main/data/ppo_playground_hopperhop_2026_03_12_110855) | +| playground/HopperStand | 237.15 | ppo_playground_vnorm | [ppo_playground_vnorm_hopperstand_2026_03_14_095438](https://huggingface.co/datasets/SLM-Lab/benchmark/tree/main/data/ppo_playground_vnorm_hopperstand_2026_03_14_095438) | +| playground/HumanoidRun | 18.83 | ppo_playground_humanoid | [ppo_playground_humanoid_humanoidrun_2026_03_14_115522](https://huggingface.co/datasets/SLM-Lab/benchmark/tree/main/data/ppo_playground_humanoid_humanoidrun_2026_03_14_115522) | +| playground/HumanoidStand | 114.86 | ppo_playground_humanoid | [ppo_playground_humanoid_humanoidstand_2026_03_14_115516](https://huggingface.co/datasets/SLM-Lab/benchmark/tree/main/data/ppo_playground_humanoid_humanoidstand_2026_03_14_115516) | +| playground/HumanoidWalk | 47.01 | ppo_playground_humanoid | [ppo_playground_humanoid_humanoidwalk_2026_03_14_172235](https://huggingface.co/datasets/SLM-Lab/benchmark/tree/main/data/ppo_playground_humanoid_humanoidwalk_2026_03_14_172235) | +| playground/PendulumSwingup | 637.46 | ppo_playground_pendulum | [ppo_playground_pendulum_pendulumswingup_2026_03_13_033818](https://huggingface.co/datasets/SLM-Lab/benchmark/tree/main/data/ppo_playground_pendulum_pendulumswingup_2026_03_13_033818) | +| playground/PointMass | 868.09 | ppo_playground_vnorm_constlr | [ppo_playground_vnorm_constlr_pointmass_2026_03_14_095452](https://huggingface.co/datasets/SLM-Lab/benchmark/tree/main/data/ppo_playground_vnorm_constlr_pointmass_2026_03_14_095452) | +| playground/ReacherEasy | 955.08 | ppo_playground_vnorm | 
[ppo_playground_reachereasy_2026_03_12_122115](https://huggingface.co/datasets/SLM-Lab/benchmark/tree/main/data/ppo_playground_reachereasy_2026_03_12_122115) | +| playground/ReacherHard | 946.99 | ppo_playground_vnorm | [ppo_playground_reacherhard_2026_03_12_123226](https://huggingface.co/datasets/SLM-Lab/benchmark/tree/main/data/ppo_playground_reacherhard_2026_03_12_123226) | +| playground/SwimmerSwimmer6 | 591.13 | ppo_playground_vnorm_constlr | [ppo_playground_vnorm_constlr_swimmerswimmer6_2026_03_14_000406](https://huggingface.co/datasets/SLM-Lab/benchmark/tree/main/data/ppo_playground_vnorm_constlr_swimmerswimmer6_2026_03_14_000406) | +| playground/WalkerRun | 759.71 | ppo_playground_vnorm | [ppo_playground_vnorm_walkerrun_2026_03_14_161354](https://huggingface.co/datasets/SLM-Lab/benchmark/tree/main/data/ppo_playground_vnorm_walkerrun_2026_03_14_161354) | +| playground/WalkerStand | 948.35 | ppo_playground_vnorm | [ppo_playground_vnorm_walkerstand_2026_03_14_161415](https://huggingface.co/datasets/SLM-Lab/benchmark/tree/main/data/ppo_playground_vnorm_walkerstand_2026_03_14_161415) | +| playground/WalkerWalk | 945.31 | ppo_playground_vnorm | [ppo_playground_vnorm_walkerwalk_2026_03_14_161338](https://huggingface.co/datasets/SLM-Lab/benchmark/tree/main/data/ppo_playground_vnorm_walkerwalk_2026_03_14_161338) | + +| | | | +|---|---|---| +| ![AcrobotSwingup](plots/AcrobotSwingup_multi_trial_graph_mean_returns_ma_vs_frames.png) | ![AcrobotSwingupSparse](plots/AcrobotSwingupSparse_multi_trial_graph_mean_returns_ma_vs_frames.png) | ![BallInCup](plots/BallInCup_multi_trial_graph_mean_returns_ma_vs_frames.png) | +| ![CartpoleBalance](plots/CartpoleBalance_multi_trial_graph_mean_returns_ma_vs_frames.png) | ![CartpoleBalanceSparse](plots/CartpoleBalanceSparse_multi_trial_graph_mean_returns_ma_vs_frames.png) | ![CartpoleSwingup](plots/CartpoleSwingup_multi_trial_graph_mean_returns_ma_vs_frames.png) | +| 
![CartpoleSwingupSparse](plots/CartpoleSwingupSparse_multi_trial_graph_mean_returns_ma_vs_frames.png) | ![CheetahRun](plots/CheetahRun_multi_trial_graph_mean_returns_ma_vs_frames.png) | ![FingerSpin](plots/FingerSpin_multi_trial_graph_mean_returns_ma_vs_frames.png) | +| ![FingerTurnEasy](plots/FingerTurnEasy_multi_trial_graph_mean_returns_ma_vs_frames.png) | ![FingerTurnHard](plots/FingerTurnHard_multi_trial_graph_mean_returns_ma_vs_frames.png) | ![FishSwim](plots/FishSwim_multi_trial_graph_mean_returns_ma_vs_frames.png) | +| ![HopperHop](plots/HopperHop_multi_trial_graph_mean_returns_ma_vs_frames.png) | ![HopperStand](plots/HopperStand_multi_trial_graph_mean_returns_ma_vs_frames.png) | ![HumanoidRun](plots/HumanoidRun_multi_trial_graph_mean_returns_ma_vs_frames.png) | +| ![HumanoidStand](plots/HumanoidStand_multi_trial_graph_mean_returns_ma_vs_frames.png) | ![HumanoidWalk](plots/HumanoidWalk_multi_trial_graph_mean_returns_ma_vs_frames.png) | ![PendulumSwingup](plots/PendulumSwingup_multi_trial_graph_mean_returns_ma_vs_frames.png) | +| ![PointMass](plots/PointMass_multi_trial_graph_mean_returns_ma_vs_frames.png) | ![ReacherEasy](plots/ReacherEasy_multi_trial_graph_mean_returns_ma_vs_frames.png) | ![ReacherHard](plots/ReacherHard_multi_trial_graph_mean_returns_ma_vs_frames.png) | +| ![SwimmerSwimmer6](plots/SwimmerSwimmer6_multi_trial_graph_mean_returns_ma_vs_frames.png) | ![WalkerRun](plots/WalkerRun_multi_trial_graph_mean_returns_ma_vs_frames.png) | ![WalkerStand](plots/WalkerStand_multi_trial_graph_mean_returns_ma_vs_frames.png) | +| ![WalkerWalk](plots/WalkerWalk_multi_trial_graph_mean_returns_ma_vs_frames.png) | | | + +#### Phase 5.2: Locomotion Robots (19 envs) + +Real-world robot locomotion — quadrupeds (Go1, Spot, Barkour) and humanoids (H1, G1, T1, Op3, Apollo, BerkeleyHumanoid) on flat and rough terrain. 
+ +| ENV | MA | SPEC_NAME | HF Data | +|-----|-----|-----------|---------| +| playground/ApolloJoystickFlatTerrain | 17.44 | ppo_playground_loco_precise | [ppo_playground_loco_precise_apollojoystickflatterrain_2026_03_14_210939](https://huggingface.co/datasets/SLM-Lab/benchmark/tree/main/data/ppo_playground_loco_precise_apollojoystickflatterrain_2026_03_14_210939) | +| playground/BarkourJoystick | 0.0 | ppo_playground_loco | [ppo_playground_loco_barkourjoystick_2026_03_14_194525](https://huggingface.co/datasets/SLM-Lab/benchmark/tree/main/data/ppo_playground_loco_barkourjoystick_2026_03_14_194525) | +| playground/BerkeleyHumanoidJoystickFlatTerrain | 32.29 | ppo_playground_loco_precise | [ppo_playground_loco_precise_berkeleyhumanoidjoystickflatterrain_2026_03_14_213019](https://huggingface.co/datasets/SLM-Lab/benchmark/tree/main/data/ppo_playground_loco_precise_berkeleyhumanoidjoystickflatterrain_2026_03_14_213019) | +| playground/BerkeleyHumanoidJoystickRoughTerrain | 21.25 | ppo_playground_loco_precise | [ppo_playground_loco_precise_berkeleyhumanoidjoystickroughterrain_2026_03_15_150211](https://huggingface.co/datasets/SLM-Lab/benchmark/tree/main/data/ppo_playground_loco_precise_berkeleyhumanoidjoystickroughterrain_2026_03_15_150211) | +| playground/G1JoystickFlatTerrain | 1.85 | ppo_playground_loco_precise | [ppo_playground_loco_precise_g1joystickflatterrain_2026_03_15_150219](https://huggingface.co/datasets/SLM-Lab/benchmark/tree/main/data/ppo_playground_loco_precise_g1joystickflatterrain_2026_03_15_150219) | +| playground/G1JoystickRoughTerrain | -2.75 | ppo_playground_loco_precise | [ppo_playground_loco_precise_g1joystickroughterrain_2026_03_19_015137](https://huggingface.co/datasets/SLM-Lab/benchmark/tree/main/data/ppo_playground_loco_precise_g1joystickroughterrain_2026_03_19_015137) | +| playground/Go1Footstand | 23.48 | ppo_playground_loco_precise | 
[ppo_playground_loco_precise_go1footstand_2026_03_16_174009](https://huggingface.co/datasets/SLM-Lab/benchmark/tree/main/data/ppo_playground_loco_precise_go1footstand_2026_03_16_174009) | +| playground/Go1Getup | 18.16 | ppo_playground_loco_go1 | [ppo_playground_loco_go1_go1getup_2026_03_16_132801](https://huggingface.co/datasets/SLM-Lab/benchmark/tree/main/data/ppo_playground_loco_go1_go1getup_2026_03_16_132801) | +| playground/Go1Handstand | 17.88 | ppo_playground_loco_precise | [ppo_playground_loco_precise_go1handstand_2026_03_16_155437](https://huggingface.co/datasets/SLM-Lab/benchmark/tree/main/data/ppo_playground_loco_precise_go1handstand_2026_03_16_155437) | +| playground/Go1JoystickFlatTerrain | 0.0 | ppo_playground_loco | [ppo_playground_loco_go1joystickflatterrain_2026_03_14_204658](https://huggingface.co/datasets/SLM-Lab/benchmark/tree/main/data/ppo_playground_loco_go1joystickflatterrain_2026_03_14_204658) | +| playground/Go1JoystickRoughTerrain | 0.00 | ppo_playground_loco | [ppo_playground_loco_go1joystickroughterrain_2026_03_15_150321](https://huggingface.co/datasets/SLM-Lab/benchmark/tree/main/data/ppo_playground_loco_go1joystickroughterrain_2026_03_15_150321) | +| playground/H1InplaceGaitTracking | 11.95 | ppo_playground_loco_precise | [ppo_playground_loco_precise_h1inplacegaittracking_2026_03_16_170327](https://huggingface.co/datasets/SLM-Lab/benchmark/tree/main/data/ppo_playground_loco_precise_h1inplacegaittracking_2026_03_16_170327) | +| playground/H1JoystickGaitTracking | 31.11 | ppo_playground_loco_precise | [ppo_playground_loco_precise_h1joystickgaittracking_2026_03_16_170412](https://huggingface.co/datasets/SLM-Lab/benchmark/tree/main/data/ppo_playground_loco_precise_h1joystickgaittracking_2026_03_16_170412) | +| playground/Op3Joystick | 0.00 | ppo_playground_loco | [ppo_playground_loco_op3joystick_2026_03_15_150120](https://huggingface.co/datasets/SLM-Lab/benchmark/tree/main/data/ppo_playground_loco_op3joystick_2026_03_15_150120) | +| 
playground/SpotFlatTerrainJoystick | 48.58 | ppo_playground_loco_precise | [ppo_playground_loco_precise_spotflatterrainjoystick_2026_03_16_180747](https://huggingface.co/datasets/SLM-Lab/benchmark/tree/main/data/ppo_playground_loco_precise_spotflatterrainjoystick_2026_03_16_180747) | +| playground/SpotGetup | 19.39 | ppo_playground_loco | [ppo_playground_loco_spotgetup_2026_03_14_213703](https://huggingface.co/datasets/SLM-Lab/benchmark/tree/main/data/ppo_playground_loco_spotgetup_2026_03_14_213703) | +| playground/SpotJoystickGaitTracking | 36.90 | ppo_playground_loco | [ppo_playground_loco_spotjoystickgaittracking_2026_03_19_015106](https://huggingface.co/datasets/SLM-Lab/benchmark/tree/main/data/ppo_playground_loco_spotjoystickgaittracking_2026_03_19_015106) | +| playground/T1JoystickFlatTerrain | 13.42 | ppo_playground_loco_precise | [ppo_playground_loco_precise_t1joystickflatterrain_2026_03_14_220250](https://huggingface.co/datasets/SLM-Lab/benchmark/tree/main/data/ppo_playground_loco_precise_t1joystickflatterrain_2026_03_14_220250) | +| playground/T1JoystickRoughTerrain | 2.58 | ppo_playground_loco_precise | [ppo_playground_loco_precise_t1joystickroughterrain_2026_03_15_162332](https://huggingface.co/datasets/SLM-Lab/benchmark/tree/main/data/ppo_playground_loco_precise_t1joystickroughterrain_2026_03_15_162332) | + +| | | | +|---|---|---| +| ![ApolloJoystickFlatTerrain](plots/ApolloJoystickFlatTerrain_multi_trial_graph_mean_returns_ma_vs_frames.png) | ![BarkourJoystick](plots/BarkourJoystick_multi_trial_graph_mean_returns_ma_vs_frames.png) | ![BerkeleyHumanoidJoystickFlatTerrain](plots/BerkeleyHumanoidJoystickFlatTerrain_multi_trial_graph_mean_returns_ma_vs_frames.png) | +| ![G1JoystickFlatTerrain](plots/G1JoystickFlatTerrain_multi_trial_graph_mean_returns_ma_vs_frames.png) | ![Go1Footstand](plots/Go1Footstand_multi_trial_graph_mean_returns_ma_vs_frames.png) | ![Go1Handstand](plots/Go1Handstand_multi_trial_graph_mean_returns_ma_vs_frames.png) | +| 
![H1InplaceGaitTracking](plots/H1InplaceGaitTracking_multi_trial_graph_mean_returns_ma_vs_frames.png) | ![H1JoystickGaitTracking](plots/H1JoystickGaitTracking_multi_trial_graph_mean_returns_ma_vs_frames.png) | ![Op3Joystick](plots/Op3Joystick_multi_trial_graph_mean_returns_ma_vs_frames.png) | +| ![SpotFlatTerrainJoystick](plots/SpotFlatTerrainJoystick_multi_trial_graph_mean_returns_ma_vs_frames.png) | ![SpotGetup](plots/SpotGetup_multi_trial_graph_mean_returns_ma_vs_frames.png) | ![SpotJoystickGaitTracking](plots/SpotJoystickGaitTracking_multi_trial_graph_mean_returns_ma_vs_frames.png) | +| ![BerkeleyHumanoidJoystickRoughTerrain](plots/BerkeleyHumanoidJoystickRoughTerrain_multi_trial_graph_mean_returns_ma_vs_frames.png) | ![Go1Getup](plots/Go1Getup_multi_trial_graph_mean_returns_ma_vs_frames.png) | ![Go1JoystickFlatTerrain](plots/Go1JoystickFlatTerrain_multi_trial_graph_mean_returns_ma_vs_frames.png) | +| ![Go1JoystickRoughTerrain](plots/Go1JoystickRoughTerrain_multi_trial_graph_mean_returns_ma_vs_frames.png) | ![T1JoystickFlatTerrain](plots/T1JoystickFlatTerrain_multi_trial_graph_mean_returns_ma_vs_frames.png) | ![T1JoystickRoughTerrain](plots/T1JoystickRoughTerrain_multi_trial_graph_mean_returns_ma_vs_frames.png) | + +#### Phase 5.3: Manipulation (10 envs) + +Robotic manipulation — Panda arm pick/place, Aloha bimanual, Leap dexterous hand, and AeroCube orientation tasks. 
+ +| ENV | MA | SPEC_NAME | HF Data | +|-----|-----|-----------|---------| +| playground/AeroCubeRotateZAxis | -3.09 | ppo_playground_loco | [ppo_playground_loco_aerocuberotatezaxis_2026_03_20_012502](https://huggingface.co/datasets/SLM-Lab/benchmark/tree/main/data/ppo_playground_loco_aerocuberotatezaxis_2026_03_20_012502) | +| playground/AlohaHandOver | 3.65 | ppo_playground_loco | [ppo_playground_loco_alohahandover_2026_03_15_023712](https://huggingface.co/datasets/SLM-Lab/benchmark/tree/main/data/ppo_playground_loco_alohahandover_2026_03_15_023712) | +| playground/AlohaSinglePegInsertion | 220.93 | ppo_playground_manip_aloha_peg | [ppo_playground_manip_aloha_peg_alohasinglepeginsertion_2026_03_17_122613](https://huggingface.co/datasets/SLM-Lab/benchmark/tree/main/data/ppo_playground_manip_aloha_peg_alohasinglepeginsertion_2026_03_17_122613) | +| playground/LeapCubeReorient | 74.68 | ppo_playground_loco | [ppo_playground_loco_leapcubereorient_2026_03_15_150420](https://huggingface.co/datasets/SLM-Lab/benchmark/tree/main/data/ppo_playground_loco_leapcubereorient_2026_03_15_150420) | +| playground/LeapCubeRotateZAxis | 91.65 | ppo_playground_loco | [ppo_playground_loco_leapcuberotatezaxis_2026_03_15_150334](https://huggingface.co/datasets/SLM-Lab/benchmark/tree/main/data/ppo_playground_loco_leapcuberotatezaxis_2026_03_15_150334) | +| playground/PandaOpenCabinet | 11081.51 | ppo_playground_loco | [ppo_playground_loco_pandaopencabinet_2026_03_15_150318](https://huggingface.co/datasets/SLM-Lab/benchmark/tree/main/data/ppo_playground_loco_pandaopencabinet_2026_03_15_150318) | +| playground/PandaPickCube | 4586.13 | ppo_playground_loco | [ppo_playground_loco_pandapickcube_2026_03_15_023744](https://huggingface.co/datasets/SLM-Lab/benchmark/tree/main/data/ppo_playground_loco_pandapickcube_2026_03_15_023744) | +| playground/PandaPickCubeCartesian | 10.58 | ppo_playground_loco | 
[ppo_playground_loco_pandapickcubecartesian_2026_03_15_023810](https://huggingface.co/datasets/SLM-Lab/benchmark/tree/main/data/ppo_playground_loco_pandapickcubecartesian_2026_03_15_023810) | +| playground/PandaPickCubeOrientation | 4281.66 | ppo_playground_loco | [ppo_playground_loco_pandapickcubeorientation_2026_03_19_015108](https://huggingface.co/datasets/SLM-Lab/benchmark/tree/main/data/ppo_playground_loco_pandapickcubeorientation_2026_03_19_015108) | +| playground/PandaRobotiqPushCube | 1.31 | ppo_playground_loco | [ppo_playground_loco_pandarobotiqpushcube_2026_03_15_042131](https://huggingface.co/datasets/SLM-Lab/benchmark/tree/main/data/ppo_playground_loco_pandarobotiqpushcube_2026_03_15_042131) | + +| | | | +|---|---|---| +| ![AeroCubeRotateZAxis](plots/AeroCubeRotateZAxis_multi_trial_graph_mean_returns_ma_vs_frames.png) | ![AlohaHandOver](plots/AlohaHandOver_multi_trial_graph_mean_returns_ma_vs_frames.png) | ![AlohaSinglePegInsertion](plots/AlohaSinglePegInsertion_multi_trial_graph_mean_returns_ma_vs_frames.png) | +| ![LeapCubeReorient](plots/LeapCubeReorient_multi_trial_graph_mean_returns_ma_vs_frames.png) | ![LeapCubeRotateZAxis](plots/LeapCubeRotateZAxis_multi_trial_graph_mean_returns_ma_vs_frames.png) | ![PandaOpenCabinet](plots/PandaOpenCabinet_multi_trial_graph_mean_returns_ma_vs_frames.png) | +| ![PandaPickCube](plots/PandaPickCube_multi_trial_graph_mean_returns_ma_vs_frames.png) | ![PandaPickCubeCartesian](plots/PandaPickCubeCartesian_multi_trial_graph_mean_returns_ma_vs_frames.png) | ![PandaPickCubeOrientation](plots/PandaPickCubeOrientation_multi_trial_graph_mean_returns_ma_vs_frames.png) | +| ![PandaRobotiqPushCube](plots/PandaRobotiqPushCube_multi_trial_graph_mean_returns_ma_vs_frames.png) | | | + diff --git a/docs/plots/AcrobotSwingupSparse_multi_trial_graph_mean_returns_ma_vs_frames.png b/docs/plots/AcrobotSwingupSparse_multi_trial_graph_mean_returns_ma_vs_frames.png new file mode 100644 index 000000000..ca1cb681e Binary files /dev/null and 
b/docs/plots/AcrobotSwingupSparse_multi_trial_graph_mean_returns_ma_vs_frames.png differ diff --git a/docs/plots/AcrobotSwingup_multi_trial_graph_mean_returns_ma_vs_frames.png b/docs/plots/AcrobotSwingup_multi_trial_graph_mean_returns_ma_vs_frames.png new file mode 100644 index 000000000..e9f5d1993 Binary files /dev/null and b/docs/plots/AcrobotSwingup_multi_trial_graph_mean_returns_ma_vs_frames.png differ diff --git a/docs/plots/AeroCubeRotateZAxis_multi_trial_graph_mean_returns_ma_vs_frames.png b/docs/plots/AeroCubeRotateZAxis_multi_trial_graph_mean_returns_ma_vs_frames.png new file mode 100644 index 000000000..afedaef80 Binary files /dev/null and b/docs/plots/AeroCubeRotateZAxis_multi_trial_graph_mean_returns_ma_vs_frames.png differ diff --git a/docs/plots/AlohaHandOver_multi_trial_graph_mean_returns_ma_vs_frames.png b/docs/plots/AlohaHandOver_multi_trial_graph_mean_returns_ma_vs_frames.png new file mode 100644 index 000000000..7a236a555 Binary files /dev/null and b/docs/plots/AlohaHandOver_multi_trial_graph_mean_returns_ma_vs_frames.png differ diff --git a/docs/plots/AlohaSinglePegInsertion_multi_trial_graph_mean_returns_ma_vs_frames.png b/docs/plots/AlohaSinglePegInsertion_multi_trial_graph_mean_returns_ma_vs_frames.png new file mode 100644 index 000000000..cabb7331f Binary files /dev/null and b/docs/plots/AlohaSinglePegInsertion_multi_trial_graph_mean_returns_ma_vs_frames.png differ diff --git a/docs/plots/ApolloJoystickFlatTerrain_multi_trial_graph_mean_returns_ma_vs_frames.png b/docs/plots/ApolloJoystickFlatTerrain_multi_trial_graph_mean_returns_ma_vs_frames.png new file mode 100644 index 000000000..775a55fe6 Binary files /dev/null and b/docs/plots/ApolloJoystickFlatTerrain_multi_trial_graph_mean_returns_ma_vs_frames.png differ diff --git a/docs/plots/BallInCup_multi_trial_graph_mean_returns_ma_vs_frames.png b/docs/plots/BallInCup_multi_trial_graph_mean_returns_ma_vs_frames.png new file mode 100644 index 000000000..b0d09734b Binary files /dev/null and 
b/docs/plots/BallInCup_multi_trial_graph_mean_returns_ma_vs_frames.png differ diff --git a/docs/plots/BarkourJoystick_multi_trial_graph_mean_returns_ma_vs_frames.png b/docs/plots/BarkourJoystick_multi_trial_graph_mean_returns_ma_vs_frames.png new file mode 100644 index 000000000..d8e917f57 Binary files /dev/null and b/docs/plots/BarkourJoystick_multi_trial_graph_mean_returns_ma_vs_frames.png differ diff --git a/docs/plots/BerkeleyHumanoidJoystickFlatTerrain_multi_trial_graph_mean_returns_ma_vs_frames.png b/docs/plots/BerkeleyHumanoidJoystickFlatTerrain_multi_trial_graph_mean_returns_ma_vs_frames.png new file mode 100644 index 000000000..1301dc6aa Binary files /dev/null and b/docs/plots/BerkeleyHumanoidJoystickFlatTerrain_multi_trial_graph_mean_returns_ma_vs_frames.png differ diff --git a/docs/plots/BerkeleyHumanoidJoystickRoughTerrain_multi_trial_graph_mean_returns_ma_vs_frames.png b/docs/plots/BerkeleyHumanoidJoystickRoughTerrain_multi_trial_graph_mean_returns_ma_vs_frames.png new file mode 100644 index 000000000..164c0576d Binary files /dev/null and b/docs/plots/BerkeleyHumanoidJoystickRoughTerrain_multi_trial_graph_mean_returns_ma_vs_frames.png differ diff --git a/docs/plots/CartpoleBalanceSparse_multi_trial_graph_mean_returns_ma_vs_frames.png b/docs/plots/CartpoleBalanceSparse_multi_trial_graph_mean_returns_ma_vs_frames.png new file mode 100644 index 000000000..36393690e Binary files /dev/null and b/docs/plots/CartpoleBalanceSparse_multi_trial_graph_mean_returns_ma_vs_frames.png differ diff --git a/docs/plots/CartpoleBalance_multi_trial_graph_mean_returns_ma_vs_frames.png b/docs/plots/CartpoleBalance_multi_trial_graph_mean_returns_ma_vs_frames.png new file mode 100644 index 000000000..4754ef437 Binary files /dev/null and b/docs/plots/CartpoleBalance_multi_trial_graph_mean_returns_ma_vs_frames.png differ diff --git a/docs/plots/CartpoleSwingupSparse_multi_trial_graph_mean_returns_ma_vs_frames.png 
b/docs/plots/CartpoleSwingupSparse_multi_trial_graph_mean_returns_ma_vs_frames.png new file mode 100644 index 000000000..994552715 Binary files /dev/null and b/docs/plots/CartpoleSwingupSparse_multi_trial_graph_mean_returns_ma_vs_frames.png differ diff --git a/docs/plots/CartpoleSwingup_multi_trial_graph_mean_returns_ma_vs_frames.png b/docs/plots/CartpoleSwingup_multi_trial_graph_mean_returns_ma_vs_frames.png new file mode 100644 index 000000000..5f02730b8 Binary files /dev/null and b/docs/plots/CartpoleSwingup_multi_trial_graph_mean_returns_ma_vs_frames.png differ diff --git a/docs/plots/CheetahRun_multi_trial_graph_mean_returns_ma_vs_frames.png b/docs/plots/CheetahRun_multi_trial_graph_mean_returns_ma_vs_frames.png new file mode 100644 index 000000000..29eb8bd98 Binary files /dev/null and b/docs/plots/CheetahRun_multi_trial_graph_mean_returns_ma_vs_frames.png differ diff --git a/docs/plots/FingerSpin_multi_trial_graph_mean_returns_ma_vs_frames.png b/docs/plots/FingerSpin_multi_trial_graph_mean_returns_ma_vs_frames.png new file mode 100644 index 000000000..ee2438497 Binary files /dev/null and b/docs/plots/FingerSpin_multi_trial_graph_mean_returns_ma_vs_frames.png differ diff --git a/docs/plots/FingerTurnEasy_multi_trial_graph_mean_returns_ma_vs_frames.png b/docs/plots/FingerTurnEasy_multi_trial_graph_mean_returns_ma_vs_frames.png new file mode 100644 index 000000000..ad60d0252 Binary files /dev/null and b/docs/plots/FingerTurnEasy_multi_trial_graph_mean_returns_ma_vs_frames.png differ diff --git a/docs/plots/FingerTurnHard_multi_trial_graph_mean_returns_ma_vs_frames.png b/docs/plots/FingerTurnHard_multi_trial_graph_mean_returns_ma_vs_frames.png new file mode 100644 index 000000000..a3de98da2 Binary files /dev/null and b/docs/plots/FingerTurnHard_multi_trial_graph_mean_returns_ma_vs_frames.png differ diff --git a/docs/plots/FishSwim_multi_trial_graph_mean_returns_ma_vs_frames.png b/docs/plots/FishSwim_multi_trial_graph_mean_returns_ma_vs_frames.png new file mode 
100644 index 000000000..1a994e2ff Binary files /dev/null and b/docs/plots/FishSwim_multi_trial_graph_mean_returns_ma_vs_frames.png differ diff --git a/docs/plots/G1JoystickFlatTerrain_multi_trial_graph_mean_returns_ma_vs_frames.png b/docs/plots/G1JoystickFlatTerrain_multi_trial_graph_mean_returns_ma_vs_frames.png new file mode 100644 index 000000000..1cf4b529f Binary files /dev/null and b/docs/plots/G1JoystickFlatTerrain_multi_trial_graph_mean_returns_ma_vs_frames.png differ diff --git a/docs/plots/G1JoystickRoughTerrain_multi_trial_graph_mean_returns_ma_vs_frames.png b/docs/plots/G1JoystickRoughTerrain_multi_trial_graph_mean_returns_ma_vs_frames.png new file mode 100644 index 000000000..ca0c19cd6 Binary files /dev/null and b/docs/plots/G1JoystickRoughTerrain_multi_trial_graph_mean_returns_ma_vs_frames.png differ diff --git a/docs/plots/Go1Footstand_multi_trial_graph_mean_returns_ma_vs_frames.png b/docs/plots/Go1Footstand_multi_trial_graph_mean_returns_ma_vs_frames.png new file mode 100644 index 000000000..7ecf0aec2 Binary files /dev/null and b/docs/plots/Go1Footstand_multi_trial_graph_mean_returns_ma_vs_frames.png differ diff --git a/docs/plots/Go1Getup_multi_trial_graph_mean_returns_ma_vs_frames.png b/docs/plots/Go1Getup_multi_trial_graph_mean_returns_ma_vs_frames.png new file mode 100644 index 000000000..2f65a7f6b Binary files /dev/null and b/docs/plots/Go1Getup_multi_trial_graph_mean_returns_ma_vs_frames.png differ diff --git a/docs/plots/Go1Handstand_multi_trial_graph_mean_returns_ma_vs_frames.png b/docs/plots/Go1Handstand_multi_trial_graph_mean_returns_ma_vs_frames.png new file mode 100644 index 000000000..6886cb0de Binary files /dev/null and b/docs/plots/Go1Handstand_multi_trial_graph_mean_returns_ma_vs_frames.png differ diff --git a/docs/plots/Go1JoystickFlatTerrain_multi_trial_graph_mean_returns_ma_vs_frames.png b/docs/plots/Go1JoystickFlatTerrain_multi_trial_graph_mean_returns_ma_vs_frames.png new file mode 100644 index 000000000..49885e784 Binary files 
/dev/null and b/docs/plots/Go1JoystickFlatTerrain_multi_trial_graph_mean_returns_ma_vs_frames.png differ diff --git a/docs/plots/Go1JoystickRoughTerrain_multi_trial_graph_mean_returns_ma_vs_frames.png b/docs/plots/Go1JoystickRoughTerrain_multi_trial_graph_mean_returns_ma_vs_frames.png new file mode 100644 index 000000000..189e680ae Binary files /dev/null and b/docs/plots/Go1JoystickRoughTerrain_multi_trial_graph_mean_returns_ma_vs_frames.png differ diff --git a/docs/plots/H1InplaceGaitTracking_multi_trial_graph_mean_returns_ma_vs_frames.png b/docs/plots/H1InplaceGaitTracking_multi_trial_graph_mean_returns_ma_vs_frames.png new file mode 100644 index 000000000..8a5bd1630 Binary files /dev/null and b/docs/plots/H1InplaceGaitTracking_multi_trial_graph_mean_returns_ma_vs_frames.png differ diff --git a/docs/plots/H1JoystickGaitTracking_multi_trial_graph_mean_returns_ma_vs_frames.png b/docs/plots/H1JoystickGaitTracking_multi_trial_graph_mean_returns_ma_vs_frames.png new file mode 100644 index 000000000..11e4e9dfe Binary files /dev/null and b/docs/plots/H1JoystickGaitTracking_multi_trial_graph_mean_returns_ma_vs_frames.png differ diff --git a/docs/plots/HopperHop_multi_trial_graph_mean_returns_ma_vs_frames.png b/docs/plots/HopperHop_multi_trial_graph_mean_returns_ma_vs_frames.png new file mode 100644 index 000000000..91b18f6cb Binary files /dev/null and b/docs/plots/HopperHop_multi_trial_graph_mean_returns_ma_vs_frames.png differ diff --git a/docs/plots/HopperStand_multi_trial_graph_mean_returns_ma_vs_frames.png b/docs/plots/HopperStand_multi_trial_graph_mean_returns_ma_vs_frames.png new file mode 100644 index 000000000..c81509155 Binary files /dev/null and b/docs/plots/HopperStand_multi_trial_graph_mean_returns_ma_vs_frames.png differ diff --git a/docs/plots/HumanoidRun_multi_trial_graph_mean_returns_ma_vs_frames.png b/docs/plots/HumanoidRun_multi_trial_graph_mean_returns_ma_vs_frames.png new file mode 100644 index 000000000..030061127 Binary files /dev/null and 
b/docs/plots/HumanoidRun_multi_trial_graph_mean_returns_ma_vs_frames.png differ diff --git a/docs/plots/HumanoidStand_multi_trial_graph_mean_returns_ma_vs_frames.png b/docs/plots/HumanoidStand_multi_trial_graph_mean_returns_ma_vs_frames.png new file mode 100644 index 000000000..98e5bcd21 Binary files /dev/null and b/docs/plots/HumanoidStand_multi_trial_graph_mean_returns_ma_vs_frames.png differ diff --git a/docs/plots/HumanoidWalk_multi_trial_graph_mean_returns_ma_vs_frames.png b/docs/plots/HumanoidWalk_multi_trial_graph_mean_returns_ma_vs_frames.png new file mode 100644 index 000000000..e450f5cd2 Binary files /dev/null and b/docs/plots/HumanoidWalk_multi_trial_graph_mean_returns_ma_vs_frames.png differ diff --git a/docs/plots/LeapCubeReorient_multi_trial_graph_mean_returns_ma_vs_frames.png b/docs/plots/LeapCubeReorient_multi_trial_graph_mean_returns_ma_vs_frames.png new file mode 100644 index 000000000..08184ab09 Binary files /dev/null and b/docs/plots/LeapCubeReorient_multi_trial_graph_mean_returns_ma_vs_frames.png differ diff --git a/docs/plots/LeapCubeRotateZAxis_multi_trial_graph_mean_returns_ma_vs_frames.png b/docs/plots/LeapCubeRotateZAxis_multi_trial_graph_mean_returns_ma_vs_frames.png new file mode 100644 index 000000000..d2010bf16 Binary files /dev/null and b/docs/plots/LeapCubeRotateZAxis_multi_trial_graph_mean_returns_ma_vs_frames.png differ diff --git a/docs/plots/Op3Joystick_multi_trial_graph_mean_returns_ma_vs_frames.png b/docs/plots/Op3Joystick_multi_trial_graph_mean_returns_ma_vs_frames.png new file mode 100644 index 000000000..d7d975c07 Binary files /dev/null and b/docs/plots/Op3Joystick_multi_trial_graph_mean_returns_ma_vs_frames.png differ diff --git a/docs/plots/PandaOpenCabinet_multi_trial_graph_mean_returns_ma_vs_frames.png b/docs/plots/PandaOpenCabinet_multi_trial_graph_mean_returns_ma_vs_frames.png new file mode 100644 index 000000000..f39e41677 Binary files /dev/null and 
b/docs/plots/PandaOpenCabinet_multi_trial_graph_mean_returns_ma_vs_frames.png differ diff --git a/docs/plots/PandaPickCubeCartesian_multi_trial_graph_mean_returns_ma_vs_frames.png b/docs/plots/PandaPickCubeCartesian_multi_trial_graph_mean_returns_ma_vs_frames.png new file mode 100644 index 000000000..3ee1f8e19 Binary files /dev/null and b/docs/plots/PandaPickCubeCartesian_multi_trial_graph_mean_returns_ma_vs_frames.png differ diff --git a/docs/plots/PandaPickCubeOrientation_multi_trial_graph_mean_returns_ma_vs_frames.png b/docs/plots/PandaPickCubeOrientation_multi_trial_graph_mean_returns_ma_vs_frames.png new file mode 100644 index 000000000..cb032577c Binary files /dev/null and b/docs/plots/PandaPickCubeOrientation_multi_trial_graph_mean_returns_ma_vs_frames.png differ diff --git a/docs/plots/PandaPickCube_multi_trial_graph_mean_returns_ma_vs_frames.png b/docs/plots/PandaPickCube_multi_trial_graph_mean_returns_ma_vs_frames.png new file mode 100644 index 000000000..63a1b6cfe Binary files /dev/null and b/docs/plots/PandaPickCube_multi_trial_graph_mean_returns_ma_vs_frames.png differ diff --git a/docs/plots/PandaRobotiqPushCube_multi_trial_graph_mean_returns_ma_vs_frames.png b/docs/plots/PandaRobotiqPushCube_multi_trial_graph_mean_returns_ma_vs_frames.png new file mode 100644 index 000000000..d62c4ef62 Binary files /dev/null and b/docs/plots/PandaRobotiqPushCube_multi_trial_graph_mean_returns_ma_vs_frames.png differ diff --git a/docs/plots/PendulumSwingup_multi_trial_graph_mean_returns_ma_vs_frames.png b/docs/plots/PendulumSwingup_multi_trial_graph_mean_returns_ma_vs_frames.png new file mode 100644 index 000000000..9b2f3d06e Binary files /dev/null and b/docs/plots/PendulumSwingup_multi_trial_graph_mean_returns_ma_vs_frames.png differ diff --git a/docs/plots/PointMass_multi_trial_graph_mean_returns_ma_vs_frames.png b/docs/plots/PointMass_multi_trial_graph_mean_returns_ma_vs_frames.png new file mode 100644 index 000000000..9f98091a6 Binary files /dev/null and 
b/docs/plots/PointMass_multi_trial_graph_mean_returns_ma_vs_frames.png differ diff --git a/docs/plots/ReacherEasy_multi_trial_graph_mean_returns_ma_vs_frames.png b/docs/plots/ReacherEasy_multi_trial_graph_mean_returns_ma_vs_frames.png new file mode 100644 index 000000000..5ed0c345c Binary files /dev/null and b/docs/plots/ReacherEasy_multi_trial_graph_mean_returns_ma_vs_frames.png differ diff --git a/docs/plots/ReacherHard_multi_trial_graph_mean_returns_ma_vs_frames.png b/docs/plots/ReacherHard_multi_trial_graph_mean_returns_ma_vs_frames.png new file mode 100644 index 000000000..382a7a08b Binary files /dev/null and b/docs/plots/ReacherHard_multi_trial_graph_mean_returns_ma_vs_frames.png differ diff --git a/docs/plots/SpotFlatTerrainJoystick_multi_trial_graph_mean_returns_ma_vs_frames.png b/docs/plots/SpotFlatTerrainJoystick_multi_trial_graph_mean_returns_ma_vs_frames.png new file mode 100644 index 000000000..0abd8074f Binary files /dev/null and b/docs/plots/SpotFlatTerrainJoystick_multi_trial_graph_mean_returns_ma_vs_frames.png differ diff --git a/docs/plots/SpotGetup_multi_trial_graph_mean_returns_ma_vs_frames.png b/docs/plots/SpotGetup_multi_trial_graph_mean_returns_ma_vs_frames.png new file mode 100644 index 000000000..961900e6f Binary files /dev/null and b/docs/plots/SpotGetup_multi_trial_graph_mean_returns_ma_vs_frames.png differ diff --git a/docs/plots/SpotJoystickGaitTracking_multi_trial_graph_mean_returns_ma_vs_frames.png b/docs/plots/SpotJoystickGaitTracking_multi_trial_graph_mean_returns_ma_vs_frames.png new file mode 100644 index 000000000..b2d04cee2 Binary files /dev/null and b/docs/plots/SpotJoystickGaitTracking_multi_trial_graph_mean_returns_ma_vs_frames.png differ diff --git a/docs/plots/SwimmerSwimmer6_multi_trial_graph_mean_returns_ma_vs_frames.png b/docs/plots/SwimmerSwimmer6_multi_trial_graph_mean_returns_ma_vs_frames.png new file mode 100644 index 000000000..a610b40af Binary files /dev/null and 
b/docs/plots/SwimmerSwimmer6_multi_trial_graph_mean_returns_ma_vs_frames.png differ diff --git a/docs/plots/T1JoystickFlatTerrain_multi_trial_graph_mean_returns_ma_vs_frames.png b/docs/plots/T1JoystickFlatTerrain_multi_trial_graph_mean_returns_ma_vs_frames.png new file mode 100644 index 000000000..83f229232 Binary files /dev/null and b/docs/plots/T1JoystickFlatTerrain_multi_trial_graph_mean_returns_ma_vs_frames.png differ diff --git a/docs/plots/T1JoystickRoughTerrain_multi_trial_graph_mean_returns_ma_vs_frames.png b/docs/plots/T1JoystickRoughTerrain_multi_trial_graph_mean_returns_ma_vs_frames.png new file mode 100644 index 000000000..da9f0154b Binary files /dev/null and b/docs/plots/T1JoystickRoughTerrain_multi_trial_graph_mean_returns_ma_vs_frames.png differ diff --git a/docs/plots/WalkerRun_multi_trial_graph_mean_returns_ma_vs_frames.png b/docs/plots/WalkerRun_multi_trial_graph_mean_returns_ma_vs_frames.png new file mode 100644 index 000000000..c2abfaf13 Binary files /dev/null and b/docs/plots/WalkerRun_multi_trial_graph_mean_returns_ma_vs_frames.png differ diff --git a/docs/plots/WalkerStand_multi_trial_graph_mean_returns_ma_vs_frames.png b/docs/plots/WalkerStand_multi_trial_graph_mean_returns_ma_vs_frames.png new file mode 100644 index 000000000..1da1e3fc9 Binary files /dev/null and b/docs/plots/WalkerStand_multi_trial_graph_mean_returns_ma_vs_frames.png differ diff --git a/docs/plots/WalkerWalk_multi_trial_graph_mean_returns_ma_vs_frames.png b/docs/plots/WalkerWalk_multi_trial_graph_mean_returns_ma_vs_frames.png new file mode 100644 index 000000000..e2f12f1e7 Binary files /dev/null and b/docs/plots/WalkerWalk_multi_trial_graph_mean_returns_ma_vs_frames.png differ diff --git a/pyproject.toml b/pyproject.toml index 624956e0d..7e6b54cf5 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -1,6 +1,6 @@ [project] name = "slm-lab" -version = "5.2.0" +version = "5.3.0" description = "Modular Deep Reinforcement Learning framework in PyTorch." 
readme = "README.md" requires-python = ">=3.12.0" diff --git a/slm_lab/env/__init__.py b/slm_lab/env/__init__.py index e0cf8568a..ffac34c1a 100644 --- a/slm_lab/env/__init__.py +++ b/slm_lab/env/__init__.py @@ -20,6 +20,7 @@ NormalizeReward as VectorNormalizeReward, RecordEpisodeStatistics as VectorRecordEpisodeStatistics, RescaleAction as VectorRescaleAction, + TransformReward as VectorTransformReward, ) from slm_lab.env.wrappers import ( @@ -45,6 +46,22 @@ except ImportError: pass +# Register Pavlovian environment +gym.register( + id="SLM/Pavlovian-v0", + entry_point="slm_lab.env.pavlovian:PavlovianEnv", + max_episode_steps=1000, +) + +# Register Sensorimotor environments (TC-11 through TC-24) +for _tc_id in range(11, 25): + gym.register( + id=f"SLM-Sensorimotor-TC{_tc_id:02d}-v0", + entry_point="slm_lab.env.sensorimotor:SLMSensorimotor", + kwargs={"task_id": f"TC-{_tc_id:02d}"}, + max_episode_steps=500, + ) + logger = logger.get_logger(__name__) # Keys handled by make_env, not passed to gym.make @@ -57,6 +74,8 @@ "normalize_reward", "clip_obs", "clip_reward", + "device", + "reward_scale", } @@ -150,16 +169,92 @@ def _set_env_attributes(env: gym.Env, spec: dict[str, Any]) -> None: env.done = False +def _make_playground_env( + name: str, + num_envs: int, + normalize_obs: bool, + normalize_reward: bool, + clip_obs: float | None, + clip_reward: float | None, + gamma: float, + device: str | None = None, + render_mode: str | None = None, + reward_scale: float = 1.0, +) -> gym.Env: + """Create a MuJoCo Playground vectorized environment.""" + try: + from slm_lab.env.playground import PlaygroundVecEnv + from slm_lab.env.wrappers import ( + PlaygroundRenderWrapper, + TorchNormalizeObservation, + ) + except ImportError: + raise ImportError( + "MuJoCo Playground is required for playground/ environments. 
" + "Install with: uv sync --group playground" + ) + + # Prevent JAX from pre-allocating GPU memory when sharing with PyTorch + if device is not None: + os.environ.setdefault("XLA_PYTHON_CLIENT_PREALLOCATE", "false") + + # Strip "playground/" prefix to get the env name for the registry + pg_env_name = name.removeprefix("playground/") + env = PlaygroundVecEnv(pg_env_name, num_envs, device=device) + logger.info(f"Playground: JAX→PyTorch via {'DLPack zero-copy (GPU)' if device else 'numpy (CPU)'}") + + if _needs_action_rescaling(env): + action_space = env.single_action_space + logger.info( + f"Action rescaling: [{action_space.low.min():.1f}, {action_space.high.max():.1f}] → [-1, 1]" + ) + env = VectorRescaleAction(env, min_action=-1.0, max_action=1.0) + + env = VectorRecordEpisodeStatistics(env) + + if reward_scale != 1.0: + env = VectorTransformReward(env, lambda r: r * reward_scale) + + if render_mode: + env = PlaygroundRenderWrapper(env) + + if device is not None: + if normalize_obs: + env = TorchNormalizeObservation(env) + + # Skip numpy-only wrappers in GPU mode (network-level normalization used instead) + if device is None: + if normalize_obs: + env = VectorNormalizeObservation(env) + if clip_obs is not None: + env = VectorClipObservation(env, bound=float(clip_obs)) + if normalize_reward: + env = VectorNormalizeReward(env, gamma=gamma) + if clip_reward is not None: + if isinstance(clip_reward, (int, float)): + env = VectorClipReward( + env, min_reward=-clip_reward, max_reward=clip_reward + ) + else: + env = VectorClipReward( + env, min_reward=clip_reward[0], max_reward=clip_reward[1] + ) + + return env + + def make_env(spec: dict[str, Any]) -> gym.Env: """Create a gymnasium environment. Gymnasium defaults are sensible - only override what's needed. For Atari (ALE/*), AtariVectorEnv handles all preprocessing natively. + For Playground (playground/*), uses JAX-based MuJoCo Playground backend. 
""" env_spec = spec["env"] name = env_spec["name"] num_envs = env_spec.get("num_envs", 1) is_atari = name.startswith("ALE/") + is_playground = name.startswith("playground/") render_mode = "human" if render() else None # Pass through env kwargs (life_loss_info, repeat_action_probability, etc.) @@ -172,7 +267,27 @@ def make_env(spec: dict[str, Any]) -> gym.Env: clip_reward = env_spec.get("clip_reward", 10.0 if normalize_reward else None) gamma = spec.get("agent", {}).get("algorithm", {}).get("gamma", 0.99) - if num_envs > 1: + device = env_spec.get("device") + if is_playground and (device is None or device == "auto"): + import torch + device = "cuda" if torch.cuda.is_available() else None + + if is_playground: + logger.info(f"Playground device: {'GPU (cuda) — DLPack zero-copy' if device else 'CPU — numpy transfer'}") + reward_scale = env_spec.get("reward_scale", 1.0) + env = _make_playground_env( + name, + num_envs, + normalize_obs, + normalize_reward, + clip_obs, + clip_reward, + gamma, + device=device, + render_mode=render_mode, + reward_scale=reward_scale, + ) + elif num_envs > 1: env = _make_vector_env( name, num_envs, diff --git a/slm_lab/env/playground.py b/slm_lab/env/playground.py new file mode 100644 index 000000000..fcc7a52e4 --- /dev/null +++ b/slm_lab/env/playground.py @@ -0,0 +1,215 @@ +"""MuJoCo Playground environment wrapper for SLM-Lab. + +Wraps MuJoCo Playground (JAX/MJWarp) environments as gymnasium VectorEnv, +enabling use with SLM-Lab's training loop. BraxAutoResetWrapper handles +batched step/reset internally; arrays are converted to numpy at the boundary. + +Uses MJWarp backend (Warp-accelerated MJX) uniformly for GPU simulation. +JAX is the dispatch/tracing layer; Warp CUDA kernels handle physics. 
+""" + +import os +import gymnasium as gym +import jax +import jax.numpy as jnp +import numpy as np +from gymnasium import spaces +from gymnasium.vector.utils import batch_space + +try: + from mujoco_playground import registry as pg_registry + from mujoco_playground import wrapper as pg_wrapper + from mujoco_playground._src import mjx_env as _mjx_env_module +except ImportError: + raise ImportError( + "MuJoCo Playground is required for playground environments. " + "Install with: uv sync --group playground" + ) + +# Monkey-patch mjx_env.make_data to ensure naccdmax is set when missing. +# Some mujoco_warp versions default naccdmax=None to 0, causing CCD buffer +# overflow for envs with mesh/convex colliders. We resolve None to naconmax +# (the total active-contact buffer), which is always a safe upper bound. +_original_make_data = _mjx_env_module.make_data + + +def _patched_make_data(*args, **kwargs): + naccdmax = kwargs.get("naccdmax") + naconmax = kwargs.get("naconmax") + if naccdmax is None and naconmax is not None: + kwargs["naccdmax"] = naconmax + return _original_make_data(*args, **kwargs) + + +_mjx_env_module.make_data = _patched_make_data + +# Suppress MuJoCo C-level stderr warnings (ccd_iterations, nefc/broadphase overflow). +# These repeat every step for 100M frames, exploding log/output size on dstack. +# Suppressed permanently after first step — no per-call overhead or sync barriers. +_stderr_suppressed = False + + +# Per-env action_repeat from official dm_control_suite_params.py +# These match mujoco_playground's canonical training configs exactly. +_ACTION_REPEAT: dict[str, int] = { + "PendulumSwingup": 4, +} + + +def _build_config_overrides(env_name: str) -> dict: + """Build config overrides for the given env. + + Sets impl='warp' for envs that support backend selection. + When njmax is 0, sets None to trigger auto-detection via _default_njmax(). 
+ """ + default_cfg = pg_registry.get_default_config(env_name) + overrides = {"impl": "warp"} if hasattr(default_cfg, "impl") else {} + njmax = getattr(default_cfg, "njmax", None) + + if njmax is not None and njmax == 0: + overrides["njmax"] = None + + return overrides + + +class PlaygroundVecEnv(gym.vector.VectorEnv): + """Vectorized wrapper for MuJoCo Playground environments. + + Uses MJWarp backend uniformly (impl='warp'). BraxAutoResetWrapper handles + batched execution internally. Converts JAX arrays to numpy or torch tensors + via DLPack at the API boundary for SLM-Lab's PyTorch training loop. + """ + + def __init__( + self, + env_name: str, + num_envs: int, + seed: int = 0, + episode_length: int = 1000, + device: str | None = None, + ): + self._env_name = env_name + self._device = device + if device is not None: + import torch + + self._torch_device = torch.device(device) + + # Load the MJX environment and wrap for batched training + # wrap_for_brax_training applies: VmapWrapper → EpisodeWrapper → BraxAutoResetWrapper + # _build_config_overrides requests impl='warp' (MJWarp) whenever the env's default config has an `impl` field + config_overrides = _build_config_overrides(env_name) + self._base_env = pg_registry.load( + env_name, config_overrides=config_overrides + ) # kept for rendering + base_env = self._base_env + action_repeat = _ACTION_REPEAT.get(env_name, 1) + self._env = pg_wrapper.wrap_for_brax_training( + base_env, episode_length=episode_length, action_repeat=action_repeat + ) + + # Build observation and action spaces + obs_size = base_env.observation_size + if isinstance(obs_size, dict): + if "state" in obs_size: + # Use only "state" key — excludes privileged_state from actor input + total_obs_dim = obs_size["state"] if not isinstance(obs_size["state"], tuple) else np.prod(obs_size["state"]) + else: + total_obs_dim = sum( + np.prod(s) if isinstance(s, tuple) else s for s in obs_size.values() + ) + else: + total_obs_dim = obs_size + act_size = base_env.action_size + obs_space =
spaces.Box( + low=-np.inf, high=np.inf, shape=(int(total_obs_dim),), dtype=np.float32 + ) + act_space = spaces.Box(low=-1.0, high=1.0, shape=(act_size,), dtype=np.float32) + + # Set VectorEnv attributes directly (gymnasium 1.x has no __init__) + self.num_envs = num_envs + self.single_observation_space = obs_space + self.single_action_space = act_space + self.observation_space = batch_space(obs_space, num_envs) + self.action_space = batch_space(act_space, num_envs) + + # JIT-compile reset and step (BraxAutoResetWrapper handles batching internally) + self._jit_reset = jax.jit(self._env.reset) + self._jit_step = jax.jit(self._env.step) + + # Initialize RNG + self._rng = jax.random.PRNGKey(seed) + self._state = None + + def _to_output(self, x: jax.Array): + """Convert JAX array to output format. DLPack zero-copy when JAX+PyTorch both on GPU.""" + if self._device is not None: + import torch + + t = torch.from_dlpack(x) + # If JAX is on CPU but device is cuda, move explicitly (CPU->GPU copy) + return t if t.is_cuda else t.to(self._device) + return np.asarray(x).astype(np.float32) + + def _get_obs(self, state): + obs = state.obs + if isinstance(obs, dict): + # Use only "state" key when available — excludes privileged_state from actor + obs = obs.get("state", jnp.concatenate([obs[k] for k in sorted(obs.keys())], axis=-1)) + return self._to_output(obs) + + def reset(self, *, seed: int | None = None, options: dict | None = None): + if seed is not None: + self._rng = jax.random.PRNGKey(seed) + self._rng, *sub_keys = jax.random.split(self._rng, self.num_envs + 1) + sub_keys = jnp.stack(sub_keys) + self._state = self._jit_reset(sub_keys) + obs = self._get_obs(self._state) + return obs, {} + + def step(self, actions: np.ndarray): + jax_actions = jnp.array(actions, dtype=jnp.float32) + self._state = self._jit_step(self._state, jax_actions) + # Suppress stderr permanently after first step — MuJoCo C warnings + # repeat every step, but JAX async means we can't suppress per-call + # 
without block_until_ready (which kills performance ~10x for slow envs). + global _stderr_suppressed + if not _stderr_suppressed: + _stderr_suppressed = True + devnull = os.open(os.devnull, os.O_WRONLY) + os.dup2(devnull, 2) + os.close(devnull) + + obs = self._get_obs(self._state) + # Rewards, dones, info always numpy (used for control flow and memory) + rewards = np.asarray(self._state.reward).astype(np.float32) + dones = np.asarray(self._state.done).astype(bool) + + # Brax EpisodeWrapper sets state.info['truncation'] (1 = time limit, 0 = not) + truncation = self._state.info.get("truncation", None) + if truncation is not None: + truncated = np.asarray(truncation).astype(bool) + terminated = dones & ~truncated + else: + terminated = dones + truncated = np.zeros_like(dones, dtype=bool) + + # Extract metrics as info + info = {} + if self._state.metrics: + for k, v in self._state.metrics.items(): + info[k] = np.asarray(v) + + return obs, rewards, terminated, truncated, info + + def close(self): + self._state = None + + def render(self): + """Render env[0] as an RGB array using MuJoCo renderer.""" + if self._state is None: + return None + # Extract first env's state from the batched pytree + state_0 = jax.tree.map(lambda x: x[0], self._state) + frames = self._base_env.render([state_0], height=240, width=320) + return np.array(frames[0]) diff --git a/slm_lab/env/wrappers.py b/slm_lab/env/wrappers.py index 82de4ffc5..9edbed4a2 100644 --- a/slm_lab/env/wrappers.py +++ b/slm_lab/env/wrappers.py @@ -6,6 +6,7 @@ import gymnasium as gym import numpy as np import pandas as pd +import torch from slm_lab.lib import util @@ -86,7 +87,9 @@ def total_reward(self): Priority: VectorFullGameStatistics > RecordEpisodeStatistics > TrackReward This ensures we report full-game scores for Atari with life_loss_info. 
""" - from gymnasium.wrappers.vector import RecordEpisodeStatistics as VectorRecordEpisodeStatistics + from gymnasium.wrappers.vector import ( + RecordEpisodeStatistics as VectorRecordEpisodeStatistics, + ) env = self.env while env is not None: @@ -240,8 +243,8 @@ def step(self, actions): def _get_base_env(self): """Find base env with call() method.""" env = self.env - while hasattr(env, 'env'): - if hasattr(env, 'call'): + while hasattr(env, "env"): + if hasattr(env, "call"): return env env = env.env return env @@ -253,14 +256,16 @@ def _render_grid(self): return base_env = self._get_base_env() - frames = base_env.call("render") if hasattr(base_env, 'call') else None + frames = base_env.call("render") if hasattr(base_env, "call") else None if frames is None or frames[0] is None: return if self.window is None: pygame.init() frame_h, frame_w = frames[0].shape[:2] - self.window = pygame.display.set_mode((frame_w * self.grid_cols, frame_h * self.grid_rows)) + self.window = pygame.display.set_mode( + (frame_w * self.grid_cols, frame_h * self.grid_rows) + ) pygame.display.set_caption(f"Vector Env ({self.num_envs} envs)") self.clock = pygame.time.Clock() @@ -286,6 +291,99 @@ def _render_grid(self): def close(self): if self.window is not None: import pygame + pygame.quit() self.window = None return super().close() + + +class PlaygroundRenderWrapper(gym.vector.VectorWrapper): + """Render MuJoCo Playground env[0] via pygame after each step.""" + + def __init__(self, env: gym.vector.VectorEnv, render_freq: int = 1): + super().__init__(env) + self.render_freq = render_freq + self.step_count = 0 + self.window = None + self.clock = None + + def step(self, actions): + result = self.env.step(actions) + self.step_count += 1 + if self.step_count % self.render_freq == 0: + self._show() + return result + + def reset(self, **kwargs): + result = self.env.reset(**kwargs) + self._show() + return result + + def _show(self): + try: + import pygame + except ImportError: + return + frame = 
self.env.render() + if frame is None: + return + if self.window is None: + pygame.init() + h, w = frame.shape[:2] + self.window = pygame.display.set_mode((w, h)) + pygame.display.set_caption("MuJoCo Playground") + self.clock = pygame.time.Clock() + surface = pygame.surfarray.make_surface(frame.swapaxes(0, 1)) + self.window.blit(surface, (0, 0)) + pygame.display.flip() + self.clock.tick(60) + for event in pygame.event.get(): + if event.type == pygame.QUIT: + self.close() + raise KeyboardInterrupt("Render window closed") + + def close(self): + if self.window is not None: + import pygame + + pygame.quit() + self.window = None + return super().close() + + +class TorchNormalizeObservation(gym.vector.VectorWrapper): + """Running-mean normalization for CUDA tensor observations (Welford algorithm).""" + + def __init__(self, env: gym.vector.VectorEnv, epsilon: float = 1e-8): + super().__init__(env) + self.epsilon = epsilon + self._mean = None + self._var = None + self._count = 0 + + def _update_and_normalize(self, obs): + if self._mean is None: + self._mean = torch.zeros_like(obs[0]) + self._var = torch.ones_like(obs[0]) + batch_mean = obs.mean(dim=0) + batch_var = obs.var(dim=0, unbiased=False) + batch_count = obs.shape[0] + # Welford parallel update + total = self._count + batch_count + delta = batch_mean - self._mean + self._mean = self._mean + delta * batch_count / total + self._var = ( + self._var * self._count + + batch_var * batch_count + + delta**2 * self._count * batch_count / total + ) / total + self._count = total + return (obs - self._mean) / (self._var + self.epsilon).sqrt() + + def step(self, actions): + obs, *rest = self.env.step(actions) + return self._update_and_normalize(obs), *rest + + def reset(self, **kwargs): + obs, info = self.env.reset(**kwargs) + return self._update_and_normalize(obs), info diff --git a/slm_lab/spec/benchmark_arc/ppo/ppo_playground.yaml b/slm_lab/spec/benchmark_arc/ppo/ppo_playground.yaml new file mode 100644 index 
000000000..0e02d636e --- /dev/null +++ b/slm_lab/spec/benchmark_arc/ppo/ppo_playground.yaml @@ -0,0 +1,800 @@ +# PPO MuJoCo Playground — MJWarp GPU +# +# Variants: +# DM Control Suite (Phase 5.1): +# ppo_playground — default (gamma=0.995, 16 epochs) +# ppo_playground_vnorm — + normalize_v_targets=true (precision/dexterous envs) +# ppo_playground_fingerspin — FingerSpin: gamma=0.95 (official override) +# ppo_playground_pendulum — PendulumSwingup: 4 epochs (official); action_repeat=4 in playground.py +# ppo_playground_humanoid — Humanoid: wider policy (2x256), NormalTanh, constant LR, reward_scale=10 +# ppo_playground_rs10 — + reward_scale=10.0 + constant LR (Brax default for ALL DM Control) +# ppo_playground_constlr — + constant LR (no decay) +# ppo_playground_vnorm_constlr — + vnorm + constant LR +# ppo_playground_constlr_clip03 — + constant LR + clip_eps=0.3 +# ppo_playground_vnorm_constlr_clip03 — + vnorm + constant LR + clip_eps=0.3 +# ppo_playground_brax_policy — 4x32 Brax policy + constant LR + vnorm (RETIRED: underperformed) +# +# Locomotion (Phase 5.2): +# ppo_playground_loco — default loco (4x128 policy, 5x256 value, gamma=0.97, lr=3e-4 constant) +# ppo_playground_loco_go1 — Go1/G1/T1 joystick (512-256-128 both nets, clip=0.3) +# ppo_playground_loco_precise — G1/BerkeleyHumanoid/T1/Apollo (clip=0.2, entropy=0.005) +# +# Manipulation (Phase 5.3): +# ppo_playground_manip — Panda tasks (4x32 policy, gamma=0.97, epoch=8, th=10) +# ppo_playground_manip_aloha — Aloha bimanual (3x256 policy, entropy=0.02) +# ppo_playground_manip_aloha_peg — AlohaSinglePegInsertion (4x256, th=40, lr=3e-4) +# ppo_playground_manip_dexterous — Leap/Aero dexterous (512-256-128, lr=3e-4, th=40, gamma=0.99) +# ppo_playground_manip_robotiq — PandaRobotiqPushCube (4x64 policy, gamma=0.994, th=100, lr=6e-4) +# +# DM Control architecture: asymmetric policy=[64,64]+SiLU, value=[256,256,256]+SiLU +# Loco architecture: policy=[128,128,128,128]+SiLU, value=[256,256,256,256,256]+SiLU +# +# Usage: 
+# slm-lab ... ppo_playground train -s env=playground/CartpoleBalance -s max_frame=100000000 +# slm-lab ... ppo_playground_loco train -s env=playground/Go1Getup -s max_frame=100000000 +# slm-lab ... ppo_playground_manip train -s env=playground/PandaPickCube -s max_frame=20000000 +# +# Batch math: +# DM Control: 2048 envs x 30 steps = 61K, 15 minibatches at minibatch_size=4096 (30 at 2048), 16 epochs = 240 (480) grad steps +# Loco: 2048 envs x 20 steps = 41K, 10 minibatches at minibatch_size=4096, 4 epochs = 40 grad steps +# Manip: 2048 envs x 10 steps = 20K, varies by task +# Robotiq: 2048 envs x 100 steps = 205K, 50 minibatches at minibatch_size=4096, 8 epochs = 400 grad steps + +# --- Shared --- + +_policy_body: &policy_body + modules: + body: + Sequential: + - LazyLinear: {out_features: 64} + - SiLU: + - LazyLinear: {out_features: 64} + - SiLU: + graph: + input: x + modules: + body: [x] + output: body + +_value_body: &value_body + modules: + body: + Sequential: + - LazyLinear: {out_features: 256} + - SiLU: + - LazyLinear: {out_features: 256} + - SiLU: + - LazyLinear: {out_features: 256} + - SiLU: + graph: + input: x + modules: + body: [x] + output: body + +_memory: &memory + name: OnPolicyBatchReplay + +_meta: &meta + distributed: false + log_frequency: 100000 + eval_frequency: 100000 + max_session: 4 + max_trial: 1 + +_env: &env + name: "${env}" + max_t: null + max_frame: "${max_frame}" + normalize_obs: true + +_algorithm: &algorithm + name: PPO + action_pdtype: Normal + gamma: 0.99 + lam: 0.95 + clip_eps_spec: + name: no_decay + start_val: 0.2 + entropy_coef_spec: + name: no_decay + start_val: 0.01 + val_loss_coef: 0.5 + minibatch_size: 4096 + normalize_v_targets: false # Brax default; some envs may need true (see docs/phase5_ops.md) + +_net: &net + type: TorchArcNet + actor_arc: *policy_body + critic_arc: *value_body + shared: false + hid_layers_activation: relu + init_fn: orthogonal_ + clip_grad_val: 1.0 + use_same_optim: false + loss_spec: + name: MSELoss + optim_spec: + name: Adam + lr: 1.0e-3 + eps: 1.0e-5 + lr_scheduler_spec: + name:
LinearToMin + frame: "${max_frame}" + min_factor: 0.033 + gpu: auto + +# --- DM Control: gamma=0.995, 16 epochs, 2048 envs --- + +ppo_playground: + agent: + name: PPO + algorithm: + <<: *algorithm + gamma: 0.995 + time_horizon: 30 + training_epoch: 16 + minibatch_size: 2048 + memory: *memory + net: *net + env: + <<: *env + num_envs: 2048 + meta: *meta + +# --- FingerSpin: gamma=0.95 (official dm_control_suite_params.py override) --- + +ppo_playground_fingerspin: + agent: + name: PPO + algorithm: + <<: *algorithm + gamma: 0.95 + time_horizon: 30 + training_epoch: 16 + minibatch_size: 4096 + memory: *memory + net: *net + env: + <<: *env + num_envs: 2048 + meta: *meta + +# --- PendulumSwingup: training_epoch=4 (official); action_repeat=4 handled in playground.py --- + +ppo_playground_pendulum: + agent: + name: PPO + algorithm: + <<: *algorithm + gamma: 0.995 + time_horizon: 30 + training_epoch: 4 + minibatch_size: 4096 + memory: *memory + net: *net + env: + <<: *env + num_envs: 2048 + meta: *meta + +# --- DM Control + normalize_v_targets=true: for precision/dexterous envs --- +# Use for: AcrobotSwingup, SwimmerSwimmer6, PointMass, FingerTurnEasy/Hard, FishSwim + +ppo_playground_vnorm: + agent: + name: PPO + algorithm: + <<: *algorithm + gamma: 0.995 + time_horizon: 30 + training_epoch: 16 + minibatch_size: 2048 + normalize_v_targets: true + memory: *memory + net: *net + env: + <<: *env + num_envs: 2048 + meta: *meta + +# --- Humanoid DM Control: wider policy (2x256), constant LR, reward_scale=10 --- +# Humanoid has 21 DOF — needs wider policy than 2x64 for multi-joint coordination +# Phase 3 solved Gymnasium Humanoid-v5 (2661 MA) with 2x256 policy + constant LR +# Brax uses reward_scaling=10.0 for ALL DM Control envs (dm_control_suite_params.py) +# Humanoid reward is multiplicative (standing*upright*move*control), all [0,1] — raw signal too small + +ppo_playground_humanoid: + agent: + name: PPO + algorithm: + <<: *algorithm + action_pdtype: NormalTanh # Brax stores 
pre-tanh actions; avoids unstable atanh in 21-DOF space + gamma: 0.995 + time_horizon: 30 + training_epoch: 16 + minibatch_size: 2048 + normalize_v_targets: true + memory: *memory + net: + <<: *net + actor_arc: + modules: + body: + Sequential: + - LazyLinear: {out_features: 256} + - SiLU: + - LazyLinear: {out_features: 256} + - SiLU: + graph: + input: x + modules: + body: [x] + output: body + lr_scheduler_spec: null # constant LR — Brax default, Phase 3 used constant + env: + <<: *env + num_envs: 2048 + reward_scale: 10.0 # Brax default for DM Control — critical for Humanoid's tiny rewards + meta: *meta + +# --- reward_scale=10.0: Brax default for ALL DM Control envs --- +# Research: dm_control_suite_params.py applies reward_scaling=10.0 universally. +# Previously only ppo_playground_humanoid had this. Test on all underperforming envs. + +ppo_playground_rs10: + agent: + name: PPO + algorithm: + <<: *algorithm + gamma: 0.995 + time_horizon: 30 + training_epoch: 16 + minibatch_size: 2048 + memory: *memory + net: + <<: *net + lr_scheduler_spec: null # constant LR — Brax default + env: + <<: *env + num_envs: 2048 + reward_scale: 10.0 + meta: *meta + +# --- Constant LR variants: test Brax default (no LR decay) in isolation --- + +ppo_playground_constlr: + agent: + name: PPO + algorithm: + <<: *algorithm + gamma: 0.995 + time_horizon: 30 + training_epoch: 16 + minibatch_size: 4096 + memory: *memory + net: + <<: *net + lr_scheduler_spec: null # constant LR — Brax default + env: + <<: *env + num_envs: 2048 + meta: *meta + +ppo_playground_vnorm_constlr: + agent: + name: PPO + algorithm: + <<: *algorithm + gamma: 0.995 + time_horizon: 30 + training_epoch: 16 + minibatch_size: 2048 + normalize_v_targets: true + memory: *memory + net: + <<: *net + lr_scheduler_spec: null # constant LR — Brax default + env: + <<: *env + num_envs: 2048 + meta: *meta + +# --- Constant LR + clip_eps=0.3: both Brax defaults, tested together --- + +ppo_playground_constlr_clip03: + agent: + name: PPO 
+ algorithm: + <<: *algorithm + gamma: 0.995 + time_horizon: 30 + training_epoch: 16 + minibatch_size: 2048 + clip_eps_spec: + name: no_decay + start_val: 0.3 + memory: *memory + net: + <<: *net + lr_scheduler_spec: null # constant LR — Brax default + env: + <<: *env + num_envs: 2048 + meta: *meta + +ppo_playground_vnorm_constlr_clip03: + agent: + name: PPO + algorithm: + <<: *algorithm + gamma: 0.995 + time_horizon: 30 + training_epoch: 16 + minibatch_size: 2048 + normalize_v_targets: true + clip_eps_spec: + name: no_decay + start_val: 0.3 + memory: *memory + net: + <<: *net + lr_scheduler_spec: null # constant LR — Brax default + env: + <<: *env + num_envs: 2048 + meta: *meta + +# --- Brax-matched policy (4x32): deeper narrower policy matching Brax default --- + +ppo_playground_brax_policy: + agent: + name: PPO + algorithm: + <<: *algorithm + gamma: 0.995 + time_horizon: 30 + training_epoch: 16 + minibatch_size: 2048 + normalize_v_targets: true + memory: *memory + net: + <<: *net + actor_arc: + modules: + body: + Sequential: + - LazyLinear: {out_features: 32} + - SiLU: + - LazyLinear: {out_features: 32} + - SiLU: + - LazyLinear: {out_features: 32} + - SiLU: + - LazyLinear: {out_features: 32} + - SiLU: + graph: + input: x + modules: + body: [x] + output: body + lr_scheduler_spec: null # constant LR — Brax default + env: + <<: *env + num_envs: 2048 + meta: *meta + +# --- Locomotion: official Brax defaults (gamma=0.97, lr=3e-4 constant, clip=0.3) --- +# Policy: 4x128, Value: 5x256 (official default for most locomotion envs) +# Use for: BarkourJoystick, H1*, Op3, Spot* (default-config envs) +# num_envs=2048 — official uses 8192; all Phase 5.2 benchmark runs used 2048 + +ppo_playground_loco: + agent: + name: PPO + algorithm: + <<: *algorithm + gamma: 0.97 + time_horizon: 20 + training_epoch: 4 + minibatch_size: 4096 + clip_eps_spec: + name: no_decay + start_val: 0.3 + memory: *memory + net: + <<: *net + actor_arc: + modules: + body: + Sequential: + - LazyLinear: 
{out_features: 128} + - SiLU: + - LazyLinear: {out_features: 128} + - SiLU: + - LazyLinear: {out_features: 128} + - SiLU: + - LazyLinear: {out_features: 128} + - SiLU: + graph: + input: x + modules: + body: [x] + output: body + critic_arc: + modules: + body: + Sequential: + - LazyLinear: {out_features: 256} + - SiLU: + - LazyLinear: {out_features: 256} + - SiLU: + - LazyLinear: {out_features: 256} + - SiLU: + - LazyLinear: {out_features: 256} + - SiLU: + - LazyLinear: {out_features: 256} + - SiLU: + graph: + input: x + modules: + body: [x] + output: body + optim_spec: + name: Adam + lr: 3.0e-4 + eps: 1.0e-5 + lr_scheduler_spec: null # constant LR — Brax default + env: + <<: *env + num_envs: 2048 + meta: *meta + +# --- Locomotion Go1/G1/T1: 512-256-128 both nets --- +# Use for: Go1Joystick*, Go1Getup, Go1Handstand, Go1Footstand, Go1Backflip, G1*, T1* +# These envs also expose privileged_state obs; PlaygroundVecEnv passes only the "state" key to both nets +# num_envs=2048 — official uses 8192; all Phase 5.2 benchmark runs used 2048 + +ppo_playground_loco_go1: + agent: + name: PPO + algorithm: + <<: *algorithm + gamma: 0.97 + time_horizon: 20 + training_epoch: 4 + minibatch_size: 4096 + clip_eps_spec: + name: no_decay + start_val: 0.3 + memory: *memory + net: + <<: *net + actor_arc: + modules: + body: + Sequential: + - LazyLinear: {out_features: 512} + - SiLU: + - LazyLinear: {out_features: 256} + - SiLU: + - LazyLinear: {out_features: 128} + - SiLU: + graph: + input: x + modules: + body: [x] + output: body + critic_arc: + modules: + body: + Sequential: + - LazyLinear: {out_features: 512} + - SiLU: + - LazyLinear: {out_features: 256} + - SiLU: + - LazyLinear: {out_features: 128} + - SiLU: + graph: + input: x + modules: + body: [x] + output: body + optim_spec: + name: Adam + lr: 3.0e-4 + eps: 1.0e-5 + lr_scheduler_spec: null # constant LR — Brax default + env: + <<: *env + num_envs: 2048 + meta: *meta + +# --- Locomotion precise: G1, BerkeleyHumanoid, T1, Apollo (clip=0.2, entropy=0.005)
--- + +ppo_playground_loco_precise: + agent: + name: PPO + algorithm: + <<: *algorithm + gamma: 0.97 + time_horizon: 20 + training_epoch: 4 + minibatch_size: 4096 + clip_eps_spec: + name: no_decay + start_val: 0.2 + entropy_coef_spec: + name: no_decay + start_val: 0.005 + memory: *memory + net: + <<: *net + actor_arc: + modules: + body: + Sequential: + - LazyLinear: {out_features: 128} + - SiLU: + - LazyLinear: {out_features: 128} + - SiLU: + - LazyLinear: {out_features: 128} + - SiLU: + - LazyLinear: {out_features: 128} + - SiLU: + graph: + input: x + modules: + body: [x] + output: body + critic_arc: + modules: + body: + Sequential: + - LazyLinear: {out_features: 256} + - SiLU: + - LazyLinear: {out_features: 256} + - SiLU: + - LazyLinear: {out_features: 256} + - SiLU: + - LazyLinear: {out_features: 256} + - SiLU: + - LazyLinear: {out_features: 256} + - SiLU: + graph: + input: x + modules: + body: [x] + output: body + optim_spec: + name: Adam + lr: 3.0e-4 + eps: 1.0e-5 + lr_scheduler_spec: null + env: + <<: *env + num_envs: 2048 + meta: *meta + +# --- Manipulation: Panda tasks (4x32 policy, epoch=8, th=10, entropy=0.02) --- + +ppo_playground_manip: + agent: + name: PPO + algorithm: + <<: *algorithm + gamma: 0.97 + time_horizon: 10 + training_epoch: 8 + minibatch_size: 4096 + entropy_coef_spec: + name: no_decay + start_val: 0.02 + memory: *memory + net: + <<: *net + actor_arc: + modules: + body: + Sequential: + - LazyLinear: {out_features: 32} + - SiLU: + - LazyLinear: {out_features: 32} + - SiLU: + - LazyLinear: {out_features: 32} + - SiLU: + - LazyLinear: {out_features: 32} + - SiLU: + graph: + input: x + modules: + body: [x] + output: body + optim_spec: + name: Adam + lr: 1.0e-3 + eps: 1.0e-5 + lr_scheduler_spec: null + env: + <<: *env + num_envs: 2048 + meta: *meta + +# --- Manipulation: Aloha bimanual (3x256 policy, entropy=0.02) --- + +ppo_playground_manip_aloha: + agent: + name: PPO + algorithm: + <<: *algorithm + gamma: 0.97 + time_horizon: 15 + 
training_epoch: 8 + minibatch_size: 4096 + entropy_coef_spec: + name: no_decay + start_val: 0.02 + memory: *memory + net: + <<: *net + actor_arc: + modules: + body: + Sequential: + - LazyLinear: {out_features: 256} + - SiLU: + - LazyLinear: {out_features: 256} + - SiLU: + - LazyLinear: {out_features: 256} + - SiLU: + graph: + input: x + modules: + body: [x] + output: body + optim_spec: + name: Adam + lr: 1.0e-3 + eps: 1.0e-5 + lr_scheduler_spec: null + env: + <<: *env + num_envs: 2048 + meta: *meta + +# --- Manipulation: AlohaSinglePegInsertion (4x256 policy, th=40, lr=3e-4, entropy=0.01) --- +# Official config differs significantly from AlohaHandOver: deeper policy, lower lr/entropy, longer horizon + +ppo_playground_manip_aloha_peg: + agent: + name: PPO + algorithm: + <<: *algorithm + gamma: 0.97 + time_horizon: 40 + training_epoch: 8 + minibatch_size: 4096 + memory: *memory + net: + <<: *net + actor_arc: + modules: + body: + Sequential: + - LazyLinear: {out_features: 256} + - SiLU: + - LazyLinear: {out_features: 256} + - SiLU: + - LazyLinear: {out_features: 256} + - SiLU: + - LazyLinear: {out_features: 256} + - SiLU: + graph: + input: x + modules: + body: [x] + output: body + optim_spec: + name: Adam + lr: 3.0e-4 + eps: 1.0e-5 + lr_scheduler_spec: null + env: + <<: *env + num_envs: 2048 + meta: *meta + +# --- Manipulation: Leap/Aero dexterous (512-256-128, lr=3e-4, th=40, gamma=0.99) --- +# Official uses gamma=0.99 (not 0.97) for LeapCube and AeroCube envs + +ppo_playground_manip_dexterous: + agent: + name: PPO + algorithm: + <<: *algorithm + gamma: 0.99 + time_horizon: 40 + training_epoch: 4 + minibatch_size: 4096 + memory: *memory + net: + <<: *net + actor_arc: + modules: + body: + Sequential: + - LazyLinear: {out_features: 512} + - SiLU: + - LazyLinear: {out_features: 256} + - SiLU: + - LazyLinear: {out_features: 128} + - SiLU: + graph: + input: x + modules: + body: [x] + output: body + critic_arc: + modules: + body: + Sequential: + - LazyLinear: 
{out_features: 512} + - SiLU: + - LazyLinear: {out_features: 256} + - SiLU: + - LazyLinear: {out_features: 128} + - SiLU: + graph: + input: x + modules: + body: [x] + output: body + optim_spec: + name: Adam + lr: 3.0e-4 + eps: 1.0e-5 + lr_scheduler_spec: null + env: + <<: *env + num_envs: 2048 + meta: *meta + +# --- Manipulation: PandaRobotiqPushCube (4x64 policy, gamma=0.994, th=100, lr=6e-4) --- + +ppo_playground_manip_robotiq: + agent: + name: PPO + algorithm: + <<: *algorithm + gamma: 0.994 + time_horizon: 100 + training_epoch: 8 + minibatch_size: 4096 + memory: *memory + net: + <<: *net + actor_arc: + modules: + body: + Sequential: + - LazyLinear: {out_features: 64} + - SiLU: + - LazyLinear: {out_features: 64} + - SiLU: + - LazyLinear: {out_features: 64} + - SiLU: + - LazyLinear: {out_features: 64} + - SiLU: + graph: + input: x + modules: + body: [x] + output: body + optim_spec: + name: Adam + lr: 6.0e-4 + eps: 1.0e-5 + lr_scheduler_spec: null + env: + <<: *env + num_envs: 2048 + meta: *meta diff --git a/test/env/test_playground.py b/test/env/test_playground.py new file mode 100644 index 000000000..1416d7356 --- /dev/null +++ b/test/env/test_playground.py @@ -0,0 +1,225 @@ +"""Tests for MuJoCo Playground integration.""" + +from unittest.mock import MagicMock, patch + +import gymnasium as gym +from gymnasium import spaces +import numpy as np +import pytest + + +# ============================================================================ +# PlaygroundVecEnv tests (require mujoco_playground) +# ============================================================================ + + +class TestPlaygroundVecEnv: + """Tests for PlaygroundVecEnv with live mujoco_playground.""" + + @pytest.fixture(autouse=True) + def check_playground_available(self): + pytest.importorskip("mujoco_playground") + + @pytest.fixture + def env(self): + from slm_lab.env.playground import PlaygroundVecEnv + + env = PlaygroundVecEnv("CartpoleBalance", num_envs=4) + yield env + env.close() + + def 
test_instantiation(self, env): + assert env.num_envs == 4 + + def test_spaces(self, env): + assert env.single_observation_space is not None + assert env.single_action_space is not None + obs_dim = env.single_observation_space.shape[0] + act_dim = env.single_action_space.shape[0] + assert obs_dim > 0 + assert act_dim > 0 + # Batched spaces should have num_envs in first dim + assert env.observation_space.shape == (4, obs_dim) + assert env.action_space.shape == (4, act_dim) + + def test_reset(self, env): + obs, info = env.reset() + assert isinstance(obs, np.ndarray) + assert obs.shape == (4, env.single_observation_space.shape[0]) + assert obs.dtype == np.float32 + assert isinstance(info, dict) + + def test_step(self, env): + env.reset() + actions = np.random.uniform(-1, 1, size=env.action_space.shape).astype(np.float32) + obs, rewards, terminated, truncated, info = env.step(actions) + + assert obs.shape == (4, env.single_observation_space.shape[0]) + assert obs.dtype == np.float32 + assert rewards.shape == (4,) + assert rewards.dtype == np.float32 + assert terminated.shape == (4,) + assert terminated.dtype == bool + assert truncated.shape == (4,) + assert truncated.dtype == bool + assert isinstance(info, dict) + + def test_reset_with_seed(self, env): + obs1, _ = env.reset(seed=42) + obs2, _ = env.reset(seed=42) + np.testing.assert_array_equal(obs1, obs2) + + def test_multiple_steps(self, env): + env.reset() + for _ in range(10): + actions = np.random.uniform(-1, 1, size=env.action_space.shape).astype(np.float32) + obs, rewards, terminated, truncated, info = env.step(actions) + assert obs.shape[0] == 4 + + +# ============================================================================ +# make_env routing tests (mocked — no mujoco_playground needed) +# ============================================================================ + + +class TestMakeEnvPlaygroundRouting: + """Test that make_env routes playground/ envs to _make_playground_env.""" + + def 
test_playground_prefix_routes_correctly(self): + spec = { + "agent": {"algorithm": {"gamma": 0.99}}, + "env": { + "name": "playground/CartpoleBalance", + "num_envs": 4, + "max_frame": 100000, + }, + "meta": { + "distributed": False, + "eval_frequency": 5000, + "log_frequency": 5000, + "max_session": 1, + }, + } + + with patch("slm_lab.env._make_playground_env") as mock_pg: + # Create a mock env with real gymnasium spaces + obs_space = spaces.Box(low=-np.inf, high=np.inf, shape=(5,), dtype=np.float32) + act_space = spaces.Box(low=-1.0, high=1.0, shape=(1,), dtype=np.float32) + mock_env = MagicMock(spec=gym.vector.VectorEnv) + mock_env.num_envs = 4 + mock_env.is_venv = True + mock_env.single_observation_space = obs_space + mock_env.single_action_space = act_space + mock_env.observation_space = obs_space + mock_env.action_space = act_space + mock_env.spec = None + mock_pg.return_value = mock_env + + from slm_lab.env import make_env + + make_env(spec) + mock_pg.assert_called_once() + call_args = mock_pg.call_args + assert call_args[0][0] == "playground/CartpoleBalance" + assert call_args[0][1] == 4 + + def test_non_playground_does_not_route(self): + spec = { + "agent": {"algorithm": {"gamma": 0.99}}, + "env": { + "name": "CartPole-v1", + "num_envs": 1, + "max_frame": 1000, + }, + "meta": { + "distributed": False, + "eval_frequency": 1000, + "log_frequency": 1000, + "max_session": 1, + }, + } + + with patch("slm_lab.env._make_playground_env") as mock_pg: + from slm_lab.env import make_env + + env = make_env(spec) + mock_pg.assert_not_called() + env.close() + + +# ============================================================================ +# PlaygroundVecEnv impl detection tests (require mujoco_playground) +# ============================================================================ + + +class TestPlaygroundImplDetection: + """Test that PlaygroundVecEnv selects the right impl based on hardware.""" + + @pytest.fixture(autouse=True) + def 
check_playground_available(self): + pytest.importorskip("mujoco_playground") + + def test_impl_is_warp_on_cuda(self): + """On CUDA GPU, impl should be 'warp'.""" + import jax + + if not any(d.platform == "gpu" for d in jax.devices()): + pytest.skip("No CUDA GPU available") + import slm_lab.env.playground as pg_module + + assert pg_module._impl == "warp" + + from slm_lab.env.playground import PlaygroundVecEnv + + env = PlaygroundVecEnv("CartpoleBalance", num_envs=2) + env.close() + + def test_impl_is_jax_on_cpu(self): + """On CPU (no CUDA), impl should be 'jax'.""" + import jax + + if any(d.platform == "gpu" for d in jax.devices()): + pytest.skip("CUDA GPU present — test is for CPU only") + import slm_lab.env.playground as pg_module + + assert pg_module._impl == "jax" + + from slm_lab.env.playground import PlaygroundVecEnv + + env = PlaygroundVecEnv("CartpoleBalance", num_envs=2) + env.close() + + def test_config_overrides_matches_impl(self): + """_config_overrides dict must reflect the selected impl.""" + import slm_lab.env.playground as pg_module + + assert pg_module._config_overrides == {"impl": pg_module._impl} + + def test_impl_is_consistent_with_cuda_flag(self): + """_impl and _has_cuda must agree: warp iff CUDA present.""" + import slm_lab.env.playground as pg_module + + if pg_module._has_cuda: + assert pg_module._impl == "warp" + else: + assert pg_module._impl == "jax" + + +# ============================================================================ +# Import guard tests +# ============================================================================ + + +class TestImportGuard: + """Test that slm_lab.env imports cleanly without mujoco_playground.""" + + def test_env_module_imports_without_playground(self): + """Importing slm_lab.env should not fail if playground is missing. + + The playground import is lazy (inside _make_playground_env), so the + env module should always import successfully. 
+ """ + import slm_lab.env + + assert hasattr(slm_lab.env, "make_env") + assert hasattr(slm_lab.env, "_make_playground_env")