diff --git a/.dstack/run-gpu-train.yml b/.dstack/run-gpu-train.yml
index ac3e34865..02e70d925 100644
--- a/.dstack/run-gpu-train.yml
+++ b/.dstack/run-gpu-train.yml
@@ -16,10 +16,13 @@ env:
   - PROFILE
   - PROF_SKIP
   - PROF_ACTIVE
+  - XLA_PYTHON_CLIENT_PREALLOCATE=false
+  - UV_HTTP_TIMEOUT=300
 
 commands:
   - apt-get update && apt-get install -y swig libgl1-mesa-glx libglib2.0-0 libsm6 libxext6 libxrender-dev libgomp1
-  - cd /workflow && uv sync
+  - cd /workflow && uv sync --group playground  # also install the "playground" dependency group
+  - cd /workflow && uv run python -c "from mujoco_playground._src.mjx_env import ensure_menagerie_exists; ensure_menagerie_exists()"  # NOTE(review): presumably pre-fetches the MuJoCo Menagerie assets before the training run starts — confirm
   - cd /workflow && uv run slm-lab run ${SPEC_VARS} ${SPEC_FILE} ${SPEC_NAME} ${LAB_MODE} --upload-hf
 
 resources:
@@ -29,7 +32,7 @@ resources:
   memory: 32GB..
 
 spot_policy: auto
-max_duration: 8h
+max_duration: 6h
 max_price: 0.50
 retry:
   on_events: [no-capacity]
diff --git a/.githooks/commit-msg b/.githooks/commit-msg
index bdbccfec0..d8d6a30b9 100755
--- a/.githooks/commit-msg
+++ b/.githooks/commit-msg
@@ -1,22 +1,78 @@
 #!/usr/bin/env bash
-# Validate conventional commit format: type: message | type(scope): message
+set -euo pipefail
 
-commit_msg_file="$1"
-commit_msg=$(head -1 "$commit_msg_file")
+# Conventional Commits validation + idempotent semantic version bump.
+#
+# Bumps pyproject.toml version based on this commit's type, always relative
+# to the base branch (master) version, so repeated commits of the same bump
+# level on a feature branch yield the same version (the last commit's type wins).
+#
+# Rules (semver):
+#   breaking (!)  → major  (X.0.0)
+#   feat          → minor  (_.X.0)
+#   everything else → patch (_._.X)
 
-# Skip merge commits and fixup/squash
-if echo "$commit_msg" | grep -qE '^(Merge |fixup! |squash! )'; then
-    exit 0
-fi
+readonly COMMIT_MSG_FILE="$1"  # path to the commit message file (first argument passed to the hook)
+readonly COMMIT_MSG="$(head -1 "$COMMIT_MSG_FILE")"  # first line (subject) only
+readonly TYPES="feat|fix|docs|chore|refactor|test|perf|ci|style|build"  # regex alternation of accepted types
+readonly PYPROJECT="pyproject.toml"  # relative path, resolved against the hook's working directory
+readonly BASE_BRANCH="master"  # ref whose pyproject version is used as the bump baseline
+
+# --- Validation ---
+
+# Merge, fixup! and squash! subjects are exempt: skip both validation and the version bump
+[[ "$COMMIT_MSG" =~ ^(Merge\ |fixup!\ |squash!\ ) ]] && exit 0
+
+# Enforce conventional commit format: type, optional (scope), optional "!", then ": " and a message
+if ! [[ "$COMMIT_MSG" =~ ^($TYPES)(\(.+\))?!?:\ .+ ]]; then
+    cat >&2 <<MSG
+ERROR: Invalid commit message format.
 
-# Validate format
-if ! echo "$commit_msg" | grep -qE '^(feat|fix|docs|chore|refactor|test|perf|ci|style|build)(\(.+\))?: .+'; then
-    echo "ERROR: Invalid commit message format."
-    echo ""
-    echo "  Expected: type: message"
-    echo "  Example:  feat: add user authentication"
-    echo "  Example:  fix(parser): handle empty input"
-    echo ""
-    echo "  Allowed types: feat fix docs chore refactor test perf ci style build"
+  Expected: type: message
+  Example:  feat: add user authentication
+  Example:  fix(parser): handle empty input
+
+  Types: $TYPES
+MSG
     exit 1
 fi
+
+# --- Version bump ---
+
+[[ -f "$PYPROJECT" ]] || exit 0
+
+# read_version <file|REF>: print the first version = "..." value from ./$PYPROJECT ("file") or
+# from REF:$PYPROJECT. "|| true" makes a missing line/ref print nothing instead of tripping set -e.
+read_version() {
+    local source="$1"
+    if [[ "$source" == "file" ]]; then
+        grep '^version = ' "$PYPROJECT" | head -1 | sed 's/version = "//;s/"//' || true
+    else
+        git show "${source}:${PYPROJECT}" 2>/dev/null | grep '^version = ' | head -1 | sed 's/version = "//;s/"//' || true
+    fi
+}
+
+# Baseline is the version on $BASE_BRANCH; fall back to the working tree if that ref has none.
+base_version="$(read_version "$BASE_BRANCH")"
+base_version="${base_version:-$(read_version file)}"
+[[ -z "$base_version" ]] && exit 0
+IFS='.' read -r major minor patch <<< "$base_version"
+# Classify the subject. Plain assignments are used because ((x++)) returns status 1 when x is 0,
+# which would abort the hook under set -e (e.g. a patch bump from X.Y.0).
+if [[ "$COMMIT_MSG" =~ ^($TYPES)(\(.+\))?!: ]]; then
+    major=$((major + 1)); minor=0; patch=0
+elif [[ "$COMMIT_MSG" =~ ^feat(\(.+\))?:  ]]; then
+    minor=$((minor + 1)); patch=0
+else
+    patch=$((patch + 1))
+fi
+
+new_version="${major}.${minor}.${patch}"
+current="$(read_version file)"
+# Only rewrite pyproject when the version changed. -i.bak works on GNU and BSD sed (-i '' is BSD-only).
+if [[ "$current" != "$new_version" ]]; then
+    sed -i.bak "s/^version = \"${current}\"/version = \"${new_version}\"/" "$PYPROJECT"
+    rm -f "${PYPROJECT}.bak"
+    git add "$PYPROJECT"
+    echo "Version: ${base_version} → ${new_version}"
+fi
diff --git a/docs/BENCHMARKS.md b/docs/BENCHMARKS.md
index 7c15e5a08..75179502a 100644
--- a/docs/BENCHMARKS.md
+++ b/docs/BENCHMARKS.md
@@ -110,11 +110,12 @@ Search budget: ~3-4 trials per dimension (8 trials = 2-3 dims, 16 = 3-4 dims, 20
 | Phase | Category | Envs | REINFORCE | SARSA | DQN | DDQN+PER | A2C | PPO | SAC | CrossQ | Overall |
 |-------|----------|------|-----------|-------|-----|----------|-----|-----|-----|--------|---------|
 | 1 | Classic Control | 3 | ✅ | ✅ | ⚠️ | ✅ | ✅ | ✅ | ✅ | ⚠️ | Done |
-| 2 | Box2D | 2 | N/A | N/A | ⚠️ | ✅ | ❌ | ⚠️ | ⚠️ | ⚠️ | Done |
+| 2 | Box2D | 2 | N/A | N/A | ⚠️ | ✅ | | ⚠️ | ⚠️ | ⚠️ | Done |
 | 3 | MuJoCo | 11 | N/A | N/A | N/A | N/A | N/A | ⚠️ | ⚠️ | ⚠️ | Done |
-| 4 | Atari | 57 | N/A | N/A | N/A | Skip | Done | Done | Done | ❌ | Done |
+| 4 | Atari | 57 | N/A | N/A | N/A | Skip | Done | Done | Done | | Done |
+| 5 | Playground | 54 | N/A | N/A | N/A | N/A | N/A | 🔄 | 🔄 | N/A | In progress |
 
-**Legend**: ✅ Solved | ⚠️ Close (>80%) | 📊 Acceptable | ❌ Failed | 🔄 In progress/Pending | Skip Not started | N/A Not applicable
+**Legend**: ✅ Solved | ⚠️ Close (>80%) | 📊 Acceptable | (blank) Failed | 🔄 In progress/Pending | Skip Not started | N/A Not applicable
 
 ---
 
@@ -166,7 +167,7 @@ Search budget: ~3-4 trials per dimension (8 trials = 2-3 dims, 16 = 3-4 dims, 20
 
 | Algorithm | Status | MA | SPEC_FILE | SPEC_NAME | HF Data |
 |-----------|--------|-----|-----------|-----------|---------|
-| A2C | ❌ | -820.74 | [slm_lab/spec/benchmark_arc/a2c/a2c_classic_arc.yaml](../slm_lab/spec/benchmark_arc/a2c/a2c_classic_arc.yaml) | a2c_gae_pendulum_arc | [a2c_gae_pendulum_arc_2026_02_11_162217](https://huggingface.co/datasets/SLM-Lab/benchmark/tree/main/data/a2c_gae_pendulum_arc_2026_02_11_162217) |
+| A2C | | -820.74 | [slm_lab/spec/benchmark_arc/a2c/a2c_classic_arc.yaml](../slm_lab/spec/benchmark_arc/a2c/a2c_classic_arc.yaml) | a2c_gae_pendulum_arc | [a2c_gae_pendulum_arc_2026_02_11_162217](https://huggingface.co/datasets/SLM-Lab/benchmark/tree/main/data/a2c_gae_pendulum_arc_2026_02_11_162217) |
 | PPO | ✅ | -174.87 | [slm_lab/spec/benchmark_arc/ppo/ppo_classic_arc.yaml](../slm_lab/spec/benchmark_arc/ppo/ppo_classic_arc.yaml) | ppo_pendulum_arc | [ppo_pendulum_arc_2026_02_11_162156](https://huggingface.co/datasets/SLM-Lab/benchmark/tree/main/data/ppo_pendulum_arc_2026_02_11_162156) |
 | SAC | ✅ | -150.97 | [slm_lab/spec/benchmark_arc/sac/sac_classic_arc.yaml](../slm_lab/spec/benchmark_arc/sac/sac_classic_arc.yaml) | sac_pendulum_arc | [sac_pendulum_arc_2026_02_11_162240](https://huggingface.co/datasets/SLM-Lab/benchmark/tree/main/data/sac_pendulum_arc_2026_02_11_162240) |
 | CrossQ | ✅ | -145.66 | [slm_lab/spec/benchmark/crossq/crossq_classic.yaml](../slm_lab/spec/benchmark/crossq/crossq_classic.yaml) | crossq_pendulum | [crossq_pendulum_2026_02_28_130648](https://huggingface.co/datasets/SLM-Lab/benchmark/tree/main/data/crossq_pendulum_2026_02_28_130648) |
@@ -185,10 +186,10 @@ Search budget: ~3-4 trials per dimension (8 trials = 2-3 dims, 16 = 3-4 dims, 20
 |-----------|--------|-----|-----------|-----------|---------|
 | DQN | ⚠️ | 195.21 | [slm_lab/spec/benchmark_arc/dqn/dqn_box2d_arc.yaml](../slm_lab/spec/benchmark_arc/dqn/dqn_box2d_arc.yaml) | dqn_concat_lunar_arc | [dqn_concat_lunar_arc_2026_02_11_201407](https://huggingface.co/datasets/SLM-Lab/benchmark/tree/main/data/dqn_concat_lunar_arc_2026_02_11_201407) |
 | DDQN+PER | ✅ | 265.90 | [slm_lab/spec/benchmark_arc/dqn/dqn_box2d_arc.yaml](../slm_lab/spec/benchmark_arc/dqn/dqn_box2d_arc.yaml) | ddqn_per_concat_lunar_arc | [ddqn_per_concat_lunar_arc_2026_02_13_105115](https://huggingface.co/datasets/SLM-Lab/benchmark/tree/main/data/ddqn_per_concat_lunar_arc_2026_02_13_105115) |
-| A2C | ❌ | 27.38 | [slm_lab/spec/benchmark_arc/a2c/a2c_classic_arc.yaml](../slm_lab/spec/benchmark_arc/a2c/a2c_classic_arc.yaml) | a2c_gae_lunar_arc | [a2c_gae_lunar_arc_2026_02_11_224304](https://huggingface.co/datasets/SLM-Lab/benchmark/tree/main/data/a2c_gae_lunar_arc_2026_02_11_224304) |
+| A2C | | 27.38 | [slm_lab/spec/benchmark_arc/a2c/a2c_classic_arc.yaml](../slm_lab/spec/benchmark_arc/a2c/a2c_classic_arc.yaml) | a2c_gae_lunar_arc | [a2c_gae_lunar_arc_2026_02_11_224304](https://huggingface.co/datasets/SLM-Lab/benchmark/tree/main/data/a2c_gae_lunar_arc_2026_02_11_224304) |
 | PPO | ⚠️ | 183.30 | [slm_lab/spec/benchmark_arc/ppo/ppo_box2d_arc.yaml](../slm_lab/spec/benchmark_arc/ppo/ppo_box2d_arc.yaml) | ppo_lunar_arc | [ppo_lunar_arc_2026_02_11_201303](https://huggingface.co/datasets/SLM-Lab/benchmark/tree/main/data/ppo_lunar_arc_2026_02_11_201303) |
 | SAC | ⚠️ | 106.17 | [slm_lab/spec/benchmark_arc/sac/sac_box2d_arc.yaml](../slm_lab/spec/benchmark_arc/sac/sac_box2d_arc.yaml) | sac_lunar_arc | [sac_lunar_arc_2026_02_11_201417](https://huggingface.co/datasets/SLM-Lab/benchmark/tree/main/data/sac_lunar_arc_2026_02_11_201417) |
-| CrossQ | ❌ | 139.21 | [slm_lab/spec/benchmark/crossq/crossq_box2d.yaml](../slm_lab/spec/benchmark/crossq/crossq_box2d.yaml) | crossq_lunar | [crossq_lunar_2026_02_28_130733](https://huggingface.co/datasets/SLM-Lab/benchmark/tree/main/data/crossq_lunar_2026_02_28_130733) |
+| CrossQ | | 139.21 | [slm_lab/spec/benchmark/crossq/crossq_box2d.yaml](../slm_lab/spec/benchmark/crossq/crossq_box2d.yaml) | crossq_lunar | [crossq_lunar_2026_02_28_130733](https://huggingface.co/datasets/SLM-Lab/benchmark/tree/main/data/crossq_lunar_2026_02_28_130733) |
 
 ![LunarLander-v3](plots/LunarLander-v3_multi_trial_graph_mean_returns_ma_vs_frames.png)
 
@@ -200,7 +201,7 @@ Search budget: ~3-4 trials per dimension (8 trials = 2-3 dims, 16 = 3-4 dims, 20
 
 | Algorithm | Status | MA | SPEC_FILE | SPEC_NAME | HF Data |
 |-----------|--------|-----|-----------|-----------|---------|
-| A2C | ❌ | -76.81 | [slm_lab/spec/benchmark_arc/a2c/a2c_classic_arc.yaml](../slm_lab/spec/benchmark_arc/a2c/a2c_classic_arc.yaml) | a2c_gae_lunar_continuous_arc | [a2c_gae_lunar_continuous_arc_2026_02_11_224301](https://huggingface.co/datasets/SLM-Lab/benchmark/tree/main/data/a2c_gae_lunar_continuous_arc_2026_02_11_224301) |
+| A2C | | -76.81 | [slm_lab/spec/benchmark_arc/a2c/a2c_classic_arc.yaml](../slm_lab/spec/benchmark_arc/a2c/a2c_classic_arc.yaml) | a2c_gae_lunar_continuous_arc | [a2c_gae_lunar_continuous_arc_2026_02_11_224301](https://huggingface.co/datasets/SLM-Lab/benchmark/tree/main/data/a2c_gae_lunar_continuous_arc_2026_02_11_224301) |
 | PPO | ⚠️ | 132.58 | [slm_lab/spec/benchmark_arc/ppo/ppo_box2d_arc.yaml](../slm_lab/spec/benchmark_arc/ppo/ppo_box2d_arc.yaml) | ppo_lunar_continuous_arc | [ppo_lunar_continuous_arc_2026_02_11_224229](https://huggingface.co/datasets/SLM-Lab/benchmark/tree/main/data/ppo_lunar_continuous_arc_2026_02_11_224229) |
 | SAC | ⚠️ | 125.00 | [slm_lab/spec/benchmark_arc/sac/sac_box2d_arc.yaml](../slm_lab/spec/benchmark_arc/sac/sac_box2d_arc.yaml) | sac_lunar_continuous_arc | [sac_lunar_continuous_arc_2026_02_12_222203](https://huggingface.co/datasets/SLM-Lab/benchmark/tree/main/data/sac_lunar_continuous_arc_2026_02_12_222203) |
 | CrossQ | ✅ | 268.91 | [slm_lab/spec/benchmark/crossq/crossq_box2d.yaml](../slm_lab/spec/benchmark/crossq/crossq_box2d.yaml) | crossq_lunar_continuous | [crossq_lunar_continuous_2026_03_01_140517](https://huggingface.co/datasets/SLM-Lab/benchmark/tree/main/data/crossq_lunar_continuous_2026_03_01_140517) |
@@ -455,7 +456,7 @@ source .env && slm-lab run-remote --gpu \
 - **A2C**: [a2c_atari_arc.yaml](../slm_lab/spec/benchmark_arc/a2c/a2c_atari_arc.yaml) - RMSprop (lr=7e-4), training_frequency=32
 - **PPO**: [ppo_atari_arc.yaml](../slm_lab/spec/benchmark_arc/ppo/ppo_atari_arc.yaml) - AdamW (lr=2.5e-4), minibatch=256, horizon=128, epochs=4, max_frame=10e6
 - **SAC**: [sac_atari_arc.yaml](../slm_lab/spec/benchmark_arc/sac/sac_atari_arc.yaml) - Categorical SAC, AdamW (lr=3e-4), training_iter=3, training_frequency=4, max_frame=2e6
-- **CrossQ**: [crossq_atari.yaml](../slm_lab/spec/benchmark/crossq/crossq_atari.yaml) - Categorical CrossQ, AdamW (lr=1e-3), training_iter=3, training_frequency=4, max_frame=2e6 (experimental — limited results on 6 games)
+- **CrossQ**: [crossq_atari.yaml](../slm_lab/spec/benchmark/crossq/crossq_atari.yaml) - Categorical CrossQ, Adam (lr=1e-3), training_iter=1, training_frequency=4, max_frame=2e6 (experimental — limited results on 6 games)
 
 **PPO Lambda Variants** (table shows best result per game):
 
@@ -486,7 +487,7 @@ source .env && slm-lab run-remote --gpu -s env=ENV \
 
 > **Note**: HF Data links marked "-" indicate runs completed but not yet uploaded to HuggingFace. Scores are extracted from local trial_metrics.
 
-| ENV | Score | SPEC_NAME | HF Data |
+| ENV | MA | SPEC_NAME | HF Data |
 |-----|-------|-----------|---------|
 | ALE/AirRaid-v5 | 7042.84 | ppo_atari_arc | [ppo_atari_arc_airraid_2026_02_13_124015](https://huggingface.co/datasets/SLM-Lab/benchmark/tree/main/data/ppo_atari_arc_airraid_2026_02_13_124015) |
 | | 1832.54 | sac_atari_arc | [sac_atari_arc_airraid_2026_02_17_104002](https://huggingface.co/datasets/SLM-Lab/benchmark/tree/main/data/sac_atari_arc_airraid_2026_02_17_104002) |
@@ -530,7 +531,7 @@ source .env && slm-lab run-remote --gpu -s env=ENV \
 | ALE/Breakout-v5 | 326.47 | ppo_atari_lam70_arc | [ppo_atari_lam70_arc_breakout_2026_02_13_230455](https://huggingface.co/datasets/SLM-Lab/benchmark/tree/main/data/ppo_atari_lam70_arc_breakout_2026_02_13_230455) |
 | | 20.23 | sac_atari_arc | [sac_atari_arc_breakout_2026_02_15_201235](https://huggingface.co/datasets/SLM-Lab/benchmark/tree/main/data/sac_atari_arc_breakout_2026_02_15_201235) |
 | | 273 | a2c_gae_atari_arc | [a2c_gae_atari_breakout_2026_01_31_213610](https://huggingface.co/datasets/SLM-Lab/benchmark/tree/main/data/a2c_gae_atari_breakout_2026_01_31_213610) |
-| | ❌ 4.40 | crossq_atari | [crossq_atari_breakout_2026_02_25_030241](https://huggingface.co/datasets/SLM-Lab/benchmark/tree/main/data/crossq_atari_breakout_2026_02_25_030241) |
+| | 4.40 | crossq_atari | [crossq_atari_breakout_2026_02_25_030241](https://huggingface.co/datasets/SLM-Lab/benchmark/tree/main/data/crossq_atari_breakout_2026_02_25_030241) |
 | ALE/Carnival-v5 | 3912.59 | ppo_atari_lam70_arc | [ppo_atari_lam70_arc_carnival_2026_02_13_230438](https://huggingface.co/datasets/SLM-Lab/benchmark/tree/main/data/ppo_atari_lam70_arc_carnival_2026_02_13_230438) |
 | | 3501.37 | sac_atari_arc | [sac_atari_arc_carnival_2026_02_17_105834](https://huggingface.co/datasets/SLM-Lab/benchmark/tree/main/data/sac_atari_arc_carnival_2026_02_17_105834) |
 | | 2170 | a2c_gae_atari_arc | [a2c_gae_atari_carnival_2026_02_01_082726](https://huggingface.co/datasets/SLM-Lab/benchmark/tree/main/data/a2c_gae_atari_carnival_2026_02_01_082726) |
@@ -594,7 +595,7 @@ source .env && slm-lab run-remote --gpu -s env=ENV \
 | ALE/MsPacman-v5 | 2330.74 | ppo_atari_lam85_arc | [ppo_atari_lam85_arc_mspacman_2026_02_14_102435](https://huggingface.co/datasets/SLM-Lab/benchmark/tree/main/data/ppo_atari_lam85_arc_mspacman_2026_02_14_102435) |
 | | 1336.96 | sac_atari_arc | [sac_atari_arc_mspacman_2026_02_17_221523](https://huggingface.co/datasets/SLM-Lab/benchmark/tree/main/data/sac_atari_arc_mspacman_2026_02_17_221523) |
 | | 2110 | a2c_gae_atari_arc | [a2c_gae_atari_mspacman_2026_02_01_001100](https://huggingface.co/datasets/SLM-Lab/benchmark/tree/main/data/a2c_gae_atari_mspacman_2026_02_01_001100) |
-| | ❌ 327.79 | crossq_atari | [crossq_atari_mspacman_2026_02_23_171317](https://huggingface.co/datasets/SLM-Lab/benchmark/tree/main/data/crossq_atari_mspacman_2026_02_23_171317) |
+| | 327.79 | crossq_atari | [crossq_atari_mspacman_2026_02_23_171317](https://huggingface.co/datasets/SLM-Lab/benchmark/tree/main/data/crossq_atari_mspacman_2026_02_23_171317) |
 | ALE/NameThisGame-v5 | 6879.23 | ppo_atari_arc | [ppo_atari_arc_namethisgame_2026_02_14_103319](https://huggingface.co/datasets/SLM-Lab/benchmark/tree/main/data/ppo_atari_arc_namethisgame_2026_02_14_103319) |
 | | 3992.71 | sac_atari_arc | [sac_atari_arc_namethisgame_2026_02_17_220905](https://huggingface.co/datasets/SLM-Lab/benchmark/tree/main/data/sac_atari_arc_namethisgame_2026_02_17_220905) |
 | | 5412 | a2c_gae_atari_arc | [a2c_gae_atari_namethisgame_2026_02_01_132733](https://huggingface.co/datasets/SLM-Lab/benchmark/tree/main/data/a2c_gae_atari_namethisgame_2026_02_01_132733) |
@@ -604,14 +605,14 @@ source .env && slm-lab run-remote --gpu -s env=ENV \
 | ALE/Pong-v5 | 16.69 | ppo_atari_lam85_arc | [ppo_atari_lam85_arc_pong_2026_02_14_103722](https://huggingface.co/datasets/SLM-Lab/benchmark/tree/main/data/ppo_atari_lam85_arc_pong_2026_02_14_103722) |
 | | 10.89 | sac_atari_arc | [sac_atari_arc_pong_2026_02_17_160429](https://huggingface.co/datasets/SLM-Lab/benchmark/tree/main/data/sac_atari_arc_pong_2026_02_17_160429) |
 | | 10.17 | a2c_gae_atari_arc | [a2c_gae_atari_pong_2026_01_31_213635](https://huggingface.co/datasets/SLM-Lab/benchmark/tree/main/data/a2c_gae_atari_pong_2026_01_31_213635) |
-| | ❌ -20.59 | crossq_atari | [crossq_atari_pong_2026_02_23_171158](https://huggingface.co/datasets/SLM-Lab/benchmark/tree/main/data/crossq_atari_pong_2026_02_23_171158) |
+| | -20.59 | crossq_atari | [crossq_atari_pong_2026_02_23_171158](https://huggingface.co/datasets/SLM-Lab/benchmark/tree/main/data/crossq_atari_pong_2026_02_23_171158) |
 | ALE/Pooyan-v5 | 5308.66 | ppo_atari_lam70_arc | [ppo_atari_lam70_arc_pooyan_2026_02_14_114730](https://huggingface.co/datasets/SLM-Lab/benchmark/tree/main/data/ppo_atari_lam70_arc_pooyan_2026_02_14_114730) |
 | | 2530.78 | sac_atari_arc | [sac_atari_arc_pooyan_2026_02_17_220346](https://huggingface.co/datasets/SLM-Lab/benchmark/tree/main/data/sac_atari_arc_pooyan_2026_02_17_220346) |
 | | 2997 | a2c_gae_atari_arc | [a2c_gae_atari_pooyan_2026_02_01_132748](https://huggingface.co/datasets/SLM-Lab/benchmark/tree/main/data/a2c_gae_atari_pooyan_2026_02_01_132748) |
 | ALE/Qbert-v5 | 15460.48 | ppo_atari_arc | [ppo_atari_arc_qbert_2026_02_14_120409](https://huggingface.co/datasets/SLM-Lab/benchmark/tree/main/data/ppo_atari_arc_qbert_2026_02_14_120409) |
 | | 3331.98 | sac_atari_arc | [sac_atari_arc_qbert_2026_02_17_223117](https://huggingface.co/datasets/SLM-Lab/benchmark/tree/main/data/sac_atari_arc_qbert_2026_02_17_223117) |
 | | 12619 | a2c_gae_atari_arc | [a2c_gae_atari_qbert_2026_01_31_213720](https://huggingface.co/datasets/SLM-Lab/benchmark/tree/main/data/a2c_gae_atari_qbert_2026_01_31_213720) |
-| | ❌ 3189.73 | crossq_atari | [crossq_atari_qbert_2026_02_25_030458](https://huggingface.co/datasets/SLM-Lab/benchmark/tree/main/data/crossq_atari_qbert_2026_02_25_030458) |
+| | 3189.73 | crossq_atari | [crossq_atari_qbert_2026_02_25_030458](https://huggingface.co/datasets/SLM-Lab/benchmark/tree/main/data/crossq_atari_qbert_2026_02_25_030458) |
 | ALE/Riverraid-v5 | 9599.75 | ppo_atari_lam85_arc | [ppo_atari_lam85_arc_riverraid_2026_02_14_124700](https://huggingface.co/datasets/SLM-Lab/benchmark/tree/main/data/ppo_atari_lam85_arc_riverraid_2026_02_14_124700) |
 | | 4744.95 | sac_atari_arc | [sac_atari_arc_riverraid_2026_02_18_014310](https://huggingface.co/datasets/SLM-Lab/benchmark/tree/main/data/sac_atari_arc_riverraid_2026_02_18_014310) |
 | | 6558 | a2c_gae_atari_arc | [a2c_gae_atari_riverraid_2026_02_01_132507](https://huggingface.co/datasets/SLM-Lab/benchmark/tree/main/data/a2c_gae_atari_riverraid_2026_02_01_132507) |
@@ -624,7 +625,7 @@ source .env && slm-lab run-remote --gpu -s env=ENV \
 | ALE/Seaquest-v5 | 1775.14 | ppo_atari_arc | [ppo_atari_arc_seaquest_2026_02_11_095444](https://huggingface.co/datasets/SLM-Lab/benchmark/tree/main/data/ppo_atari_arc_seaquest_2026_02_11_095444) |
 | | 1565.44 | sac_atari_arc | [sac_atari_arc_seaquest_2026_02_18_020822](https://huggingface.co/datasets/SLM-Lab/benchmark/tree/main/data/sac_atari_arc_seaquest_2026_02_18_020822) |
 | | 850 | a2c_gae_atari_arc | [a2c_gae_atari_seaquest_2026_02_01_001001](https://huggingface.co/datasets/SLM-Lab/benchmark/tree/main/data/a2c_gae_atari_seaquest_2026_02_01_001001) |
-| | ❌ 234.63 | crossq_atari | [crossq_atari_seaquest_2026_02_25_030441](https://huggingface.co/datasets/SLM-Lab/benchmark/tree/main/data/crossq_atari_seaquest_2026_02_25_030441) |
+| | 234.63 | crossq_atari | [crossq_atari_seaquest_2026_02_25_030441](https://huggingface.co/datasets/SLM-Lab/benchmark/tree/main/data/crossq_atari_seaquest_2026_02_25_030441) |
 | ALE/Skiing-v5 | -28217.28 | ppo_atari_arc | [ppo_atari_arc_skiing_2026_02_14_174807](https://huggingface.co/datasets/SLM-Lab/benchmark/tree/main/data/ppo_atari_arc_skiing_2026_02_14_174807) |
 | | -17464.22 | sac_atari_arc | [sac_atari_arc_skiing_2026_02_18_024444](https://huggingface.co/datasets/SLM-Lab/benchmark/tree/main/data/sac_atari_arc_skiing_2026_02_18_024444) |
 | | -14235 | a2c_gae_atari_arc | [a2c_gae_atari_skiing_2026_02_01_132451](https://huggingface.co/datasets/SLM-Lab/benchmark/tree/main/data/a2c_gae_atari_skiing_2026_02_01_132451) |
@@ -634,7 +635,7 @@ source .env && slm-lab run-remote --gpu -s env=ENV \
 | ALE/SpaceInvaders-v5 | 892.49 | ppo_atari_arc | [ppo_atari_arc_spaceinvaders_2026_02_14_131114](https://huggingface.co/datasets/SLM-Lab/benchmark/tree/main/data/ppo_atari_arc_spaceinvaders_2026_02_14_131114) |
 | | 507.33 | sac_atari_arc | [sac_atari_arc_spaceinvaders_2026_02_18_033139](https://huggingface.co/datasets/SLM-Lab/benchmark/tree/main/data/sac_atari_arc_spaceinvaders_2026_02_18_033139) |
 | | 784 | a2c_gae_atari_arc | [a2c_gae_atari_spaceinvaders_2026_02_01_000950](https://huggingface.co/datasets/SLM-Lab/benchmark/tree/main/data/a2c_gae_atari_spaceinvaders_2026_02_01_000950) |
-| | ❌ 404.50 | crossq_atari | [crossq_atari_spaceinvaders_2026_02_25_030410](https://huggingface.co/datasets/SLM-Lab/benchmark/tree/main/data/crossq_atari_spaceinvaders_2026_02_25_030410) |
+| | 404.50 | crossq_atari | [crossq_atari_spaceinvaders_2026_02_25_030410](https://huggingface.co/datasets/SLM-Lab/benchmark/tree/main/data/crossq_atari_spaceinvaders_2026_02_25_030410) |
 | ALE/StarGunner-v5 | 49328.73 | ppo_atari_lam70_arc | [ppo_atari_lam70_arc_stargunner_2026_02_14_131149](https://huggingface.co/datasets/SLM-Lab/benchmark/tree/main/data/ppo_atari_lam70_arc_stargunner_2026_02_14_131149) |
 | | 4295.97 | sac_atari_arc | [sac_atari_arc_stargunner_2026_02_18_033151](https://huggingface.co/datasets/SLM-Lab/benchmark/tree/main/data/sac_atari_arc_stargunner_2026_02_18_033151) |
 | | 8665 | a2c_gae_atari_arc | [a2c_gae_atari_stargunner_2026_02_01_132406](https://huggingface.co/datasets/SLM-Lab/benchmark/tree/main/data/a2c_gae_atari_stargunner_2026_02_01_132406) |
@@ -760,3 +761,123 @@ source .env && slm-lab run-remote --gpu -s env=ENV \
 
 </details>
 
+---
+
+### Phase 5: MuJoCo Playground (JAX/MJX GPU-Accelerated)
+
+[MuJoCo Playground](https://google-deepmind.github.io/mujoco_playground/) | Continuous state/action | MJWarp GPU backend
+
+**Settings**: max_frame 100M | num_envs 2048 | max_session 4
+
+**Spec file**: [ppo_playground.yaml](../slm_lab/spec/benchmark_arc/ppo/ppo_playground.yaml) — all envs via `-s env=playground/ENV`
+
+**Reproduce**:
+```bash
+source .env && slm-lab run-remote --gpu \
+  slm_lab/spec/benchmark_arc/ppo/ppo_playground.yaml SPEC_NAME train \
+  -s env=playground/ENV -s max_frame=100000000 -n NAME
+```
+
+#### Phase 5.1: DM Control Suite (25 envs)
+
+Classic control and locomotion tasks from the DeepMind Control Suite, ported to MJWarp GPU simulation.
+
+| ENV | MA | SPEC_NAME | HF Data |
+|-----|-----|-----------|---------|
+| playground/AcrobotSwingup | 253.24 | ppo_playground_vnorm | [ppo_playground_acrobotswingup_2026_03_12_175809](https://huggingface.co/datasets/SLM-Lab/benchmark/tree/main/data/ppo_playground_acrobotswingup_2026_03_12_175809) |
+| playground/AcrobotSwingupSparse | 146.98 | ppo_playground_vnorm | [ppo_playground_vnorm_acrobotswingupsparse_2026_03_14_161212](https://huggingface.co/datasets/SLM-Lab/benchmark/tree/main/data/ppo_playground_vnorm_acrobotswingupsparse_2026_03_14_161212) |
+| playground/BallInCup | 942.44 | ppo_playground_vnorm | [ppo_playground_ballincup_2026_03_12_105443](https://huggingface.co/datasets/SLM-Lab/benchmark/tree/main/data/ppo_playground_ballincup_2026_03_12_105443) |
+| playground/CartpoleBalance | 968.23 | ppo_playground_vnorm | [ppo_playground_cartpolebalance_2026_03_12_141924](https://huggingface.co/datasets/SLM-Lab/benchmark/tree/main/data/ppo_playground_cartpolebalance_2026_03_12_141924) |
+| playground/CartpoleBalanceSparse | 995.34 | ppo_playground_constlr | [ppo_playground_constlr_cartpolebalancesparse_2026_03_14_000352](https://huggingface.co/datasets/SLM-Lab/benchmark/tree/main/data/ppo_playground_constlr_cartpolebalancesparse_2026_03_14_000352) |
+| playground/CartpoleSwingup | 729.09 | ppo_playground_constlr | [ppo_playground_constlr_cartpoleswingup_2026_03_17_041102](https://huggingface.co/datasets/SLM-Lab/benchmark/tree/main/data/ppo_playground_constlr_cartpoleswingup_2026_03_17_041102) |
+| playground/CartpoleSwingupSparse | 521.98 | ppo_playground_constlr | [ppo_playground_constlr_cartpoleswingupsparse_2026_03_13_233449](https://huggingface.co/datasets/SLM-Lab/benchmark/tree/main/data/ppo_playground_constlr_cartpoleswingupsparse_2026_03_13_233449) |
+| playground/CheetahRun | 883.44 | ppo_playground_vnorm | [ppo_playground_vnorm_cheetahrun_2026_03_14_161211](https://huggingface.co/datasets/SLM-Lab/benchmark/tree/main/data/ppo_playground_vnorm_cheetahrun_2026_03_14_161211) |
+| playground/FingerSpin | 713.35 | ppo_playground_fingerspin | [ppo_playground_fingerspin_fingerspin_2026_03_13_033911](https://huggingface.co/datasets/SLM-Lab/benchmark/tree/main/data/ppo_playground_fingerspin_fingerspin_2026_03_13_033911) |
+| playground/FingerTurnEasy | 663.58 | ppo_playground_vnorm | [ppo_playground_fingerturneasy_2026_03_12_175835](https://huggingface.co/datasets/SLM-Lab/benchmark/tree/main/data/ppo_playground_fingerturneasy_2026_03_12_175835) |
+| playground/FingerTurnHard | 590.43 | ppo_playground_vnorm_constlr | [ppo_playground_vnorm_constlr_fingerturnhard_2026_03_16_234509](https://huggingface.co/datasets/SLM-Lab/benchmark/tree/main/data/ppo_playground_vnorm_constlr_fingerturnhard_2026_03_16_234509) |
+| playground/FishSwim | 580.57 | ppo_playground_vnorm_constlr_clip03 | [ppo_playground_vnorm_constlr_clip03_fishswim_2026_03_14_002112](https://huggingface.co/datasets/SLM-Lab/benchmark/tree/main/data/ppo_playground_vnorm_constlr_clip03_fishswim_2026_03_14_002112) |
+| playground/HopperHop | 22.00 | ppo_playground_vnorm | [ppo_playground_hopperhop_2026_03_12_110855](https://huggingface.co/datasets/SLM-Lab/benchmark/tree/main/data/ppo_playground_hopperhop_2026_03_12_110855) |
+| playground/HopperStand | 237.15 | ppo_playground_vnorm | [ppo_playground_vnorm_hopperstand_2026_03_14_095438](https://huggingface.co/datasets/SLM-Lab/benchmark/tree/main/data/ppo_playground_vnorm_hopperstand_2026_03_14_095438) |
+| playground/HumanoidRun | 18.83 | ppo_playground_humanoid | [ppo_playground_humanoid_humanoidrun_2026_03_14_115522](https://huggingface.co/datasets/SLM-Lab/benchmark/tree/main/data/ppo_playground_humanoid_humanoidrun_2026_03_14_115522) |
+| playground/HumanoidStand | 114.86 | ppo_playground_humanoid | [ppo_playground_humanoid_humanoidstand_2026_03_14_115516](https://huggingface.co/datasets/SLM-Lab/benchmark/tree/main/data/ppo_playground_humanoid_humanoidstand_2026_03_14_115516) |
+| playground/HumanoidWalk | 47.01 | ppo_playground_humanoid | [ppo_playground_humanoid_humanoidwalk_2026_03_14_172235](https://huggingface.co/datasets/SLM-Lab/benchmark/tree/main/data/ppo_playground_humanoid_humanoidwalk_2026_03_14_172235) |
+| playground/PendulumSwingup | 637.46 | ppo_playground_pendulum | [ppo_playground_pendulum_pendulumswingup_2026_03_13_033818](https://huggingface.co/datasets/SLM-Lab/benchmark/tree/main/data/ppo_playground_pendulum_pendulumswingup_2026_03_13_033818) |
+| playground/PointMass | 868.09 | ppo_playground_vnorm_constlr | [ppo_playground_vnorm_constlr_pointmass_2026_03_14_095452](https://huggingface.co/datasets/SLM-Lab/benchmark/tree/main/data/ppo_playground_vnorm_constlr_pointmass_2026_03_14_095452) |
+| playground/ReacherEasy | 955.08 | ppo_playground_vnorm | [ppo_playground_reachereasy_2026_03_12_122115](https://huggingface.co/datasets/SLM-Lab/benchmark/tree/main/data/ppo_playground_reachereasy_2026_03_12_122115) |
+| playground/ReacherHard | 946.99 | ppo_playground_vnorm | [ppo_playground_reacherhard_2026_03_12_123226](https://huggingface.co/datasets/SLM-Lab/benchmark/tree/main/data/ppo_playground_reacherhard_2026_03_12_123226) |
+| playground/SwimmerSwimmer6 | 591.13 | ppo_playground_vnorm_constlr | [ppo_playground_vnorm_constlr_swimmerswimmer6_2026_03_14_000406](https://huggingface.co/datasets/SLM-Lab/benchmark/tree/main/data/ppo_playground_vnorm_constlr_swimmerswimmer6_2026_03_14_000406) |
+| playground/WalkerRun | 759.71 | ppo_playground_vnorm | [ppo_playground_vnorm_walkerrun_2026_03_14_161354](https://huggingface.co/datasets/SLM-Lab/benchmark/tree/main/data/ppo_playground_vnorm_walkerrun_2026_03_14_161354) |
+| playground/WalkerStand | 948.35 | ppo_playground_vnorm | [ppo_playground_vnorm_walkerstand_2026_03_14_161415](https://huggingface.co/datasets/SLM-Lab/benchmark/tree/main/data/ppo_playground_vnorm_walkerstand_2026_03_14_161415) |
+| playground/WalkerWalk | 945.31 | ppo_playground_vnorm | [ppo_playground_vnorm_walkerwalk_2026_03_14_161338](https://huggingface.co/datasets/SLM-Lab/benchmark/tree/main/data/ppo_playground_vnorm_walkerwalk_2026_03_14_161338) |
+
+| | | |
+|---|---|---|
+| ![AcrobotSwingup](plots/AcrobotSwingup_multi_trial_graph_mean_returns_ma_vs_frames.png) | ![AcrobotSwingupSparse](plots/AcrobotSwingupSparse_multi_trial_graph_mean_returns_ma_vs_frames.png) | ![BallInCup](plots/BallInCup_multi_trial_graph_mean_returns_ma_vs_frames.png) |
+| ![CartpoleBalance](plots/CartpoleBalance_multi_trial_graph_mean_returns_ma_vs_frames.png) | ![CartpoleBalanceSparse](plots/CartpoleBalanceSparse_multi_trial_graph_mean_returns_ma_vs_frames.png) | ![CartpoleSwingup](plots/CartpoleSwingup_multi_trial_graph_mean_returns_ma_vs_frames.png) |
+| ![CartpoleSwingupSparse](plots/CartpoleSwingupSparse_multi_trial_graph_mean_returns_ma_vs_frames.png) | ![CheetahRun](plots/CheetahRun_multi_trial_graph_mean_returns_ma_vs_frames.png) | ![FingerSpin](plots/FingerSpin_multi_trial_graph_mean_returns_ma_vs_frames.png) |
+| ![FingerTurnEasy](plots/FingerTurnEasy_multi_trial_graph_mean_returns_ma_vs_frames.png) | ![FingerTurnHard](plots/FingerTurnHard_multi_trial_graph_mean_returns_ma_vs_frames.png) | ![FishSwim](plots/FishSwim_multi_trial_graph_mean_returns_ma_vs_frames.png) |
+| ![HopperHop](plots/HopperHop_multi_trial_graph_mean_returns_ma_vs_frames.png) | ![HopperStand](plots/HopperStand_multi_trial_graph_mean_returns_ma_vs_frames.png) | ![HumanoidRun](plots/HumanoidRun_multi_trial_graph_mean_returns_ma_vs_frames.png) |
+| ![HumanoidStand](plots/HumanoidStand_multi_trial_graph_mean_returns_ma_vs_frames.png) | ![HumanoidWalk](plots/HumanoidWalk_multi_trial_graph_mean_returns_ma_vs_frames.png) | ![PendulumSwingup](plots/PendulumSwingup_multi_trial_graph_mean_returns_ma_vs_frames.png) |
+| ![PointMass](plots/PointMass_multi_trial_graph_mean_returns_ma_vs_frames.png) | ![ReacherEasy](plots/ReacherEasy_multi_trial_graph_mean_returns_ma_vs_frames.png) | ![ReacherHard](plots/ReacherHard_multi_trial_graph_mean_returns_ma_vs_frames.png) |
+| ![SwimmerSwimmer6](plots/SwimmerSwimmer6_multi_trial_graph_mean_returns_ma_vs_frames.png) | ![WalkerRun](plots/WalkerRun_multi_trial_graph_mean_returns_ma_vs_frames.png) | ![WalkerStand](plots/WalkerStand_multi_trial_graph_mean_returns_ma_vs_frames.png) |
+| ![WalkerWalk](plots/WalkerWalk_multi_trial_graph_mean_returns_ma_vs_frames.png) | | |
+
+#### Phase 5.2: Locomotion Robots (19 envs)
+
+Real-world robot locomotion — quadrupeds (Go1, Spot, Barkour) and humanoids (H1, G1, T1, Op3, Apollo, BerkeleyHumanoid) on flat and rough terrain.
+
+| ENV | MA | SPEC_NAME | HF Data |
+|-----|-----|-----------|---------|
+| playground/ApolloJoystickFlatTerrain | 17.44 | ppo_playground_loco_precise | [ppo_playground_loco_precise_apollojoystickflatterrain_2026_03_14_210939](https://huggingface.co/datasets/SLM-Lab/benchmark/tree/main/data/ppo_playground_loco_precise_apollojoystickflatterrain_2026_03_14_210939) |
+| playground/BarkourJoystick | 0.00 | ppo_playground_loco | [ppo_playground_loco_barkourjoystick_2026_03_14_194525](https://huggingface.co/datasets/SLM-Lab/benchmark/tree/main/data/ppo_playground_loco_barkourjoystick_2026_03_14_194525) |
+| playground/BerkeleyHumanoidJoystickFlatTerrain | 32.29 | ppo_playground_loco_precise | [ppo_playground_loco_precise_berkeleyhumanoidjoystickflatterrain_2026_03_14_213019](https://huggingface.co/datasets/SLM-Lab/benchmark/tree/main/data/ppo_playground_loco_precise_berkeleyhumanoidjoystickflatterrain_2026_03_14_213019) |
+| playground/BerkeleyHumanoidJoystickRoughTerrain | 21.25 | ppo_playground_loco_precise | [ppo_playground_loco_precise_berkeleyhumanoidjoystickroughterrain_2026_03_15_150211](https://huggingface.co/datasets/SLM-Lab/benchmark/tree/main/data/ppo_playground_loco_precise_berkeleyhumanoidjoystickroughterrain_2026_03_15_150211) |
+| playground/G1JoystickFlatTerrain | 1.85 | ppo_playground_loco_precise | [ppo_playground_loco_precise_g1joystickflatterrain_2026_03_15_150219](https://huggingface.co/datasets/SLM-Lab/benchmark/tree/main/data/ppo_playground_loco_precise_g1joystickflatterrain_2026_03_15_150219) |
+| playground/G1JoystickRoughTerrain | -2.75 | ppo_playground_loco_precise | [ppo_playground_loco_precise_g1joystickroughterrain_2026_03_19_015137](https://huggingface.co/datasets/SLM-Lab/benchmark/tree/main/data/ppo_playground_loco_precise_g1joystickroughterrain_2026_03_19_015137) |
+| playground/Go1Footstand | 23.48 | ppo_playground_loco_precise | [ppo_playground_loco_precise_go1footstand_2026_03_16_174009](https://huggingface.co/datasets/SLM-Lab/benchmark/tree/main/data/ppo_playground_loco_precise_go1footstand_2026_03_16_174009) |
+| playground/Go1Getup | 18.16 | ppo_playground_loco_go1 | [ppo_playground_loco_go1_go1getup_2026_03_16_132801](https://huggingface.co/datasets/SLM-Lab/benchmark/tree/main/data/ppo_playground_loco_go1_go1getup_2026_03_16_132801) |
+| playground/Go1Handstand | 17.88 | ppo_playground_loco_precise | [ppo_playground_loco_precise_go1handstand_2026_03_16_155437](https://huggingface.co/datasets/SLM-Lab/benchmark/tree/main/data/ppo_playground_loco_precise_go1handstand_2026_03_16_155437) |
+| playground/Go1JoystickFlatTerrain | 0.00 | ppo_playground_loco | [ppo_playground_loco_go1joystickflatterrain_2026_03_14_204658](https://huggingface.co/datasets/SLM-Lab/benchmark/tree/main/data/ppo_playground_loco_go1joystickflatterrain_2026_03_14_204658) |
+| playground/Go1JoystickRoughTerrain | 0.00 | ppo_playground_loco | [ppo_playground_loco_go1joystickroughterrain_2026_03_15_150321](https://huggingface.co/datasets/SLM-Lab/benchmark/tree/main/data/ppo_playground_loco_go1joystickroughterrain_2026_03_15_150321) |
+| playground/H1InplaceGaitTracking | 11.95 | ppo_playground_loco_precise | [ppo_playground_loco_precise_h1inplacegaittracking_2026_03_16_170327](https://huggingface.co/datasets/SLM-Lab/benchmark/tree/main/data/ppo_playground_loco_precise_h1inplacegaittracking_2026_03_16_170327) |
+| playground/H1JoystickGaitTracking | 31.11 | ppo_playground_loco_precise | [ppo_playground_loco_precise_h1joystickgaittracking_2026_03_16_170412](https://huggingface.co/datasets/SLM-Lab/benchmark/tree/main/data/ppo_playground_loco_precise_h1joystickgaittracking_2026_03_16_170412) |
+| playground/Op3Joystick | 0.00 | ppo_playground_loco | [ppo_playground_loco_op3joystick_2026_03_15_150120](https://huggingface.co/datasets/SLM-Lab/benchmark/tree/main/data/ppo_playground_loco_op3joystick_2026_03_15_150120) |
+| playground/SpotFlatTerrainJoystick | 48.58 | ppo_playground_loco_precise | [ppo_playground_loco_precise_spotflatterrainjoystick_2026_03_16_180747](https://huggingface.co/datasets/SLM-Lab/benchmark/tree/main/data/ppo_playground_loco_precise_spotflatterrainjoystick_2026_03_16_180747) |
+| playground/SpotGetup | 19.39 | ppo_playground_loco | [ppo_playground_loco_spotgetup_2026_03_14_213703](https://huggingface.co/datasets/SLM-Lab/benchmark/tree/main/data/ppo_playground_loco_spotgetup_2026_03_14_213703) |
+| playground/SpotJoystickGaitTracking | 36.90 | ppo_playground_loco | [ppo_playground_loco_spotjoystickgaittracking_2026_03_19_015106](https://huggingface.co/datasets/SLM-Lab/benchmark/tree/main/data/ppo_playground_loco_spotjoystickgaittracking_2026_03_19_015106) |
+| playground/T1JoystickFlatTerrain | 13.42 | ppo_playground_loco_precise | [ppo_playground_loco_precise_t1joystickflatterrain_2026_03_14_220250](https://huggingface.co/datasets/SLM-Lab/benchmark/tree/main/data/ppo_playground_loco_precise_t1joystickflatterrain_2026_03_14_220250) |
+| playground/T1JoystickRoughTerrain | 2.58 | ppo_playground_loco_precise | [ppo_playground_loco_precise_t1joystickroughterrain_2026_03_15_162332](https://huggingface.co/datasets/SLM-Lab/benchmark/tree/main/data/ppo_playground_loco_precise_t1joystickroughterrain_2026_03_15_162332) |
+
+| | | |
+|---|---|---|
+| ![ApolloJoystickFlatTerrain](plots/ApolloJoystickFlatTerrain_multi_trial_graph_mean_returns_ma_vs_frames.png) | ![BarkourJoystick](plots/BarkourJoystick_multi_trial_graph_mean_returns_ma_vs_frames.png) | ![BerkeleyHumanoidJoystickFlatTerrain](plots/BerkeleyHumanoidJoystickFlatTerrain_multi_trial_graph_mean_returns_ma_vs_frames.png) |
+| ![G1JoystickFlatTerrain](plots/G1JoystickFlatTerrain_multi_trial_graph_mean_returns_ma_vs_frames.png) | ![Go1Footstand](plots/Go1Footstand_multi_trial_graph_mean_returns_ma_vs_frames.png) | ![Go1Handstand](plots/Go1Handstand_multi_trial_graph_mean_returns_ma_vs_frames.png) |
+| ![H1InplaceGaitTracking](plots/H1InplaceGaitTracking_multi_trial_graph_mean_returns_ma_vs_frames.png) | ![H1JoystickGaitTracking](plots/H1JoystickGaitTracking_multi_trial_graph_mean_returns_ma_vs_frames.png) | ![Op3Joystick](plots/Op3Joystick_multi_trial_graph_mean_returns_ma_vs_frames.png) |
+| ![SpotFlatTerrainJoystick](plots/SpotFlatTerrainJoystick_multi_trial_graph_mean_returns_ma_vs_frames.png) | ![SpotGetup](plots/SpotGetup_multi_trial_graph_mean_returns_ma_vs_frames.png) | ![SpotJoystickGaitTracking](plots/SpotJoystickGaitTracking_multi_trial_graph_mean_returns_ma_vs_frames.png) |
+| ![BerkeleyHumanoidJoystickRoughTerrain](plots/BerkeleyHumanoidJoystickRoughTerrain_multi_trial_graph_mean_returns_ma_vs_frames.png) | ![Go1Getup](plots/Go1Getup_multi_trial_graph_mean_returns_ma_vs_frames.png) | ![Go1JoystickFlatTerrain](plots/Go1JoystickFlatTerrain_multi_trial_graph_mean_returns_ma_vs_frames.png) |
+| ![Go1JoystickRoughTerrain](plots/Go1JoystickRoughTerrain_multi_trial_graph_mean_returns_ma_vs_frames.png) | ![T1JoystickFlatTerrain](plots/T1JoystickFlatTerrain_multi_trial_graph_mean_returns_ma_vs_frames.png) | ![T1JoystickRoughTerrain](plots/T1JoystickRoughTerrain_multi_trial_graph_mean_returns_ma_vs_frames.png) |
+
+#### Phase 5.3: Manipulation (10 envs)
+
+Robotic manipulation — Panda arm pick/place, Aloha bimanual, Leap dexterous hand, and AeroCube orientation tasks.
+
+| ENV | MA | SPEC_NAME | HF Data |
+|-----|-----|-----------|---------|
+| playground/AeroCubeRotateZAxis | -3.09 | ppo_playground_loco | [ppo_playground_loco_aerocuberotatezaxis_2026_03_20_012502](https://huggingface.co/datasets/SLM-Lab/benchmark/tree/main/data/ppo_playground_loco_aerocuberotatezaxis_2026_03_20_012502) |
+| playground/AlohaHandOver | 3.65 | ppo_playground_loco | [ppo_playground_loco_alohahandover_2026_03_15_023712](https://huggingface.co/datasets/SLM-Lab/benchmark/tree/main/data/ppo_playground_loco_alohahandover_2026_03_15_023712) |
+| playground/AlohaSinglePegInsertion | 220.93 | ppo_playground_manip_aloha_peg | [ppo_playground_manip_aloha_peg_alohasinglepeginsertion_2026_03_17_122613](https://huggingface.co/datasets/SLM-Lab/benchmark/tree/main/data/ppo_playground_manip_aloha_peg_alohasinglepeginsertion_2026_03_17_122613) |
+| playground/LeapCubeReorient | 74.68 | ppo_playground_loco | [ppo_playground_loco_leapcubereorient_2026_03_15_150420](https://huggingface.co/datasets/SLM-Lab/benchmark/tree/main/data/ppo_playground_loco_leapcubereorient_2026_03_15_150420) |
+| playground/LeapCubeRotateZAxis | 91.65 | ppo_playground_loco | [ppo_playground_loco_leapcuberotatezaxis_2026_03_15_150334](https://huggingface.co/datasets/SLM-Lab/benchmark/tree/main/data/ppo_playground_loco_leapcuberotatezaxis_2026_03_15_150334) |
+| playground/PandaOpenCabinet | 11081.51 | ppo_playground_loco | [ppo_playground_loco_pandaopencabinet_2026_03_15_150318](https://huggingface.co/datasets/SLM-Lab/benchmark/tree/main/data/ppo_playground_loco_pandaopencabinet_2026_03_15_150318) |
+| playground/PandaPickCube | 4586.13 | ppo_playground_loco | [ppo_playground_loco_pandapickcube_2026_03_15_023744](https://huggingface.co/datasets/SLM-Lab/benchmark/tree/main/data/ppo_playground_loco_pandapickcube_2026_03_15_023744) |
+| playground/PandaPickCubeCartesian | 10.58 | ppo_playground_loco | [ppo_playground_loco_pandapickcubecartesian_2026_03_15_023810](https://huggingface.co/datasets/SLM-Lab/benchmark/tree/main/data/ppo_playground_loco_pandapickcubecartesian_2026_03_15_023810) |
+| playground/PandaPickCubeOrientation | 4281.66 | ppo_playground_loco | [ppo_playground_loco_pandapickcubeorientation_2026_03_19_015108](https://huggingface.co/datasets/SLM-Lab/benchmark/tree/main/data/ppo_playground_loco_pandapickcubeorientation_2026_03_19_015108) |
+| playground/PandaRobotiqPushCube | 1.31 | ppo_playground_loco | [ppo_playground_loco_pandarobotiqpushcube_2026_03_15_042131](https://huggingface.co/datasets/SLM-Lab/benchmark/tree/main/data/ppo_playground_loco_pandarobotiqpushcube_2026_03_15_042131) |
+
+| | | |
+|---|---|---|
+| ![AeroCubeRotateZAxis](plots/AeroCubeRotateZAxis_multi_trial_graph_mean_returns_ma_vs_frames.png) | ![AlohaHandOver](plots/AlohaHandOver_multi_trial_graph_mean_returns_ma_vs_frames.png) | ![AlohaSinglePegInsertion](plots/AlohaSinglePegInsertion_multi_trial_graph_mean_returns_ma_vs_frames.png) |
+| ![LeapCubeReorient](plots/LeapCubeReorient_multi_trial_graph_mean_returns_ma_vs_frames.png) | ![LeapCubeRotateZAxis](plots/LeapCubeRotateZAxis_multi_trial_graph_mean_returns_ma_vs_frames.png) | ![PandaOpenCabinet](plots/PandaOpenCabinet_multi_trial_graph_mean_returns_ma_vs_frames.png) |
+| ![PandaPickCube](plots/PandaPickCube_multi_trial_graph_mean_returns_ma_vs_frames.png) | ![PandaPickCubeCartesian](plots/PandaPickCubeCartesian_multi_trial_graph_mean_returns_ma_vs_frames.png) | ![PandaPickCubeOrientation](plots/PandaPickCubeOrientation_multi_trial_graph_mean_returns_ma_vs_frames.png) |
+| ![PandaRobotiqPushCube](plots/PandaRobotiqPushCube_multi_trial_graph_mean_returns_ma_vs_frames.png) | | |
+
diff --git a/docs/plots/AcrobotSwingupSparse_multi_trial_graph_mean_returns_ma_vs_frames.png b/docs/plots/AcrobotSwingupSparse_multi_trial_graph_mean_returns_ma_vs_frames.png
new file mode 100644
index 000000000..ca1cb681e
Binary files /dev/null and b/docs/plots/AcrobotSwingupSparse_multi_trial_graph_mean_returns_ma_vs_frames.png differ
diff --git a/docs/plots/AcrobotSwingup_multi_trial_graph_mean_returns_ma_vs_frames.png b/docs/plots/AcrobotSwingup_multi_trial_graph_mean_returns_ma_vs_frames.png
new file mode 100644
index 000000000..e9f5d1993
Binary files /dev/null and b/docs/plots/AcrobotSwingup_multi_trial_graph_mean_returns_ma_vs_frames.png differ
diff --git a/docs/plots/AeroCubeRotateZAxis_multi_trial_graph_mean_returns_ma_vs_frames.png b/docs/plots/AeroCubeRotateZAxis_multi_trial_graph_mean_returns_ma_vs_frames.png
new file mode 100644
index 000000000..afedaef80
Binary files /dev/null and b/docs/plots/AeroCubeRotateZAxis_multi_trial_graph_mean_returns_ma_vs_frames.png differ
diff --git a/docs/plots/AlohaHandOver_multi_trial_graph_mean_returns_ma_vs_frames.png b/docs/plots/AlohaHandOver_multi_trial_graph_mean_returns_ma_vs_frames.png
new file mode 100644
index 000000000..7a236a555
Binary files /dev/null and b/docs/plots/AlohaHandOver_multi_trial_graph_mean_returns_ma_vs_frames.png differ
diff --git a/docs/plots/AlohaSinglePegInsertion_multi_trial_graph_mean_returns_ma_vs_frames.png b/docs/plots/AlohaSinglePegInsertion_multi_trial_graph_mean_returns_ma_vs_frames.png
new file mode 100644
index 000000000..cabb7331f
Binary files /dev/null and b/docs/plots/AlohaSinglePegInsertion_multi_trial_graph_mean_returns_ma_vs_frames.png differ
diff --git a/docs/plots/ApolloJoystickFlatTerrain_multi_trial_graph_mean_returns_ma_vs_frames.png b/docs/plots/ApolloJoystickFlatTerrain_multi_trial_graph_mean_returns_ma_vs_frames.png
new file mode 100644
index 000000000..775a55fe6
Binary files /dev/null and b/docs/plots/ApolloJoystickFlatTerrain_multi_trial_graph_mean_returns_ma_vs_frames.png differ
diff --git a/docs/plots/BallInCup_multi_trial_graph_mean_returns_ma_vs_frames.png b/docs/plots/BallInCup_multi_trial_graph_mean_returns_ma_vs_frames.png
new file mode 100644
index 000000000..b0d09734b
Binary files /dev/null and b/docs/plots/BallInCup_multi_trial_graph_mean_returns_ma_vs_frames.png differ
diff --git a/docs/plots/BarkourJoystick_multi_trial_graph_mean_returns_ma_vs_frames.png b/docs/plots/BarkourJoystick_multi_trial_graph_mean_returns_ma_vs_frames.png
new file mode 100644
index 000000000..d8e917f57
Binary files /dev/null and b/docs/plots/BarkourJoystick_multi_trial_graph_mean_returns_ma_vs_frames.png differ
diff --git a/docs/plots/BerkeleyHumanoidJoystickFlatTerrain_multi_trial_graph_mean_returns_ma_vs_frames.png b/docs/plots/BerkeleyHumanoidJoystickFlatTerrain_multi_trial_graph_mean_returns_ma_vs_frames.png
new file mode 100644
index 000000000..1301dc6aa
Binary files /dev/null and b/docs/plots/BerkeleyHumanoidJoystickFlatTerrain_multi_trial_graph_mean_returns_ma_vs_frames.png differ
diff --git a/docs/plots/BerkeleyHumanoidJoystickRoughTerrain_multi_trial_graph_mean_returns_ma_vs_frames.png b/docs/plots/BerkeleyHumanoidJoystickRoughTerrain_multi_trial_graph_mean_returns_ma_vs_frames.png
new file mode 100644
index 000000000..164c0576d
Binary files /dev/null and b/docs/plots/BerkeleyHumanoidJoystickRoughTerrain_multi_trial_graph_mean_returns_ma_vs_frames.png differ
diff --git a/docs/plots/CartpoleBalanceSparse_multi_trial_graph_mean_returns_ma_vs_frames.png b/docs/plots/CartpoleBalanceSparse_multi_trial_graph_mean_returns_ma_vs_frames.png
new file mode 100644
index 000000000..36393690e
Binary files /dev/null and b/docs/plots/CartpoleBalanceSparse_multi_trial_graph_mean_returns_ma_vs_frames.png differ
diff --git a/docs/plots/CartpoleBalance_multi_trial_graph_mean_returns_ma_vs_frames.png b/docs/plots/CartpoleBalance_multi_trial_graph_mean_returns_ma_vs_frames.png
new file mode 100644
index 000000000..4754ef437
Binary files /dev/null and b/docs/plots/CartpoleBalance_multi_trial_graph_mean_returns_ma_vs_frames.png differ
diff --git a/docs/plots/CartpoleSwingupSparse_multi_trial_graph_mean_returns_ma_vs_frames.png b/docs/plots/CartpoleSwingupSparse_multi_trial_graph_mean_returns_ma_vs_frames.png
new file mode 100644
index 000000000..994552715
Binary files /dev/null and b/docs/plots/CartpoleSwingupSparse_multi_trial_graph_mean_returns_ma_vs_frames.png differ
diff --git a/docs/plots/CartpoleSwingup_multi_trial_graph_mean_returns_ma_vs_frames.png b/docs/plots/CartpoleSwingup_multi_trial_graph_mean_returns_ma_vs_frames.png
new file mode 100644
index 000000000..5f02730b8
Binary files /dev/null and b/docs/plots/CartpoleSwingup_multi_trial_graph_mean_returns_ma_vs_frames.png differ
diff --git a/docs/plots/CheetahRun_multi_trial_graph_mean_returns_ma_vs_frames.png b/docs/plots/CheetahRun_multi_trial_graph_mean_returns_ma_vs_frames.png
new file mode 100644
index 000000000..29eb8bd98
Binary files /dev/null and b/docs/plots/CheetahRun_multi_trial_graph_mean_returns_ma_vs_frames.png differ
diff --git a/docs/plots/FingerSpin_multi_trial_graph_mean_returns_ma_vs_frames.png b/docs/plots/FingerSpin_multi_trial_graph_mean_returns_ma_vs_frames.png
new file mode 100644
index 000000000..ee2438497
Binary files /dev/null and b/docs/plots/FingerSpin_multi_trial_graph_mean_returns_ma_vs_frames.png differ
diff --git a/docs/plots/FingerTurnEasy_multi_trial_graph_mean_returns_ma_vs_frames.png b/docs/plots/FingerTurnEasy_multi_trial_graph_mean_returns_ma_vs_frames.png
new file mode 100644
index 000000000..ad60d0252
Binary files /dev/null and b/docs/plots/FingerTurnEasy_multi_trial_graph_mean_returns_ma_vs_frames.png differ
diff --git a/docs/plots/FingerTurnHard_multi_trial_graph_mean_returns_ma_vs_frames.png b/docs/plots/FingerTurnHard_multi_trial_graph_mean_returns_ma_vs_frames.png
new file mode 100644
index 000000000..a3de98da2
Binary files /dev/null and b/docs/plots/FingerTurnHard_multi_trial_graph_mean_returns_ma_vs_frames.png differ
diff --git a/docs/plots/FishSwim_multi_trial_graph_mean_returns_ma_vs_frames.png b/docs/plots/FishSwim_multi_trial_graph_mean_returns_ma_vs_frames.png
new file mode 100644
index 000000000..1a994e2ff
Binary files /dev/null and b/docs/plots/FishSwim_multi_trial_graph_mean_returns_ma_vs_frames.png differ
diff --git a/docs/plots/G1JoystickFlatTerrain_multi_trial_graph_mean_returns_ma_vs_frames.png b/docs/plots/G1JoystickFlatTerrain_multi_trial_graph_mean_returns_ma_vs_frames.png
new file mode 100644
index 000000000..1cf4b529f
Binary files /dev/null and b/docs/plots/G1JoystickFlatTerrain_multi_trial_graph_mean_returns_ma_vs_frames.png differ
diff --git a/docs/plots/G1JoystickRoughTerrain_multi_trial_graph_mean_returns_ma_vs_frames.png b/docs/plots/G1JoystickRoughTerrain_multi_trial_graph_mean_returns_ma_vs_frames.png
new file mode 100644
index 000000000..ca0c19cd6
Binary files /dev/null and b/docs/plots/G1JoystickRoughTerrain_multi_trial_graph_mean_returns_ma_vs_frames.png differ
diff --git a/docs/plots/Go1Footstand_multi_trial_graph_mean_returns_ma_vs_frames.png b/docs/plots/Go1Footstand_multi_trial_graph_mean_returns_ma_vs_frames.png
new file mode 100644
index 000000000..7ecf0aec2
Binary files /dev/null and b/docs/plots/Go1Footstand_multi_trial_graph_mean_returns_ma_vs_frames.png differ
diff --git a/docs/plots/Go1Getup_multi_trial_graph_mean_returns_ma_vs_frames.png b/docs/plots/Go1Getup_multi_trial_graph_mean_returns_ma_vs_frames.png
new file mode 100644
index 000000000..2f65a7f6b
Binary files /dev/null and b/docs/plots/Go1Getup_multi_trial_graph_mean_returns_ma_vs_frames.png differ
diff --git a/docs/plots/Go1Handstand_multi_trial_graph_mean_returns_ma_vs_frames.png b/docs/plots/Go1Handstand_multi_trial_graph_mean_returns_ma_vs_frames.png
new file mode 100644
index 000000000..6886cb0de
Binary files /dev/null and b/docs/plots/Go1Handstand_multi_trial_graph_mean_returns_ma_vs_frames.png differ
diff --git a/docs/plots/Go1JoystickFlatTerrain_multi_trial_graph_mean_returns_ma_vs_frames.png b/docs/plots/Go1JoystickFlatTerrain_multi_trial_graph_mean_returns_ma_vs_frames.png
new file mode 100644
index 000000000..49885e784
Binary files /dev/null and b/docs/plots/Go1JoystickFlatTerrain_multi_trial_graph_mean_returns_ma_vs_frames.png differ
diff --git a/docs/plots/Go1JoystickRoughTerrain_multi_trial_graph_mean_returns_ma_vs_frames.png b/docs/plots/Go1JoystickRoughTerrain_multi_trial_graph_mean_returns_ma_vs_frames.png
new file mode 100644
index 000000000..189e680ae
Binary files /dev/null and b/docs/plots/Go1JoystickRoughTerrain_multi_trial_graph_mean_returns_ma_vs_frames.png differ
diff --git a/docs/plots/H1InplaceGaitTracking_multi_trial_graph_mean_returns_ma_vs_frames.png b/docs/plots/H1InplaceGaitTracking_multi_trial_graph_mean_returns_ma_vs_frames.png
new file mode 100644
index 000000000..8a5bd1630
Binary files /dev/null and b/docs/plots/H1InplaceGaitTracking_multi_trial_graph_mean_returns_ma_vs_frames.png differ
diff --git a/docs/plots/H1JoystickGaitTracking_multi_trial_graph_mean_returns_ma_vs_frames.png b/docs/plots/H1JoystickGaitTracking_multi_trial_graph_mean_returns_ma_vs_frames.png
new file mode 100644
index 000000000..11e4e9dfe
Binary files /dev/null and b/docs/plots/H1JoystickGaitTracking_multi_trial_graph_mean_returns_ma_vs_frames.png differ
diff --git a/docs/plots/HopperHop_multi_trial_graph_mean_returns_ma_vs_frames.png b/docs/plots/HopperHop_multi_trial_graph_mean_returns_ma_vs_frames.png
new file mode 100644
index 000000000..91b18f6cb
Binary files /dev/null and b/docs/plots/HopperHop_multi_trial_graph_mean_returns_ma_vs_frames.png differ
diff --git a/docs/plots/HopperStand_multi_trial_graph_mean_returns_ma_vs_frames.png b/docs/plots/HopperStand_multi_trial_graph_mean_returns_ma_vs_frames.png
new file mode 100644
index 000000000..c81509155
Binary files /dev/null and b/docs/plots/HopperStand_multi_trial_graph_mean_returns_ma_vs_frames.png differ
diff --git a/docs/plots/HumanoidRun_multi_trial_graph_mean_returns_ma_vs_frames.png b/docs/plots/HumanoidRun_multi_trial_graph_mean_returns_ma_vs_frames.png
new file mode 100644
index 000000000..030061127
Binary files /dev/null and b/docs/plots/HumanoidRun_multi_trial_graph_mean_returns_ma_vs_frames.png differ
diff --git a/docs/plots/HumanoidStand_multi_trial_graph_mean_returns_ma_vs_frames.png b/docs/plots/HumanoidStand_multi_trial_graph_mean_returns_ma_vs_frames.png
new file mode 100644
index 000000000..98e5bcd21
Binary files /dev/null and b/docs/plots/HumanoidStand_multi_trial_graph_mean_returns_ma_vs_frames.png differ
diff --git a/docs/plots/HumanoidWalk_multi_trial_graph_mean_returns_ma_vs_frames.png b/docs/plots/HumanoidWalk_multi_trial_graph_mean_returns_ma_vs_frames.png
new file mode 100644
index 000000000..e450f5cd2
Binary files /dev/null and b/docs/plots/HumanoidWalk_multi_trial_graph_mean_returns_ma_vs_frames.png differ
diff --git a/docs/plots/LeapCubeReorient_multi_trial_graph_mean_returns_ma_vs_frames.png b/docs/plots/LeapCubeReorient_multi_trial_graph_mean_returns_ma_vs_frames.png
new file mode 100644
index 000000000..08184ab09
Binary files /dev/null and b/docs/plots/LeapCubeReorient_multi_trial_graph_mean_returns_ma_vs_frames.png differ
diff --git a/docs/plots/LeapCubeRotateZAxis_multi_trial_graph_mean_returns_ma_vs_frames.png b/docs/plots/LeapCubeRotateZAxis_multi_trial_graph_mean_returns_ma_vs_frames.png
new file mode 100644
index 000000000..d2010bf16
Binary files /dev/null and b/docs/plots/LeapCubeRotateZAxis_multi_trial_graph_mean_returns_ma_vs_frames.png differ
diff --git a/docs/plots/Op3Joystick_multi_trial_graph_mean_returns_ma_vs_frames.png b/docs/plots/Op3Joystick_multi_trial_graph_mean_returns_ma_vs_frames.png
new file mode 100644
index 000000000..d7d975c07
Binary files /dev/null and b/docs/plots/Op3Joystick_multi_trial_graph_mean_returns_ma_vs_frames.png differ
diff --git a/docs/plots/PandaOpenCabinet_multi_trial_graph_mean_returns_ma_vs_frames.png b/docs/plots/PandaOpenCabinet_multi_trial_graph_mean_returns_ma_vs_frames.png
new file mode 100644
index 000000000..f39e41677
Binary files /dev/null and b/docs/plots/PandaOpenCabinet_multi_trial_graph_mean_returns_ma_vs_frames.png differ
diff --git a/docs/plots/PandaPickCubeCartesian_multi_trial_graph_mean_returns_ma_vs_frames.png b/docs/plots/PandaPickCubeCartesian_multi_trial_graph_mean_returns_ma_vs_frames.png
new file mode 100644
index 000000000..3ee1f8e19
Binary files /dev/null and b/docs/plots/PandaPickCubeCartesian_multi_trial_graph_mean_returns_ma_vs_frames.png differ
diff --git a/docs/plots/PandaPickCubeOrientation_multi_trial_graph_mean_returns_ma_vs_frames.png b/docs/plots/PandaPickCubeOrientation_multi_trial_graph_mean_returns_ma_vs_frames.png
new file mode 100644
index 000000000..cb032577c
Binary files /dev/null and b/docs/plots/PandaPickCubeOrientation_multi_trial_graph_mean_returns_ma_vs_frames.png differ
diff --git a/docs/plots/PandaPickCube_multi_trial_graph_mean_returns_ma_vs_frames.png b/docs/plots/PandaPickCube_multi_trial_graph_mean_returns_ma_vs_frames.png
new file mode 100644
index 000000000..63a1b6cfe
Binary files /dev/null and b/docs/plots/PandaPickCube_multi_trial_graph_mean_returns_ma_vs_frames.png differ
diff --git a/docs/plots/PandaRobotiqPushCube_multi_trial_graph_mean_returns_ma_vs_frames.png b/docs/plots/PandaRobotiqPushCube_multi_trial_graph_mean_returns_ma_vs_frames.png
new file mode 100644
index 000000000..d62c4ef62
Binary files /dev/null and b/docs/plots/PandaRobotiqPushCube_multi_trial_graph_mean_returns_ma_vs_frames.png differ
diff --git a/docs/plots/PendulumSwingup_multi_trial_graph_mean_returns_ma_vs_frames.png b/docs/plots/PendulumSwingup_multi_trial_graph_mean_returns_ma_vs_frames.png
new file mode 100644
index 000000000..9b2f3d06e
Binary files /dev/null and b/docs/plots/PendulumSwingup_multi_trial_graph_mean_returns_ma_vs_frames.png differ
diff --git a/docs/plots/PointMass_multi_trial_graph_mean_returns_ma_vs_frames.png b/docs/plots/PointMass_multi_trial_graph_mean_returns_ma_vs_frames.png
new file mode 100644
index 000000000..9f98091a6
Binary files /dev/null and b/docs/plots/PointMass_multi_trial_graph_mean_returns_ma_vs_frames.png differ
diff --git a/docs/plots/ReacherEasy_multi_trial_graph_mean_returns_ma_vs_frames.png b/docs/plots/ReacherEasy_multi_trial_graph_mean_returns_ma_vs_frames.png
new file mode 100644
index 000000000..5ed0c345c
Binary files /dev/null and b/docs/plots/ReacherEasy_multi_trial_graph_mean_returns_ma_vs_frames.png differ
diff --git a/docs/plots/ReacherHard_multi_trial_graph_mean_returns_ma_vs_frames.png b/docs/plots/ReacherHard_multi_trial_graph_mean_returns_ma_vs_frames.png
new file mode 100644
index 000000000..382a7a08b
Binary files /dev/null and b/docs/plots/ReacherHard_multi_trial_graph_mean_returns_ma_vs_frames.png differ
diff --git a/docs/plots/SpotFlatTerrainJoystick_multi_trial_graph_mean_returns_ma_vs_frames.png b/docs/plots/SpotFlatTerrainJoystick_multi_trial_graph_mean_returns_ma_vs_frames.png
new file mode 100644
index 000000000..0abd8074f
Binary files /dev/null and b/docs/plots/SpotFlatTerrainJoystick_multi_trial_graph_mean_returns_ma_vs_frames.png differ
diff --git a/docs/plots/SpotGetup_multi_trial_graph_mean_returns_ma_vs_frames.png b/docs/plots/SpotGetup_multi_trial_graph_mean_returns_ma_vs_frames.png
new file mode 100644
index 000000000..961900e6f
Binary files /dev/null and b/docs/plots/SpotGetup_multi_trial_graph_mean_returns_ma_vs_frames.png differ
diff --git a/docs/plots/SpotJoystickGaitTracking_multi_trial_graph_mean_returns_ma_vs_frames.png b/docs/plots/SpotJoystickGaitTracking_multi_trial_graph_mean_returns_ma_vs_frames.png
new file mode 100644
index 000000000..b2d04cee2
Binary files /dev/null and b/docs/plots/SpotJoystickGaitTracking_multi_trial_graph_mean_returns_ma_vs_frames.png differ
diff --git a/docs/plots/SwimmerSwimmer6_multi_trial_graph_mean_returns_ma_vs_frames.png b/docs/plots/SwimmerSwimmer6_multi_trial_graph_mean_returns_ma_vs_frames.png
new file mode 100644
index 000000000..a610b40af
Binary files /dev/null and b/docs/plots/SwimmerSwimmer6_multi_trial_graph_mean_returns_ma_vs_frames.png differ
diff --git a/docs/plots/T1JoystickFlatTerrain_multi_trial_graph_mean_returns_ma_vs_frames.png b/docs/plots/T1JoystickFlatTerrain_multi_trial_graph_mean_returns_ma_vs_frames.png
new file mode 100644
index 000000000..83f229232
Binary files /dev/null and b/docs/plots/T1JoystickFlatTerrain_multi_trial_graph_mean_returns_ma_vs_frames.png differ
diff --git a/docs/plots/T1JoystickRoughTerrain_multi_trial_graph_mean_returns_ma_vs_frames.png b/docs/plots/T1JoystickRoughTerrain_multi_trial_graph_mean_returns_ma_vs_frames.png
new file mode 100644
index 000000000..da9f0154b
Binary files /dev/null and b/docs/plots/T1JoystickRoughTerrain_multi_trial_graph_mean_returns_ma_vs_frames.png differ
diff --git a/docs/plots/WalkerRun_multi_trial_graph_mean_returns_ma_vs_frames.png b/docs/plots/WalkerRun_multi_trial_graph_mean_returns_ma_vs_frames.png
new file mode 100644
index 000000000..c2abfaf13
Binary files /dev/null and b/docs/plots/WalkerRun_multi_trial_graph_mean_returns_ma_vs_frames.png differ
diff --git a/docs/plots/WalkerStand_multi_trial_graph_mean_returns_ma_vs_frames.png b/docs/plots/WalkerStand_multi_trial_graph_mean_returns_ma_vs_frames.png
new file mode 100644
index 000000000..1da1e3fc9
Binary files /dev/null and b/docs/plots/WalkerStand_multi_trial_graph_mean_returns_ma_vs_frames.png differ
diff --git a/docs/plots/WalkerWalk_multi_trial_graph_mean_returns_ma_vs_frames.png b/docs/plots/WalkerWalk_multi_trial_graph_mean_returns_ma_vs_frames.png
new file mode 100644
index 000000000..e2f12f1e7
Binary files /dev/null and b/docs/plots/WalkerWalk_multi_trial_graph_mean_returns_ma_vs_frames.png differ
diff --git a/pyproject.toml b/pyproject.toml
index 624956e0d..7e6b54cf5 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -1,6 +1,6 @@
 [project]
 name = "slm-lab"
-version = "5.2.0"
+version = "5.3.0"
 description = "Modular Deep Reinforcement Learning framework in PyTorch."
 readme = "README.md"
 requires-python = ">=3.12.0"
diff --git a/slm_lab/env/__init__.py b/slm_lab/env/__init__.py
index e0cf8568a..ffac34c1a 100644
--- a/slm_lab/env/__init__.py
+++ b/slm_lab/env/__init__.py
@@ -20,6 +20,7 @@
     NormalizeReward as VectorNormalizeReward,
     RecordEpisodeStatistics as VectorRecordEpisodeStatistics,
     RescaleAction as VectorRescaleAction,
+    TransformReward as VectorTransformReward,
 )
 
 from slm_lab.env.wrappers import (
@@ -45,6 +46,22 @@
 except ImportError:
     pass
 
+# Register Pavlovian environment
+gym.register(
+    id="SLM/Pavlovian-v0",
+    entry_point="slm_lab.env.pavlovian:PavlovianEnv",
+    max_episode_steps=1000,
+)
+
+# Register Sensorimotor environments (TC-11 through TC-24)
+for _tc_id in range(11, 25):
+    gym.register(
+        id=f"SLM-Sensorimotor-TC{_tc_id:02d}-v0",
+        entry_point="slm_lab.env.sensorimotor:SLMSensorimotor",
+        kwargs={"task_id": f"TC-{_tc_id:02d}"},
+        max_episode_steps=500,
+    )
+
 logger = logger.get_logger(__name__)
 
 # Keys handled by make_env, not passed to gym.make
@@ -57,6 +74,8 @@
     "normalize_reward",
     "clip_obs",
     "clip_reward",
+    "device",
+    "reward_scale",
 }
 
 
@@ -150,16 +169,92 @@ def _set_env_attributes(env: gym.Env, spec: dict[str, Any]) -> None:
     env.done = False
 
 
+def _make_playground_env(
+    name: str,
+    num_envs: int,
+    normalize_obs: bool,
+    normalize_reward: bool,
+    clip_obs: float | None,
+    clip_reward: float | None,
+    gamma: float,
+    device: str | None = None,
+    render_mode: str | None = None,
+    reward_scale: float = 1.0,
+) -> gym.Env:
+    """Create a MuJoCo Playground vectorized environment.
+
+    With ``device`` set, only torch observation normalization is applied; the numpy-only
+    clip_obs / normalize_reward / clip_reward wrappers are skipped, and that is logged.
+    Raises ImportError (chained to the cause) when the playground modules cannot be imported.
+    """
+    try:
+        from slm_lab.env.playground import PlaygroundVecEnv
+        from slm_lab.env.wrappers import PlaygroundRenderWrapper, TorchNormalizeObservation
+    except ImportError as err:
+        raise ImportError(
+            "MuJoCo Playground is required for playground/ environments. "
+            "Install with: uv sync --group playground"
+        ) from err
+
+    # Prevent JAX from pre-allocating GPU memory when sharing with PyTorch
+    if device is not None:
+        os.environ.setdefault("XLA_PYTHON_CLIENT_PREALLOCATE", "false")
+
+    # Strip "playground/" prefix to get the env name for the registry
+    pg_env_name = name.removeprefix("playground/")
+    env = PlaygroundVecEnv(pg_env_name, num_envs, device=device)
+    logger.info(f"Playground: JAX→PyTorch via {'DLPack zero-copy (GPU)' if device else 'numpy (CPU)'}")
+
+    if _needs_action_rescaling(env):
+        action_space = env.single_action_space
+        logger.info(
+            f"Action rescaling: [{action_space.low.min():.1f}, {action_space.high.max():.1f}] → [-1, 1]"
+        )
+        env = VectorRescaleAction(env, min_action=-1.0, max_action=1.0)
+
+    env = VectorRecordEpisodeStatistics(env)
+
+    if reward_scale != 1.0:
+        env = VectorTransformReward(env, lambda r: r * reward_scale)
+
+    if render_mode:
+        env = PlaygroundRenderWrapper(env)
+
+    if device is not None:
+        if normalize_obs:
+            env = TorchNormalizeObservation(env)
+        # numpy-only wrappers are skipped in GPU mode; report any that were requested
+        requested = (("clip_obs", clip_obs is not None), ("normalize_reward", normalize_reward), ("clip_reward", clip_reward is not None))
+        skipped = [opt for opt, on in requested if on]
+        if skipped:
+            logger.info(f"Playground GPU mode: skipping numpy-only wrappers for {skipped}")
+        return env
+
+    if normalize_obs:
+        env = VectorNormalizeObservation(env)
+    if clip_obs is not None:
+        env = VectorClipObservation(env, bound=float(clip_obs))
+    if normalize_reward:
+        env = VectorNormalizeReward(env, gamma=gamma)
+    if clip_reward is not None:
+        scalar = isinstance(clip_reward, (int, float))
+        lo, hi = (-clip_reward, clip_reward) if scalar else (clip_reward[0], clip_reward[1])
+        env = VectorClipReward(env, min_reward=lo, max_reward=hi)
+    return env
+
+
 def make_env(spec: dict[str, Any]) -> gym.Env:
     """Create a gymnasium environment.
 
     Gymnasium defaults are sensible - only override what's needed.
     For Atari (ALE/*), AtariVectorEnv handles all preprocessing natively.
+    For Playground (playground/*), uses JAX-based MuJoCo Playground backend.
     """
     env_spec = spec["env"]
     name = env_spec["name"]
     num_envs = env_spec.get("num_envs", 1)
     is_atari = name.startswith("ALE/")
+    is_playground = name.startswith("playground/")
     render_mode = "human" if render() else None
 
     # Pass through env kwargs (life_loss_info, repeat_action_probability, etc.)
@@ -172,7 +267,27 @@ def make_env(spec: dict[str, Any]) -> gym.Env:
     clip_reward = env_spec.get("clip_reward", 10.0 if normalize_reward else None)
     gamma = spec.get("agent", {}).get("algorithm", {}).get("gamma", 0.99)
 
-    if num_envs > 1:
+    device = env_spec.get("device")
+    if is_playground and (device is None or device == "auto"):
+        import torch
+        device = "cuda" if torch.cuda.is_available() else None
+
+    if is_playground:
+        logger.info(f"Playground device: {'GPU (cuda) — DLPack zero-copy' if device else 'CPU — numpy transfer'}")
+        reward_scale = env_spec.get("reward_scale", 1.0)
+        env = _make_playground_env(
+            name,
+            num_envs,
+            normalize_obs,
+            normalize_reward,
+            clip_obs,
+            clip_reward,
+            gamma,
+            device=device,
+            render_mode=render_mode,
+            reward_scale=reward_scale,
+        )
+    elif num_envs > 1:
         env = _make_vector_env(
             name,
             num_envs,
diff --git a/slm_lab/env/playground.py b/slm_lab/env/playground.py
new file mode 100644
index 000000000..fcc7a52e4
--- /dev/null
+++ b/slm_lab/env/playground.py
@@ -0,0 +1,215 @@
+"""MuJoCo Playground environment wrapper for SLM-Lab.
+
+Wraps MuJoCo Playground (JAX/MJWarp) environments as gymnasium VectorEnv,
+enabling use with SLM-Lab's training loop. BraxAutoResetWrapper handles batched
+step/reset internally; arrays become numpy, or torch tensors via DLPack when a device is set.
+
+Uses MJWarp backend (Warp-accelerated MJX) uniformly for GPU simulation.
+JAX is the dispatch/tracing layer; Warp CUDA kernels handle physics.
+"""
+
+import os
+import gymnasium as gym
+import jax
+import jax.numpy as jnp
+import numpy as np
+from gymnasium import spaces
+from gymnasium.vector.utils import batch_space
+
+try:
+    from mujoco_playground import registry as pg_registry
+    from mujoco_playground import wrapper as pg_wrapper
+    from mujoco_playground._src import mjx_env as _mjx_env_module
+except ImportError:
+    raise ImportError(
+        "MuJoCo Playground is required for playground environments. "
+        "Install with: uv sync --group playground"
+    )
+
+# Monkey-patch mjx_env.make_data to ensure naccdmax is set when missing.
+# Some mujoco_warp versions default naccdmax=None to 0, causing CCD buffer
+# overflow for envs with mesh/convex colliders. We resolve None to naconmax
+# (the total active-contact buffer), which is always a safe upper bound.
+_original_make_data = _mjx_env_module.make_data
+
+
+def _patched_make_data(*args, **kwargs):
+    """Call the original make_data, filling a missing keyword naccdmax from keyword naconmax."""
+    contact_cap = kwargs.get("naconmax")
+    if contact_cap is not None and kwargs.get("naccdmax") is None:
+        kwargs["naccdmax"] = contact_cap
+    return _original_make_data(*args, **kwargs)
+
+
+_mjx_env_module.make_data = _patched_make_data
+
+# Suppress MuJoCo C-level stderr warnings (ccd_iterations, nefc/broadphase overflow) that
+# repeat every step for 100M frames and explode log/output size on dstack. Done once, after the
+# first step, by pointing fd 2 at /dev/null: no per-call overhead or sync barriers, but everything written to fd 2 afterwards is dropped too.
+_stderr_suppressed = False
+
+
+# Per-env action_repeat from official dm_control_suite_params.py
+# These match mujoco_playground's canonical training configs exactly.
+_ACTION_REPEAT: dict[str, int] = {
+    "PendulumSwingup": 4,
+}
+
+
+def _build_config_overrides(env_name: str) -> dict:
+    """Return the config overrides to pass to the registry for ``env_name``.
+
+    Adds impl='warp' when the env's default config has an ``impl`` attribute.
+    Adds njmax=None when the default njmax equals 0 (meant to trigger
+    auto-detection via _default_njmax() — that helper is not visible here).
+    """
+    cfg = pg_registry.get_default_config(env_name)
+    result: dict = {}
+    if hasattr(cfg, "impl"):
+        result["impl"] = "warp"
+    if getattr(cfg, "njmax", None) == 0:  # None == 0 is False, so a missing njmax is left alone
+        result["njmax"] = None
+    return result
+
+
+class PlaygroundVecEnv(gym.vector.VectorEnv):
+    """Vectorized wrapper for MuJoCo Playground environments.
+
+    Uses MJWarp backend uniformly (impl='warp'). BraxAutoResetWrapper handles
+    batched execution internally. Converts JAX arrays to numpy or torch tensors
+    via DLPack at the API boundary for SLM-Lab's PyTorch training loop.
+    """
+
+    def __init__(
+        self,
+        env_name: str,
+        num_envs: int,
+        seed: int = 0,
+        episode_length: int = 1000,
+        device: str | None = None,
+    ):
+        """Load ``env_name``, wrap it for batched stepping, and build the spaces.
+
+        With ``device`` set, observations come back as torch tensors; with None, numpy.
+        ``seed`` initialises the JAX PRNG key that ``reset`` splits into per-env keys.
+        """
+        self._env_name = env_name
+        self._device = device
+        if device is not None:
+            import torch
+
+            self._torch_device = torch.device(device)
+
+        # Load the MJX environment and wrap for batched training
+        # wrap_for_brax_training applies: VmapWrapper → EpisodeWrapper → BraxAutoResetWrapper
+        # impl='warp' is requested by _build_config_overrides whenever the default config has `impl`
+        config_overrides = _build_config_overrides(env_name)
+        base_env = pg_registry.load(env_name, config_overrides=config_overrides)
+        self._base_env = base_env  # kept for rendering
+        action_repeat = _ACTION_REPEAT.get(env_name, 1)
+        self._env = pg_wrapper.wrap_for_brax_training(
+            base_env, episode_length=episode_length, action_repeat=action_repeat
+        )
+
+        # Build observation and action spaces
+        obs_size = base_env.observation_size
+        if isinstance(obs_size, dict):
+            # Use only "state" key when present — excludes privileged_state from actor input
+            sizes = [obs_size["state"]] if "state" in obs_size else list(obs_size.values())
+        else:
+            sizes = [obs_size]
+        # np.prod accepts both plain int sizes and shape tuples
+        total_obs_dim = sum(int(np.prod(s)) for s in sizes)
+        act_size = base_env.action_size
+        obs_space = spaces.Box(
+            low=-np.inf, high=np.inf, shape=(total_obs_dim,), dtype=np.float32
+        )
+        act_space = spaces.Box(low=-1.0, high=1.0, shape=(act_size,), dtype=np.float32)
+
+        # Set VectorEnv attributes directly (gymnasium 1.x has no __init__)
+        self.num_envs = num_envs
+        self.single_observation_space = obs_space
+        self.single_action_space = act_space
+        self.observation_space = batch_space(obs_space, num_envs)
+        self.action_space = batch_space(act_space, num_envs)
+
+        # JIT-compile reset and step (BraxAutoResetWrapper handles batching internally)
+        self._jit_reset = jax.jit(self._env.reset)
+        self._jit_step = jax.jit(self._env.step)
+
+        # Initialize RNG
+        self._rng = jax.random.PRNGKey(seed)
+        self._state = None
+
+    def _to_output(self, x: jax.Array):
+        """Convert JAX array to output format. DLPack zero-copy when JAX+PyTorch both on GPU."""
+        if self._device is not None:
+            import torch
+
+            t = torch.from_dlpack(x)
+            # If JAX is on CPU but device is cuda, move explicitly (CPU->GPU copy)
+            return t if t.is_cuda else t.to(self._device)
+        return np.asarray(x).astype(np.float32)
+
+    def _get_obs(self, state):
+        obs = state.obs
+        if isinstance(obs, dict):
+            # Use only "state" when available (excludes privileged_state); build the concat fallback lazily
+            obs = obs["state"] if "state" in obs else jnp.concatenate([obs[k] for k in sorted(obs)], axis=-1)
+        return self._to_output(obs)
+
+    def reset(self, *, seed: int | None = None, options: dict | None = None):
+        if seed is not None:
+            self._rng = jax.random.PRNGKey(seed)
+        # Slice the split keys as one array rather than unpacking num_envs Python objects and re-stacking
+        keys = jax.random.split(self._rng, self.num_envs + 1)
+        self._rng, sub_keys = keys[0], keys[1:]
+        self._state = self._jit_reset(sub_keys)
+        return self._get_obs(self._state), {}
+
+    def step(self, actions: np.ndarray):
+        jax_actions = jnp.array(actions, dtype=jnp.float32)
+        self._state = self._jit_step(self._state, jax_actions)
+        # Suppress stderr permanently after first step — MuJoCo C warnings
+        # repeat every step, but JAX async means we can't suppress per-call
+        # without block_until_ready (which kills performance ~10x for slow envs).
+        global _stderr_suppressed
+        if not _stderr_suppressed:
+            _stderr_suppressed = True
+            devnull = os.open(os.devnull, os.O_WRONLY)
+            os.dup2(devnull, 2)
+            os.close(devnull)
+
+        obs = self._get_obs(self._state)
+        # Rewards, dones, info always numpy (used for control flow and memory)
+        rewards = np.asarray(self._state.reward).astype(np.float32)
+        dones = np.asarray(self._state.done).astype(bool)
+
+        # Brax EpisodeWrapper sets state.info['truncation'] (1 = time limit, 0 = not)
+        truncation = self._state.info.get("truncation", None)
+        if truncation is not None:
+            truncated = np.asarray(truncation).astype(bool)
+            terminated = dones & ~truncated
+        else:
+            terminated = dones
+            truncated = np.zeros_like(dones, dtype=bool)
+
+        # Extract metrics as info
+        info = {}
+        if self._state.metrics:
+            for k, v in self._state.metrics.items():
+                info[k] = np.asarray(v)
+
+        return obs, rewards, terminated, truncated, info
+
+    def close(self):
+        self._state = None
+
+    def render(self):
+        """Render env[0] as an RGB array using MuJoCo renderer."""
+        if self._state is None:
+            return None
+        # Extract first env's state from the batched pytree
+        state_0 = jax.tree.map(lambda x: x[0], self._state)
+        frames = self._base_env.render([state_0], height=240, width=320)
+        return np.array(frames[0])
diff --git a/slm_lab/env/wrappers.py b/slm_lab/env/wrappers.py
index 82de4ffc5..9edbed4a2 100644
--- a/slm_lab/env/wrappers.py
+++ b/slm_lab/env/wrappers.py
@@ -6,6 +6,7 @@
 import gymnasium as gym
 import numpy as np
 import pandas as pd
+import torch
 from slm_lab.lib import util
 
 
@@ -86,7 +87,9 @@ def total_reward(self):
         Priority: VectorFullGameStatistics > RecordEpisodeStatistics > TrackReward
         This ensures we report full-game scores for Atari with life_loss_info.
         """
-        from gymnasium.wrappers.vector import RecordEpisodeStatistics as VectorRecordEpisodeStatistics
+        from gymnasium.wrappers.vector import (
+            RecordEpisodeStatistics as VectorRecordEpisodeStatistics,
+        )
 
         env = self.env
         while env is not None:
@@ -240,8 +243,8 @@ def step(self, actions):
     def _get_base_env(self):
         """Find base env with call() method."""
         env = self.env
-        while hasattr(env, 'env'):
-            if hasattr(env, 'call'):
+        while hasattr(env, "env"):
+            if hasattr(env, "call"):
                 return env
             env = env.env
         return env
@@ -253,14 +256,16 @@ def _render_grid(self):
             return
 
         base_env = self._get_base_env()
-        frames = base_env.call("render") if hasattr(base_env, 'call') else None
+        frames = base_env.call("render") if hasattr(base_env, "call") else None
         if frames is None or frames[0] is None:
             return
 
         if self.window is None:
             pygame.init()
             frame_h, frame_w = frames[0].shape[:2]
-            self.window = pygame.display.set_mode((frame_w * self.grid_cols, frame_h * self.grid_rows))
+            self.window = pygame.display.set_mode(
+                (frame_w * self.grid_cols, frame_h * self.grid_rows)
+            )
             pygame.display.set_caption(f"Vector Env ({self.num_envs} envs)")
             self.clock = pygame.time.Clock()
 
@@ -286,6 +291,99 @@ def _render_grid(self):
     def close(self):
         if self.window is not None:
             import pygame
+
             pygame.quit()
             self.window = None
         return super().close()
+
+
+class PlaygroundRenderWrapper(gym.vector.VectorWrapper):
+    """Show the frame returned by the wrapped env's render() in a pygame window."""
+
+    def __init__(self, env: gym.vector.VectorEnv, render_freq: int = 1):
+        super().__init__(env)
+        self.render_freq = render_freq
+        self.step_count = 0
+        self.window = self.clock = None  # pygame objects, created on the first drawn frame
+
+    def step(self, actions):
+        """Step the wrapped env; draw when step_count is a multiple of render_freq."""
+        out = self.env.step(actions)
+        self.step_count += 1
+        if not self.step_count % self.render_freq:
+            self._show()
+        return out
+
+    def reset(self, **kwargs):
+        """Reset the wrapped env and draw the resulting frame."""
+        out = self.env.reset(**kwargs)
+        self._show()
+        return out
+
+    def _show(self):
+        """Draw one frame; does nothing if pygame is missing or render() returns None."""
+        try:
+            import pygame
+        except ImportError:
+            return
+        img = self.env.render()
+        if img is None:
+            return
+        if self.window is None:
+            pygame.init()
+            height, width = img.shape[:2]
+            self.window = pygame.display.set_mode((width, height))
+            pygame.display.set_caption("MuJoCo Playground")
+            self.clock = pygame.time.Clock()
+        self.window.blit(pygame.surfarray.make_surface(img.swapaxes(0, 1)), (0, 0))
+        pygame.display.flip()
+        self.clock.tick(60)
+        if any(ev.type == pygame.QUIT for ev in pygame.event.get()):
+            self.close()
+            raise KeyboardInterrupt("Render window closed")
+
+    def close(self):
+        """Quit pygame if a window was opened, then defer to the parent close()."""
+        if self.window is not None:
+            import pygame
+            pygame.quit()
+            self.window = None
+        return super().close()
+
+
+class TorchNormalizeObservation(gym.vector.VectorWrapper):
+    """Normalize torch observations with a running per-feature mean/variance over batches."""
+
+    def __init__(self, env: gym.vector.VectorEnv, epsilon: float = 1e-8):
+        super().__init__(env)
+        self.epsilon = epsilon
+        self._mean = None
+        self._var = None
+        self._count = 0
+
+    def _update_and_normalize(self, obs):
+        """Fold the batch ``obs`` (reduced over dim 0) into the running stats, then normalize it."""
+        # Stats are sized lazily from the first observation seen
+        if self._mean is None:
+            self._mean, self._var = torch.zeros_like(obs[0]), torch.ones_like(obs[0])
+        n_old, n_new = self._count, obs.shape[0]
+        n_all = n_old + n_new
+        mean_new = obs.mean(dim=0)
+        var_new = obs.var(dim=0, unbiased=False)
+        shift = mean_new - self._mean
+        # Merge the running moments with the batch moments (parallel-variance update)
+        m2 = self._var * n_old + var_new * n_new + shift**2 * n_old * n_new / n_all
+        self._mean = self._mean + shift * n_new / n_all
+        self._var = m2 / n_all
+        self._count = n_all
+        return (obs - self._mean) / (self._var + self.epsilon).sqrt()
+
+    def step(self, actions):
+        """Step the wrapped env and normalize the returned observation."""
+        obs, *rest = self.env.step(actions)
+        return self._update_and_normalize(obs), *rest
+
+    def reset(self, **kwargs):
+        """Reset the wrapped env and normalize the returned observation."""
+        obs, info = self.env.reset(**kwargs)
+        return self._update_and_normalize(obs), info
diff --git a/slm_lab/spec/benchmark_arc/ppo/ppo_playground.yaml b/slm_lab/spec/benchmark_arc/ppo/ppo_playground.yaml
new file mode 100644
index 000000000..0e02d636e
--- /dev/null
+++ b/slm_lab/spec/benchmark_arc/ppo/ppo_playground.yaml
@@ -0,0 +1,800 @@
+# PPO MuJoCo Playground — MJWarp GPU
+#
+# Variants:
+#   DM Control Suite (Phase 5.1):
+#     ppo_playground            — default (gamma=0.995, 16 epochs)
+#     ppo_playground_vnorm      — + normalize_v_targets=true (precision/dexterous envs)
+#     ppo_playground_fingerspin — FingerSpin: gamma=0.95 (official override)
+#     ppo_playground_pendulum   — PendulumSwingup: 4 epochs (official); action_repeat=4 in playground.py
+#     ppo_playground_humanoid   — Humanoid: wider policy (2x256), NormalTanh, constant LR, reward_scale=10
+#     ppo_playground_rs10       — + reward_scale=10.0 + constant LR (Brax default for ALL DM Control)
+#     ppo_playground_constlr    — + constant LR (no decay)
+#     ppo_playground_vnorm_constlr — + vnorm + constant LR
+#     ppo_playground_constlr_clip03 — + constant LR + clip_eps=0.3
+#     ppo_playground_vnorm_constlr_clip03 — + vnorm + constant LR + clip_eps=0.3
+#     ppo_playground_brax_policy — 4x32 Brax policy + constant LR + vnorm (RETIRED: underperformed)
+#
+#   Locomotion (Phase 5.2):
+#     ppo_playground_loco       — default loco (4x128 policy, 5x256 value, gamma=0.97, lr=3e-4 constant)
+#     ppo_playground_loco_go1   — Go1/G1/T1 joystick (512-256-128 both nets, clip=0.3)
+#     ppo_playground_loco_precise — G1/BerkeleyHumanoid/T1/Apollo (clip=0.2, entropy=0.005)
+#
+#   Manipulation (Phase 5.3):
+#     ppo_playground_manip      — Panda tasks (4x32 policy, gamma=0.97, epoch=8, th=10)
+#     ppo_playground_manip_aloha — Aloha bimanual (3x256 policy, entropy=0.02)
+#     ppo_playground_manip_aloha_peg — AlohaSinglePegInsertion (4x256, th=40, lr=3e-4)
+#     ppo_playground_manip_dexterous — Leap/Aero dexterous (512-256-128, lr=3e-4, th=40, gamma=0.99)
+#     ppo_playground_manip_robotiq — PandaRobotiqPushCube (4x64 policy, gamma=0.994, th=100, lr=6e-4)
+#
+# DM Control architecture: asymmetric policy=[64,64]+SiLU, value=[256,256,256]+SiLU
+# Loco architecture: policy=[128,128,128,128]+SiLU, value=[256,256,256,256,256]+SiLU
+#
+# Usage:
+#   slm-lab ... ppo_playground train -s env=playground/CartpoleBalance -s max_frame=100000000
+#   slm-lab ... ppo_playground_loco train -s env=playground/Go1Getup -s max_frame=100000000
+#   slm-lab ... ppo_playground_manip train -s env=playground/PandaPickCube -s max_frame=20000000
+#
+# Batch math:
+#   DM Control: 2048 envs x 30 steps = 61K, 15 minibatches at minibatch_size=4096 (30 at 2048), 16 epochs = 240 (480) grad steps
+#   Loco:       2048 envs x 20 steps = 41K, 32 minibatches,  4 epochs = 128 grad steps
+#   Manip:      2048 envs x 10 steps = 20K, varies by task
+#   Robotiq:    2048 envs x 100 steps = 205K, 32 minibatches, 8 epochs = 256 grad steps
+
+# --- Shared ---
+
+_policy_body: &policy_body
+  modules:
+    body:
+      Sequential:
+        - LazyLinear: {out_features: 64}
+        - SiLU:
+        - LazyLinear: {out_features: 64}
+        - SiLU:
+  graph:
+    input: x
+    modules:
+      body: [x]
+    output: body
+
+_value_body: &value_body
+  modules:
+    body:
+      Sequential:
+        - LazyLinear: {out_features: 256}
+        - SiLU:
+        - LazyLinear: {out_features: 256}
+        - SiLU:
+        - LazyLinear: {out_features: 256}
+        - SiLU:
+  graph:
+    input: x
+    modules:
+      body: [x]
+    output: body
+
+_memory: &memory
+  name: OnPolicyBatchReplay
+
+_meta: &meta
+  distributed: false
+  log_frequency: 100000
+  eval_frequency: 100000
+  max_session: 4
+  max_trial: 1
+
+_env: &env
+  name: "${env}"
+  max_t: null
+  max_frame: "${max_frame}"
+  normalize_obs: true
+
+_algorithm: &algorithm
+  name: PPO
+  action_pdtype: Normal
+  gamma: 0.99
+  lam: 0.95
+  clip_eps_spec:
+    name: no_decay
+    start_val: 0.2
+  entropy_coef_spec:
+    name: no_decay
+    start_val: 0.01
+  val_loss_coef: 0.5
+  minibatch_size: 4096
+  normalize_v_targets: false  # Brax default; some envs may need true (see docs/phase5_ops.md)
+
+_net: &net
+  type: TorchArcNet
+  actor_arc: *policy_body
+  critic_arc: *value_body
+  shared: false
+  hid_layers_activation: relu
+  init_fn: orthogonal_
+  clip_grad_val: 1.0
+  use_same_optim: false
+  loss_spec:
+    name: MSELoss
+  optim_spec:
+    name: Adam
+    lr: 1.0e-3
+    eps: 1.0e-5
+  lr_scheduler_spec:
+    name: LinearToMin
+    frame: "${max_frame}"
+    min_factor: 0.033
+  gpu: auto
+
+# --- DM Control: gamma=0.995, 16 epochs, 2048 envs ---
+
+ppo_playground:
+  agent:
+    name: PPO
+    algorithm:
+      <<: *algorithm
+      gamma: 0.995
+      time_horizon: 30
+      training_epoch: 16
+      minibatch_size: 2048
+    memory: *memory
+    net: *net
+  env:
+    <<: *env
+    num_envs: 2048
+  meta: *meta
+
+# --- FingerSpin: gamma=0.95 (official dm_control_suite_params.py override) ---
+
+ppo_playground_fingerspin:
+  agent:
+    name: PPO
+    algorithm:
+      <<: *algorithm
+      gamma: 0.95
+      time_horizon: 30
+      training_epoch: 16
+      minibatch_size: 4096
+    memory: *memory
+    net: *net
+  env:
+    <<: *env
+    num_envs: 2048
+  meta: *meta
+
+# --- PendulumSwingup: training_epoch=4 (official); action_repeat=4 handled in playground.py ---
+
+ppo_playground_pendulum:
+  agent:
+    name: PPO
+    algorithm:
+      <<: *algorithm
+      gamma: 0.995
+      time_horizon: 30
+      training_epoch: 4
+      minibatch_size: 4096
+    memory: *memory
+    net: *net
+  env:
+    <<: *env
+    num_envs: 2048
+  meta: *meta
+
+# --- DM Control + normalize_v_targets=true: for precision/dexterous envs ---
+# Use for: AcrobotSwingup, SwimmerSwimmer6, PointMass, FingerTurnEasy/Hard, FishSwim
+
+ppo_playground_vnorm:
+  agent:
+    name: PPO
+    algorithm:
+      <<: *algorithm
+      gamma: 0.995
+      time_horizon: 30
+      training_epoch: 16
+      minibatch_size: 2048
+      normalize_v_targets: true  # the only key that differs from ppo_playground
+    memory: *memory
+    net: *net  # `_net` defaults used unchanged
+  env:
+    <<: *env
+    num_envs: 2048
+  meta: *meta
+
+# --- Humanoid DM Control: wider policy (2x256), constant LR, reward_scale=10 ---
+# Humanoid has 21 DOF — needs wider policy than 2x64 for multi-joint coordination
+# Phase 3 solved Gymnasium Humanoid-v5 (2661 MA) with 2x256 policy + constant LR
+# Brax uses reward_scaling=10.0 for ALL DM Control envs (dm_control_suite_params.py)
+# Humanoid reward is multiplicative (standing*upright*move*control), all [0,1] — raw signal too small
+
+ppo_playground_humanoid:
+  agent:
+    name: PPO
+    algorithm:
+      <<: *algorithm
+      action_pdtype: NormalTanh  # Brax stores pre-tanh actions; avoids unstable atanh in 21-DOF space
+      gamma: 0.995
+      time_horizon: 30
+      training_epoch: 16
+      minibatch_size: 2048
+      normalize_v_targets: true
+    memory: *memory
+    net:
+      <<: *net  # shallow merge: actor_arc and lr_scheduler_spec below replace the `_net` values; critic_arc stays *value_body
+      actor_arc:  # policy body: 2 x (LazyLinear(256) + SiLU)
+        modules:
+          body:
+            Sequential:
+              - LazyLinear: {out_features: 256}
+              - SiLU:
+              - LazyLinear: {out_features: 256}
+              - SiLU:
+        graph:
+          input: x
+          modules:
+            body: [x]
+          output: body
+      lr_scheduler_spec: null  # constant LR — Brax default, Phase 3 used constant
+  env:
+    <<: *env
+    num_envs: 2048
+    reward_scale: 10.0  # Brax default for DM Control — critical for Humanoid's tiny rewards
+  meta: *meta
+
+# --- reward_scale=10.0 (plus constant LR): Brax default for ALL DM Control envs ---
+# Research: dm_control_suite_params.py applies reward_scaling=10.0 universally.
+# Previously only ppo_playground_humanoid had this. Test on all underperforming envs. Also disables the LR scheduler.
+
+ppo_playground_rs10:
+  agent:
+    name: PPO
+    algorithm:
+      <<: *algorithm
+      gamma: 0.995
+      time_horizon: 30
+      training_epoch: 16
+      minibatch_size: 2048
+    memory: *memory
+    net:
+      <<: *net  # shallow merge: only lr_scheduler_spec is replaced
+      lr_scheduler_spec: null  # constant LR — Brax default
+  env:
+    <<: *env
+    num_envs: 2048
+    reward_scale: 10.0  # same value ppo_playground_humanoid sets
+  meta: *meta
+
+# --- Constant LR variants: test Brax default (no LR decay) in isolation ---
+
+ppo_playground_constlr:
+  agent:
+    name: PPO
+    algorithm:
+      <<: *algorithm
+      gamma: 0.995
+      time_horizon: 30
+      training_epoch: 16
+      minibatch_size: 4096  # NOTE(review): ppo_playground uses 2048, so this is not a pure LR-schedule ablation — confirm
+    memory: *memory
+    net:
+      <<: *net  # shallow merge: only lr_scheduler_spec is replaced
+      lr_scheduler_spec: null  # constant LR — Brax default
+  env:
+    <<: *env
+    num_envs: 2048
+  meta: *meta
+
+ppo_playground_vnorm_constlr:  # same keys as ppo_playground_vnorm, plus the scheduler override
+  agent:
+    name: PPO
+    algorithm:
+      <<: *algorithm
+      gamma: 0.995
+      time_horizon: 30
+      training_epoch: 16
+      minibatch_size: 2048
+      normalize_v_targets: true
+    memory: *memory
+    net:
+      <<: *net  # shallow merge: only lr_scheduler_spec is replaced
+      lr_scheduler_spec: null  # constant LR — Brax default
+  env:
+    <<: *env
+    num_envs: 2048
+  meta: *meta
+
+# --- Constant LR + clip_eps=0.3: both Brax defaults, tested together ---
+
+ppo_playground_constlr_clip03:  # ppo_playground plus two overrides: clip_eps_spec and lr_scheduler_spec
+  agent:
+    name: PPO
+    algorithm:
+      <<: *algorithm
+      gamma: 0.995
+      time_horizon: 30
+      training_epoch: 16
+      minibatch_size: 2048
+      clip_eps_spec:  # replaces the merged mapping wholesale, so `name` is repeated
+        name: no_decay
+        start_val: 0.3
+    memory: *memory
+    net:
+      <<: *net  # shallow merge: only lr_scheduler_spec is replaced
+      lr_scheduler_spec: null  # constant LR — Brax default
+  env:
+    <<: *env
+    num_envs: 2048
+  meta: *meta
+
+ppo_playground_vnorm_constlr_clip03:  # ppo_playground_constlr_clip03 with normalize_v_targets enabled
+  agent:
+    name: PPO
+    algorithm:
+      <<: *algorithm
+      gamma: 0.995
+      time_horizon: 30
+      training_epoch: 16
+      minibatch_size: 2048
+      normalize_v_targets: true
+      clip_eps_spec:  # replaces the merged mapping wholesale, so `name` is repeated
+        name: no_decay
+        start_val: 0.3
+    memory: *memory
+    net:
+      <<: *net  # shallow merge: only lr_scheduler_spec is replaced
+      lr_scheduler_spec: null  # constant LR — Brax default
+  env:
+    <<: *env
+    num_envs: 2048
+  meta: *meta
+
+# --- Brax-matched policy (4x32): deeper narrower policy matching Brax default ---
+
+ppo_playground_brax_policy:  # besides the 4x32 policy, this spec also sets normalize_v_targets and a constant LR
+  agent:
+    name: PPO
+    algorithm:
+      <<: *algorithm
+      gamma: 0.995
+      time_horizon: 30
+      training_epoch: 16
+      minibatch_size: 2048
+      normalize_v_targets: true
+    memory: *memory
+    net:
+      <<: *net  # shallow merge: actor_arc and lr_scheduler_spec below replace the `_net` values; critic_arc stays *value_body
+      actor_arc:  # policy body: 4 x (LazyLinear(32) + SiLU)
+        modules:
+          body:
+            Sequential:
+              - LazyLinear: {out_features: 32}
+              - SiLU:
+              - LazyLinear: {out_features: 32}
+              - SiLU:
+              - LazyLinear: {out_features: 32}
+              - SiLU:
+              - LazyLinear: {out_features: 32}
+              - SiLU:
+        graph:
+          input: x
+          modules:
+            body: [x]
+          output: body
+      lr_scheduler_spec: null  # constant LR — Brax default
+  env:
+    <<: *env
+    num_envs: 2048
+  meta: *meta
+
+# --- Locomotion: official Brax defaults (gamma=0.97, lr=3e-4 constant, clip=0.3) ---
+# Policy: 4x128, Value: 5x256 (official default for most locomotion envs)
+# Use for: BarkourJoystick, H1*, Op3, Spot* (default-config envs)
+# num_envs=2048 — official uses 8192; all Phase 5.2 benchmark runs used 2048
+
+ppo_playground_loco:
+  agent:
+    name: PPO
+    algorithm:
+      <<: *algorithm
+      gamma: 0.97
+      time_horizon: 20
+      training_epoch: 4
+      minibatch_size: 4096
+      clip_eps_spec:  # replaces the merged mapping wholesale, so `name` is repeated
+        name: no_decay
+        start_val: 0.3
+    memory: *memory
+    net:
+      <<: *net  # shallow merge: the four keys below replace the `_net` values wholesale
+      actor_arc:  # policy body: 4 x (LazyLinear(128) + SiLU)
+        modules:
+          body:
+            Sequential:
+              - LazyLinear: {out_features: 128}
+              - SiLU:
+              - LazyLinear: {out_features: 128}
+              - SiLU:
+              - LazyLinear: {out_features: 128}
+              - SiLU:
+              - LazyLinear: {out_features: 128}
+              - SiLU:
+        graph:
+          input: x
+          modules:
+            body: [x]
+          output: body
+      critic_arc:  # value body: 5 x (LazyLinear(256) + SiLU)
+        modules:
+          body:
+            Sequential:
+              - LazyLinear: {out_features: 256}
+              - SiLU:
+              - LazyLinear: {out_features: 256}
+              - SiLU:
+              - LazyLinear: {out_features: 256}
+              - SiLU:
+              - LazyLinear: {out_features: 256}
+              - SiLU:
+              - LazyLinear: {out_features: 256}
+              - SiLU:
+        graph:
+          input: x
+          modules:
+            body: [x]
+          output: body
+      optim_spec:  # whole mapping is replaced, so name/eps are repeated; only lr differs from `_net` (1.0e-3)
+        name: Adam
+        lr: 3.0e-4
+        eps: 1.0e-5
+      lr_scheduler_spec: null  # constant LR — Brax default
+  env:
+    <<: *env
+    num_envs: 2048
+  meta: *meta
+
+# --- Locomotion Go1/G1/T1: 512-256-128 both nets ---
+# Use for: Go1Joystick*, Go1Getup, Go1Handstand, Go1Footstand, Go1Backflip, G1*, T1*  (NOTE(review): ppo_playground_loco_precise also lists G1/T1 — confirm which spec applies)
+# These envs provide privileged_state obs (flattened into obs alongside policy state)
+# num_envs=2048 — official uses 8192; all Phase 5.2 benchmark runs used 2048
+
+ppo_playground_loco_go1:
+  agent:
+    name: PPO
+    algorithm:
+      <<: *algorithm
+      gamma: 0.97
+      time_horizon: 20
+      training_epoch: 4
+      minibatch_size: 4096
+      clip_eps_spec:  # replaces the merged mapping wholesale, so `name` is repeated
+        name: no_decay
+        start_val: 0.3
+    memory: *memory
+    net:
+      <<: *net  # shallow merge: the four keys below replace the `_net` values wholesale
+      actor_arc:  # policy body: LazyLinear 512 -> 256 -> 128, each followed by SiLU
+        modules:
+          body:
+            Sequential:
+              - LazyLinear: {out_features: 512}
+              - SiLU:
+              - LazyLinear: {out_features: 256}
+              - SiLU:
+              - LazyLinear: {out_features: 128}
+              - SiLU:
+        graph:
+          input: x
+          modules:
+            body: [x]
+          output: body
+      critic_arc:  # value body: same 512 -> 256 -> 128 layout as the policy
+        modules:
+          body:
+            Sequential:
+              - LazyLinear: {out_features: 512}
+              - SiLU:
+              - LazyLinear: {out_features: 256}
+              - SiLU:
+              - LazyLinear: {out_features: 128}
+              - SiLU:
+        graph:
+          input: x
+          modules:
+            body: [x]
+          output: body
+      optim_spec:  # whole mapping is replaced, so name/eps are repeated; only lr differs from `_net` (1.0e-3)
+        name: Adam
+        lr: 3.0e-4
+        eps: 1.0e-5
+      lr_scheduler_spec: null  # constant LR — Brax default
+  env:
+    <<: *env
+    num_envs: 2048
+  meta: *meta
+
+# --- Locomotion precise: G1, BerkeleyHumanoid, T1, Apollo (clip=0.2, entropy=0.005) ---
+
+ppo_playground_loco_precise:  # same networks and optimizer as ppo_playground_loco; only clip and entropy differ
+  agent:
+    name: PPO
+    algorithm:
+      <<: *algorithm
+      gamma: 0.97
+      time_horizon: 20
+      training_epoch: 4
+      minibatch_size: 4096
+      clip_eps_spec:
+        name: no_decay
+        start_val: 0.2  # ppo_playground_loco uses 0.3
+      entropy_coef_spec:
+        name: no_decay
+        start_val: 0.005  # ppo_playground_loco does not override the entropy coefficient
+    memory: *memory
+    net:
+      <<: *net  # shallow merge: the four keys below replace the `_net` values wholesale
+      actor_arc:  # policy body: 4 x (LazyLinear(128) + SiLU)
+        modules:
+          body:
+            Sequential:
+              - LazyLinear: {out_features: 128}
+              - SiLU:
+              - LazyLinear: {out_features: 128}
+              - SiLU:
+              - LazyLinear: {out_features: 128}
+              - SiLU:
+              - LazyLinear: {out_features: 128}
+              - SiLU:
+        graph:
+          input: x
+          modules:
+            body: [x]
+          output: body
+      critic_arc:  # value body: 5 x (LazyLinear(256) + SiLU)
+        modules:
+          body:
+            Sequential:
+              - LazyLinear: {out_features: 256}
+              - SiLU:
+              - LazyLinear: {out_features: 256}
+              - SiLU:
+              - LazyLinear: {out_features: 256}
+              - SiLU:
+              - LazyLinear: {out_features: 256}
+              - SiLU:
+              - LazyLinear: {out_features: 256}
+              - SiLU:
+        graph:
+          input: x
+          modules:
+            body: [x]
+          output: body
+      optim_spec:  # whole mapping is replaced, so name/eps are repeated; only lr differs from `_net` (1.0e-3)
+        name: Adam
+        lr: 3.0e-4
+        eps: 1.0e-5
+      lr_scheduler_spec: null  # replaces the LinearToMin scheduler from `_net`
+  env:
+    <<: *env
+    num_envs: 2048
+  meta: *meta
+
+# --- Manipulation: Panda tasks (4x32 policy, epoch=8, th=10, entropy=0.02) ---
+
+ppo_playground_manip:
+  agent:
+    name: PPO
+    algorithm:
+      <<: *algorithm
+      gamma: 0.97
+      time_horizon: 10
+      training_epoch: 8
+      minibatch_size: 4096
+      entropy_coef_spec:  # replaces the merged mapping wholesale, so `name` is repeated
+        name: no_decay
+        start_val: 0.02
+    memory: *memory
+    net:
+      <<: *net  # shallow merge: keys below replace the `_net` values; critic_arc stays *value_body
+      actor_arc:  # policy body: 4 x (LazyLinear(32) + SiLU)
+        modules:
+          body:
+            Sequential:
+              - LazyLinear: {out_features: 32}
+              - SiLU:
+              - LazyLinear: {out_features: 32}
+              - SiLU:
+              - LazyLinear: {out_features: 32}
+              - SiLU:
+              - LazyLinear: {out_features: 32}
+              - SiLU:
+        graph:
+          input: x
+          modules:
+            body: [x]
+          output: body
+      optim_spec:  # identical to the `_net` optim_spec (Adam, lr 1.0e-3, eps 1.0e-5); kept explicit
+        name: Adam
+        lr: 1.0e-3
+        eps: 1.0e-5
+      lr_scheduler_spec: null  # replaces the LinearToMin scheduler from `_net`
+  env:
+    <<: *env
+    num_envs: 2048
+  meta: *meta
+
+# --- Manipulation: Aloha bimanual (3x256 policy, entropy=0.02) ---
+
+ppo_playground_manip_aloha:
+  agent:
+    name: PPO
+    algorithm:
+      <<: *algorithm
+      gamma: 0.97
+      time_horizon: 15
+      training_epoch: 8
+      minibatch_size: 4096
+      entropy_coef_spec:  # replaces the merged mapping wholesale, so `name` is repeated
+        name: no_decay
+        start_val: 0.02
+    memory: *memory
+    net:
+      <<: *net  # shallow merge: keys below replace the `_net` values; critic_arc stays *value_body
+      actor_arc:  # policy body: 3 x (LazyLinear(256) + SiLU)
+        modules:
+          body:
+            Sequential:
+              - LazyLinear: {out_features: 256}
+              - SiLU:
+              - LazyLinear: {out_features: 256}
+              - SiLU:
+              - LazyLinear: {out_features: 256}
+              - SiLU:
+        graph:
+          input: x
+          modules:
+            body: [x]
+          output: body
+      optim_spec:  # identical to the `_net` optim_spec (Adam, lr 1.0e-3, eps 1.0e-5); kept explicit
+        name: Adam
+        lr: 1.0e-3
+        eps: 1.0e-5
+      lr_scheduler_spec: null  # replaces the LinearToMin scheduler from `_net`
+  env:
+    <<: *env
+    num_envs: 2048
+  meta: *meta
+
+# --- Manipulation: AlohaSinglePegInsertion (4x256 policy, th=40, lr=3e-4, entropy=0.01) ---
+# Official config differs significantly from AlohaHandOver: deeper policy, lower lr/entropy, longer horizon
+
+ppo_playground_manip_aloha_peg:
+  agent:
+    name: PPO
+    algorithm:
+      <<: *algorithm  # no entropy_coef_spec override here, so the merged anchor's value applies
+      gamma: 0.97
+      time_horizon: 40
+      training_epoch: 8
+      minibatch_size: 4096
+    memory: *memory
+    net:
+      <<: *net  # shallow merge: keys below replace the `_net` values; critic_arc stays *value_body
+      actor_arc:  # policy body: 4 x (LazyLinear(256) + SiLU)
+        modules:
+          body:
+            Sequential:
+              - LazyLinear: {out_features: 256}
+              - SiLU:
+              - LazyLinear: {out_features: 256}
+              - SiLU:
+              - LazyLinear: {out_features: 256}
+              - SiLU:
+              - LazyLinear: {out_features: 256}
+              - SiLU:
+        graph:
+          input: x
+          modules:
+            body: [x]
+          output: body
+      optim_spec:  # whole mapping is replaced, so name/eps are repeated; only lr differs from `_net` (1.0e-3)
+        name: Adam
+        lr: 3.0e-4
+        eps: 1.0e-5
+      lr_scheduler_spec: null  # replaces the LinearToMin scheduler from `_net`
+  env:
+    <<: *env
+    num_envs: 2048
+  meta: *meta
+
+# --- Manipulation: Leap/Aero dexterous (512-256-128, lr=3e-4, th=40, gamma=0.99) ---
+# Official uses gamma=0.99 (not 0.97) for LeapCube and AeroCube envs
+
+ppo_playground_manip_dexterous:
+  agent:
+    name: PPO
+    algorithm:
+      <<: *algorithm
+      gamma: 0.99
+      time_horizon: 40
+      training_epoch: 4
+      minibatch_size: 4096
+    memory: *memory
+    net:
+      <<: *net  # shallow merge: the four keys below replace the `_net` values wholesale
+      actor_arc:  # policy body: LazyLinear 512 -> 256 -> 128, each followed by SiLU
+        modules:
+          body:
+            Sequential:
+              - LazyLinear: {out_features: 512}
+              - SiLU:
+              - LazyLinear: {out_features: 256}
+              - SiLU:
+              - LazyLinear: {out_features: 128}
+              - SiLU:
+        graph:
+          input: x
+          modules:
+            body: [x]
+          output: body
+      critic_arc:  # value body: same 512 -> 256 -> 128 layout as the policy
+        modules:
+          body:
+            Sequential:
+              - LazyLinear: {out_features: 512}
+              - SiLU:
+              - LazyLinear: {out_features: 256}
+              - SiLU:
+              - LazyLinear: {out_features: 128}
+              - SiLU:
+        graph:
+          input: x
+          modules:
+            body: [x]
+          output: body
+      optim_spec:  # whole mapping is replaced, so name/eps are repeated; only lr differs from `_net` (1.0e-3)
+        name: Adam
+        lr: 3.0e-4
+        eps: 1.0e-5
+      lr_scheduler_spec: null  # replaces the LinearToMin scheduler from `_net`
+  env:
+    <<: *env
+    num_envs: 2048
+  meta: *meta
+
+# --- Manipulation: PandaRobotiqPushCube (4x64 policy, gamma=0.994, th=100, lr=6e-4) ---
+
+ppo_playground_manip_robotiq:
+  agent:
+    name: PPO
+    algorithm:
+      <<: *algorithm
+      gamma: 0.994
+      time_horizon: 100
+      training_epoch: 8
+      minibatch_size: 4096
+    memory: *memory
+    net:
+      <<: *net  # shallow merge: keys below replace the `_net` values; critic_arc stays *value_body
+      actor_arc:  # policy body: 4 x (LazyLinear(64) + SiLU)
+        modules:
+          body:
+            Sequential:
+              - LazyLinear: {out_features: 64}
+              - SiLU:
+              - LazyLinear: {out_features: 64}
+              - SiLU:
+              - LazyLinear: {out_features: 64}
+              - SiLU:
+              - LazyLinear: {out_features: 64}
+              - SiLU:
+        graph:
+          input: x
+          modules:
+            body: [x]
+          output: body
+      optim_spec:  # whole mapping is replaced, so name/eps are repeated; only lr differs from `_net` (1.0e-3)
+        name: Adam
+        lr: 6.0e-4
+        eps: 1.0e-5
+      lr_scheduler_spec: null  # replaces the LinearToMin scheduler from `_net`
+  env:
+    <<: *env
+    num_envs: 2048
+  meta: *meta
diff --git a/test/env/test_playground.py b/test/env/test_playground.py
new file mode 100644
index 000000000..1416d7356
--- /dev/null
+++ b/test/env/test_playground.py
@@ -0,0 +1,225 @@
+"""Tests for MuJoCo Playground integration."""
+
+from unittest.mock import MagicMock, patch
+
+import gymnasium as gym
+from gymnasium import spaces
+import numpy as np
+import pytest
+
+
+# ============================================================================
+# PlaygroundVecEnv tests (require mujoco_playground)
+# ============================================================================
+
+
+class TestPlaygroundVecEnv:
+    """Tests for PlaygroundVecEnv with live mujoco_playground."""
+
+    @pytest.fixture(autouse=True)
+    def check_playground_available(self):  # autouse: skips every test in this class when the package is absent
+        pytest.importorskip("mujoco_playground")
+
+    @pytest.fixture
+    def env(self):  # yields a 4-env CartpoleBalance instance and closes it after the test
+        from slm_lab.env.playground import PlaygroundVecEnv
+
+        env = PlaygroundVecEnv("CartpoleBalance", num_envs=4)
+        yield env
+        env.close()
+
+    def test_instantiation(self, env):
+        assert env.num_envs == 4
+
+    def test_spaces(self, env):
+        assert env.single_observation_space is not None
+        assert env.single_action_space is not None
+        obs_dim = env.single_observation_space.shape[0]
+        act_dim = env.single_action_space.shape[0]
+        assert obs_dim > 0
+        assert act_dim > 0
+        # Batched spaces should have num_envs in first dim
+        assert env.observation_space.shape == (4, obs_dim)
+        assert env.action_space.shape == (4, act_dim)
+
+    def test_reset(self, env):
+        obs, info = env.reset()
+        assert isinstance(obs, np.ndarray)
+        assert obs.shape == (4, env.single_observation_space.shape[0])
+        assert obs.dtype == np.float32
+        assert isinstance(info, dict)
+
+    def test_step(self, env):
+        env.reset()
+        actions = np.random.default_rng(0).uniform(-1, 1, size=env.action_space.shape).astype(np.float32)  # seeded so a failure reproduces
+        obs, rewards, terminated, truncated, info = env.step(actions)
+
+        assert obs.shape == (4, env.single_observation_space.shape[0])
+        assert obs.dtype == np.float32
+        assert rewards.shape == (4,)
+        assert rewards.dtype == np.float32
+        assert terminated.shape == (4,)
+        assert terminated.dtype == bool
+        assert truncated.shape == (4,)
+        assert truncated.dtype == bool
+        assert isinstance(info, dict)
+
+    def test_reset_with_seed(self, env):  # two resets with the same seed must give identical observations
+        obs1, _ = env.reset(seed=42)
+        obs2, _ = env.reset(seed=42)
+        np.testing.assert_array_equal(obs1, obs2)
+
+    def test_multiple_steps(self, env):
+        env.reset()
+        for step_idx in range(10):
+            actions = np.random.default_rng(step_idx).uniform(-1, 1, size=env.action_space.shape).astype(np.float32)  # per-step seed: varied but reproducible
+            obs, rewards, terminated, truncated, info = env.step(actions)
+            assert obs.shape[0] == 4
+
+
+# ============================================================================
+# make_env routing tests (mocked — no mujoco_playground needed)
+# ============================================================================
+
+
+class TestMakeEnvPlaygroundRouting:
+    """Test that make_env routes playground/ envs to _make_playground_env."""
+
+    def test_playground_prefix_routes_correctly(self):
+        spec = {
+            "agent": {"algorithm": {"gamma": 0.99}},
+            "env": {
+                "name": "playground/CartpoleBalance",
+                "num_envs": 4,
+                "max_frame": 100000,
+            },
+            "meta": {
+                "distributed": False,
+                "eval_frequency": 5000,
+                "log_frequency": 5000,
+                "max_session": 1,
+            },
+        }
+
+        with patch("slm_lab.env._make_playground_env") as mock_pg:  # stub the playground factory so no real env is built
+            # Create a mock env with real gymnasium spaces
+            obs_space = spaces.Box(low=-np.inf, high=np.inf, shape=(5,), dtype=np.float32)
+            act_space = spaces.Box(low=-1.0, high=1.0, shape=(1,), dtype=np.float32)
+            mock_env = MagicMock(spec=gym.vector.VectorEnv)
+            mock_env.num_envs = 4
+            mock_env.is_venv = True
+            mock_env.single_observation_space = obs_space
+            mock_env.single_action_space = act_space
+            mock_env.observation_space = obs_space  # the unbatched space is reused for the batched attribute
+            mock_env.action_space = act_space  # the unbatched space is reused for the batched attribute
+            mock_env.spec = None
+            mock_pg.return_value = mock_env
+
+            from slm_lab.env import make_env
+
+            make_env(spec)
+            mock_pg.assert_called_once()
+            call_args = mock_pg.call_args
+            assert call_args[0][0] == "playground/CartpoleBalance"  # first positional arg: the env name from the spec
+            assert call_args[0][1] == 4  # second positional arg: num_envs from the spec
+
+    def test_non_playground_does_not_route(self):
+        spec = {
+            "agent": {"algorithm": {"gamma": 0.99}},
+            "env": {
+                "name": "CartPole-v1",
+                "num_envs": 1,
+                "max_frame": 1000,
+            },
+            "meta": {
+                "distributed": False,
+                "eval_frequency": 1000,
+                "log_frequency": 1000,
+                "max_session": 1,
+            },
+        }
+
+        with patch("slm_lab.env._make_playground_env") as mock_pg:  # only the playground factory is stubbed; make_env itself runs
+            from slm_lab.env import make_env
+
+            env = make_env(spec)
+            mock_pg.assert_not_called()
+            env.close()
+
+
+# ============================================================================
+# PlaygroundVecEnv impl detection tests (require mujoco_playground)
+# ============================================================================
+
+
+class TestPlaygroundImplDetection:
+    """Test that PlaygroundVecEnv selects the right impl based on hardware."""
+
+    @pytest.fixture(autouse=True)
+    def check_playground_available(self):  # autouse: skips every test in this class when the package is absent
+        pytest.importorskip("mujoco_playground")
+
+    def test_impl_is_warp_on_cuda(self):
+        """On CUDA GPU, impl should be 'warp'."""
+        import jax
+
+        if not any(d.platform == "gpu" for d in jax.devices()):
+            pytest.skip("No CUDA GPU available")
+        import slm_lab.env.playground as pg_module
+
+        assert pg_module._impl == "warp"
+
+        from slm_lab.env.playground import PlaygroundVecEnv
+
+        env = PlaygroundVecEnv("CartpoleBalance", num_envs=2)  # smoke check: construction must not raise; nothing else is asserted
+        env.close()
+
+    def test_impl_is_jax_on_cpu(self):
+        """On CPU (no CUDA), impl should be 'jax'."""
+        import jax
+
+        if any(d.platform == "gpu" for d in jax.devices()):
+            pytest.skip("CUDA GPU present — test is for CPU only")
+        import slm_lab.env.playground as pg_module
+
+        assert pg_module._impl == "jax"
+
+        from slm_lab.env.playground import PlaygroundVecEnv
+
+        env = PlaygroundVecEnv("CartpoleBalance", num_envs=2)  # smoke check: construction must not raise; nothing else is asserted
+        env.close()
+
+    def test_config_overrides_matches_impl(self):
+        """_config_overrides dict must reflect the selected impl."""
+        import slm_lab.env.playground as pg_module
+
+        assert pg_module._config_overrides == {"impl": pg_module._impl}  # exact equality: no other override keys are allowed
+
+    def test_impl_is_consistent_with_cuda_flag(self):
+        """_impl and _has_cuda must agree: warp iff CUDA present."""
+        import slm_lab.env.playground as pg_module
+
+        if pg_module._has_cuda:  # reads the module's own flag, not jax.devices() as the tests above do
+            assert pg_module._impl == "warp"
+        else:
+            assert pg_module._impl == "jax"
+
+
+# ============================================================================
+# Import guard tests
+# ============================================================================
+
+
+class TestImportGuard:
+    """Test that slm_lab.env imports cleanly and exposes its env factory functions."""
+
+    def test_env_module_imports_without_playground(self):
+        """Importing slm_lab.env must succeed and expose make_env and _make_playground_env.
+
+        Relies on the playground import being lazy (inside _make_playground_env).
+        NOTE(review): mujoco_playground is not hidden or mocked here — confirm the missing-package case is covered.
+        """
+        import slm_lab.env
+
+        assert hasattr(slm_lab.env, "make_env")
+        assert hasattr(slm_lab.env, "_make_playground_env")