GaTech-RL2 · aidang3019 · Jun 17, 2026
diff --git a/egomimic/algo/pi.py b/egomimic/algo/pi.py
diff --git a/egomimic/eval/eval_pi.py b/egomimic/eval/eval_pi.py
@@ -42,6 +42,15 @@ def compute_metrics_and_viz(self, batch):
                 total_loss = total_loss + loss_val
                 n_loss_embodiments += 1
 
+            # Subtask-prediction (hierarchical) validation signals: CE loss and
+            # teacher-forced token accuracy. Computed by forward_eval; surfaced here.
+            sub_loss_key = f"{embodiment_name}_subtask_loss"
+            if sub_loss_key in preds:
+                metrics[f"Valid/{sub_loss_key}"] = preds[sub_loss_key]
+            sub_acc_key = f"{embodiment_name}_subtask_acc"
+            if sub_acc_key in preds:
+                metrics[f"Valid/{sub_acc_key}"] = preds[sub_acc_key]
+
             if pred_key in preds:
                 metrics[f"Valid/{pred_key}_paired_mse_avg"] = mse(
                     preds[pred_key].cpu(), _batch[ac_key].cpu()

diff --git a/egomimic/hydra_configs/data/sort_pi_subtask.yaml b/egomimic/hydra_configs/data/sort_pi_subtask.yaml
@@ -0,0 +1,59 @@
+defaults:
+  - cotrain_pi_lang_6d
+  - _self_
+
+# Sort-regime data for hierarchical subtask prediction.
+#
+# Inherits `cotrain_pi_lang_6d` (NOT `cotrain_pi_lang`): the model config
+# `pi0.5_subtask` uses action_encoding=cartesian_normalized_rot6d, which expects
+# the xyz+6D(+gripper) action layout produced by transform mode `cartesian_6d`.
+# Inheriting the plain `cotrain_pi_lang` (ypr) parent would crash in
+# to32_norm_6d on the first batch.
+#
+# Two changes vs. the parent:
+#   1. resolver key_map registers a high-level view of the annotation array
+#      (`annotations_high`), filtered to the `level == "high"` sort goals the
+#      SortConverter writes. The primary `annotations` key is pinned to
+#      `level == "low"` (pick-and-place) inside get_keymap when
+#      `high_annotation_key` is set.
+#   2. filters select both sort episodes (task == 'sort'), which carry the
+#      high-level "Sorting" track and so give a real high != low training
+#      signal, and pick_place episodes (high == low via the use-as-both
+#      fallback). Without this override the inherited filter is
+#      `task == 'pick_place'` only, so no episode would have a high-level track
+#      and the subtask objective would degenerate to high == low for every frame.
+#
+# Pair with model config `pi0.5_subtask`.
+train_datasets:
+  eva_bimanual:
+    resolver:
+      key_map:
+        high_annotation_key: annotations_high
+    filters:
+      _target_: egomimic.rldb.filters.ScaleAnnotationDatasetFilter
+      project_name: "dense-language"
+      filter_lambdas:
+        - "lambda row: (row.get('embodiment') == 'eva_bimanual') & (row['task'] in ('sort', 'pick_place')) & (row['zarr_processed_path'] != '')"
+  aria_bimanual:
+    resolver:
+      key_map:
+        high_annotation_key: annotations_high
+    filters:
+      _target_: egomimic.rldb.filters.ScaleAnnotationDatasetFilter
+      project_name: "dense-language"
+      filter_lambdas:
+        - "lambda row: (row.get('embodiment') == 'aria_bimanual') & (row['task'] in ('sort', 'pick_place')) & (row['zarr_processed_path'] != '')"
+
+valid_datasets:
+  eva_bimanual:
+    filters:
+      _target_: egomimic.rldb.filters.ScaleAnnotationDatasetFilter
+      project_name: "dense-language"
+      filter_lambdas:
+        - "lambda row: row.get('embodiment') == 'eva_bimanual' and row['task'] in ('sort', 'pick_place') and row['zarr_processed_path'] != '' and 'alignment' not in (row.get('task_description') or '')"
+  aria_bimanual:
+    filters:
+      _target_: egomimic.rldb.filters.ScaleAnnotationDatasetFilter
+      project_name: "dense-language"
+      filter_lambdas:
+        - "lambda row: row.get('embodiment') == 'aria_bimanual' and row['task'] in ('sort', 'pick_place') and row['zarr_processed_path'] != '' and 'alignment' not in (row.get('task_description') or '')"
diff --git a/egomimic/hydra_configs/evaluator/viz/cotrain_lang.yaml b/egomimic/hydra_configs/evaluator/viz/cotrain_lang.yaml
@@ -8,7 +8,9 @@ eva_bimanual:
 aria_bimanual:
   _target_: egomimic.rldb.embodiment.human.Aria.viz_gt_preds
   _partial_: true
-  image_key: observations.images.front_img_1
+  # cotrain_pi_lang uses keymap_mode=cartesian_pi, which names the front image
+  # `base_0_rgb` (not the `cartesian`-style observations.images.front_img_1).
+  image_key: base_0_rgb
   action_key: actions_cartesian
   mode: traj
   annotation_key: annotations

diff --git a/egomimic/hydra_configs/model/pi0.5_subtask.yaml b/egomimic/hydra_configs/model/pi0.5_subtask.yaml
@@ -0,0 +1,26 @@
+defaults:
+  - pi0.5_cotrain_eva_aria_6d
+  - _self_
+
+# PI0.5 with hierarchical subtask prediction (sort regime). The model conditions
+# on the HIGH-level sort instruction and predicts the LOW-level pick-and-place
+# instruction as language tokens; this adds an auxiliary cross-entropy loss on
+# top of the action flow-matching loss. At inference the subtask is decoded
+# autoregressively and the action expert is conditioned on the realized
+# [high + subtask] prefix. See egomimic/models/pi0_subtask.py and
+# egomimic/algo/pi.py.
+#
+# Pair with data config `sort_pi_subtask` (registers the `annotations_high`
+# high-level view of the annotation array).
+robomimic_model:
+  subtask_prediction: true
+  # High-level (sort) goal conditions the model; low-level (pick-and-place) is
+  # the prediction target. Both read the same zarr `annotations` array, split by
+  # the `level` tag the SortConverter writes (see get_keymap high_annotation_key).
+  annotation_key: "annotations_high"
+  subtask_key: "annotations"
+  subtask_anchor: "Subtask: "
+  # Relative weight of the subtask CE loss vs. the action flow-matching loss.
+  subtask_loss_weight: 1.0
+  # Max autoregressively-decoded subtask length (tokens) at inference.
+  max_subtask_len: 48