Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 1 addition & 1 deletion CLAUDE.md
Original file line number Diff line number Diff line change
Expand Up @@ -4,7 +4,7 @@ This file provides guidance to Claude Code (claude.ai/code) when working with co

## Environment

- **You are on a shared SLURM cluster.** Do NOT run anything GPU- or CPU-intensive yourself (no training, no eval, no large data conversions, no full dataset loads, no heavy `pytest` runs that spin up models or pull data). Defer to the user to actually execute those commands — your job is to prepare the command and explain it. Lightweight read-only work (lint, type checks, small unit tests, file edits, single-file syntax checks) is fine on the login node.
- **You are on a shared SLURM cluster.** Do not run anything GPU- or CPU-intensive yourself unless told to (no training, no eval, no large data conversions, no full dataset loads, no heavy `pytest` runs that spin up models or pull data). Defer to the user to actually execute those commands — your job is to prepare the command and explain it. Lightweight read-only work (lint, type checks, small unit tests, file edits, single-file syntax checks) is fine on the login node.
- Python 3.11. Activate the project venv before any Python tooling: `source emimic/bin/activate`.
- Package is installed editable as `egomimic` (see `pyproject.toml`). Linting is `ruff` via pre-commit.
- AWS/Cloudflare R2 credentials are required for SQL episode registry + data download. Bootstrap with `aws configure` then `./egomimic/utils/aws/setup_secret.sh` (writes `~/.egoverse_env`). `load_env()` from `egomimic.utils.aws.aws_data_utils` is called automatically at the top of `trainHydra.py`.
Expand Down
59 changes: 59 additions & 0 deletions egomimic/eval/eval_train_viz.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,59 @@
"""Train-set visualization evaluator.

Wraps a concrete EvalVideo (HPTEvalVideo, PIEvalVideo, ...) so the same
forward/metric/viz logic can run a second time against a separate
``train_viz`` dataloader. Videos go to ``<root>/videos_train_viz/`` and
metric keys are prefixed with ``train_viz/`` so they don't collide with the
canonical ``Valid/...`` keys.

Instantiated via Hydra from a config like
``hydra_configs/evaluator/train_viz_pi.yaml``.
"""

from __future__ import annotations

import os

from egomimic.eval.eval_video import EvalVideo


class TrainVizEvalVideo(EvalVideo):
    """Evaluator wrapper that reruns a base ``EvalVideo`` for train-set viz.

    Metric and visualization computation is delegated to the wrapped ``base``
    evaluator. This wrapper only changes where videos are written
    (``videos_train_viz`` under the root dir) and prefixes every metric key
    with ``train_viz/``.
    """

    def __init__(self, base: EvalVideo, limit_val_batches: int = 50):
        """Wrap ``base`` and mirror its eval-affecting settings.

        Args:
            base: Concrete evaluator whose ``compute_metrics_and_viz`` is
                reused by this wrapper.
            limit_val_batches: Passed through to the parent initializer.
        """
        # Stored before the parent initializer runs: the ``trainer`` and
        # ``model`` setters below dereference ``self.base``, so it has to
        # exist if the parent initializer assigns either attribute
        # (not visible from here).
        self.base = base
        # Mirror the wrapped evaluator's knobs so the buffering/flushing
        # logic inherited from EvalVideo matches the base's intent.
        # compute_metrics_and_viz itself is still delegated to base.
        forwarded = dict(
            limit_val_batches=limit_val_batches,
            viz_func=base.viz_func,
            transform_lists=base.transform_lists,
            one_video_per_task=base.one_video_per_task,
            max_frames_per_task=base.max_frames_per_task,
        )
        super().__init__(**forwarded)

    @property
    def trainer(self):
        """Trainer most recently assigned to this wrapper."""
        return self._trainer

    @trainer.setter
    def trainer(self, new_trainer):
        # Keep the wrapped evaluator pointed at the same trainer.
        self._trainer = new_trainer
        self.base.trainer = new_trainer

    @property
    def model(self):
        """Model most recently assigned to this wrapper."""
        return self._model

    @model.setter
    def model(self, new_model):
        # Keep the wrapped evaluator pointed at the same model.
        self._model = new_model
        self.base.model = new_model

    def video_dir(self):
        """Return the ``videos_train_viz`` directory under ``root_dir()``."""
        root = self.root_dir()
        return os.path.join(root, "videos_train_viz")

    def compute_metrics_and_viz(self, batch):
        """Delegate to ``base`` and namespace the returned metric keys.

        Returns:
            ``(metrics, images_dict)`` where every metric key carries the
            ``train_viz/`` prefix and ``images_dict`` is passed through
            unchanged.
        """
        base_metrics, images_dict = self.base.compute_metrics_and_viz(batch)
        prefixed = {}
        for name, value in base_metrics.items():
            prefixed[f"train_viz/{name}"] = value
        return prefixed, images_dict
48 changes: 48 additions & 0 deletions egomimic/hydra_configs/data/mecka_pi_10_hrs.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,48 @@
_target_: egomimic.pl_utils.pl_data_utils.MultiDataModuleWrapper

train_datasets:
mecka_bimanual:
_target_: egomimic.rldb.zarr.zarr_dataset_multi.MultiDataset._from_resolver
resolver:
_target_: egomimic.rldb.zarr.zarr_dataset_multi.S3EpisodeResolver
folder_path: ${paths.dataset_dir}
key_map:
_target_: egomimic.rldb.embodiment.human.Mecka.get_keymap
mode: cartesian_pi
annotation_key: annotations
transform_list:
_target_: egomimic.rldb.embodiment.human.Mecka.get_transform_list
mode: cartesian
filters:
_target_: egomimic.rldb.filters.DatasetFilter
filter_lambdas:
- "lambda row: row['lab'] == 'mecka' and row['task'] in {'packaging_coffee', 'wrapping_gifts', 'cleaning_tools', 'folding_napkins', 'repairing_electronics', 'polishing_jewelry', 'disassembling_phone', 'assembling_flowers', 'making_dumplings', 'peeling_vegetables'}"
mode: train

valid_datasets:
mecka_bimanual:
_target_: ${data.train_datasets.mecka_bimanual._target_}
resolver: ${data.train_datasets.mecka_bimanual.resolver}
filters: ${data.train_datasets.mecka_bimanual.filters}
mode: valid

train_dataloader_params:
mecka_bimanual:
batch_size: 64
num_workers: 10
valid_dataloader_params:
mecka_bimanual:
batch_size: 64
num_workers: 10

# Train-set visualization loader. Takes effect together with
# `+evaluator@train_viz_evaluator=train_viz_pi`.
train_viz_datasets:
mecka_bimanual:
_target_: ${data.train_datasets.mecka_bimanual._target_}
resolver: ${data.train_datasets.mecka_bimanual.resolver}
filters: ${data.train_datasets.mecka_bimanual.filters}
mode: train
train_viz_dataloader_params:
mecka_bimanual:
batch_size: 64
num_workers: 10
48 changes: 48 additions & 0 deletions egomimic/hydra_configs/data/mecka_pi_50_hrs.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,48 @@
_target_: egomimic.pl_utils.pl_data_utils.MultiDataModuleWrapper

train_datasets:
mecka_bimanual:
_target_: egomimic.rldb.zarr.zarr_dataset_multi.MultiDataset._from_resolver
resolver:
_target_: egomimic.rldb.zarr.zarr_dataset_multi.S3EpisodeResolver
folder_path: ${paths.dataset_dir}
key_map:
_target_: egomimic.rldb.embodiment.human.Mecka.get_keymap
mode: cartesian_pi
annotation_key: annotations
transform_list:
_target_: egomimic.rldb.embodiment.human.Mecka.get_transform_list
mode: cartesian
filters:
_target_: egomimic.rldb.filters.DatasetFilter
filter_lambdas:
- "lambda row: row['lab'] == 'mecka' and row['task'] in {'bottling_perfume', 'assembling_flowers', 'packaging_perfumes', 'making_dumplings', 'cleaning_tools', 'planting_seedlings', 'assembling_components', 'disassembling_phone', 'portioning_food', 'packaging_cutlery', 'stringing_beads', 'packaging_nuts', 'wrapping_gifts', 'arranging_flowers', 'folding_paper', 'packaging_gifts', 'peeling_garlic', 'disassembling_laptops', 'peeling_vegetables', 'cleaning_windows', 'packaging_masks', 'rinsing_dishes', 'making_paper_bags', 'packaging_coffee', 'crafting_decorations'}"
mode: train

valid_datasets:
mecka_bimanual:
_target_: ${data.train_datasets.mecka_bimanual._target_}
resolver: ${data.train_datasets.mecka_bimanual.resolver}
filters: ${data.train_datasets.mecka_bimanual.filters}
mode: valid

train_dataloader_params:
mecka_bimanual:
batch_size: 64
num_workers: 10
valid_dataloader_params:
mecka_bimanual:
batch_size: 64
num_workers: 10

# Train-set visualization loader. Takes effect together with
# `+evaluator@train_viz_evaluator=train_viz_pi`.
train_viz_datasets:
mecka_bimanual:
_target_: ${data.train_datasets.mecka_bimanual._target_}
resolver: ${data.train_datasets.mecka_bimanual.resolver}
filters: ${data.train_datasets.mecka_bimanual.filters}
mode: train
train_viz_dataloader_params:
mecka_bimanual:
batch_size: 64
num_workers: 10
21 changes: 19 additions & 2 deletions egomimic/hydra_configs/data/mecka_pi_eval.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -22,7 +22,7 @@ train_datasets:
filter_lambdas:
- "lambda row: row['lab'] == 'mecka' and row['episode_hash'] in {'69b9a059c69a19757c9ec3e7', '69b979626e470e7c633d794d', '693a33831eb710720c250034', '696da1be2ddbc19f3c9b0899', '692eaa836974927fef249cfc'}"
# cleaning_shoes, dishwashing, potting_plants, folding_clothes, cup_on_saucer
mode: train
mode: total

valid_datasets:
mecka_bimanual:
Expand All @@ -33,7 +33,7 @@ valid_datasets:
filter_lambdas:
- "lambda row: row['lab'] == 'mecka' and row['episode_hash'] in {'69b9a059c69a19757c9ec3e7', '69b979626e470e7c633d794d', '693a33831eb710720c250034', '696da1be2ddbc19f3c9b0899', '692eaa836974927fef249cfc'}"
# cleaning_shoes, dishwashing, potting_plants, folding_clothes, cup_on_saucer
mode: valid
mode: total

train_dataloader_params:
mecka_bimanual:
Expand All @@ -43,3 +43,20 @@ valid_dataloader_params:
mecka_bimanual:
batch_size: 32
num_workers: 10

# Optional train-set visualization loader. When set, ModelWrapper drives a
# second pass through these datasets each validation epoch and routes them to
# the TrainVizEvalVideo wrapper (videos under `videos_train_viz/`, metrics
# prefixed with `train_viz/`). Uncomment together with
# `+evaluator@train_viz_evaluator=train_viz_pi`.
#
# train_viz_datasets:
# mecka_bimanual:
# _target_: ${data.train_datasets.mecka_bimanual._target_}
# resolver: ${data.train_datasets.mecka_bimanual.resolver}
# filters: ${data.train_datasets.mecka_bimanual.filters}
# mode: train
# train_viz_dataloader_params:
# mecka_bimanual:
# batch_size: 32
# num_workers: 10
19 changes: 19 additions & 0 deletions egomimic/hydra_configs/evaluator/train_viz_pi.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,19 @@
# Wraps the canonical Pi evaluator (eval_pi) so it can run a second time
# against the `train_viz` dataloader. The wrapped `base` evaluator owns
# compute_metrics_and_viz; this wrapper writes videos to `videos_train_viz/`
# and prefixes metric keys with `train_viz/`.
#
# Enable via `+evaluator@train_viz_evaluator=train_viz_pi` together with a
# data config that defines `train_viz_datasets` and
# `train_viz_dataloader_params`.

defaults:
- /evaluator@base: eval_pi
- _self_

_target_: egomimic.eval.eval_train_viz.TrainVizEvalVideo

# Cap on validation batches consumed from the train_viz loader per epoch.
# Train data is iterated as a "spot check"; keep this small relative to the
# canonical valid loader's limit_val_batches.
limit_val_batches: 50
17 changes: 17 additions & 0 deletions egomimic/hydra_configs/hydra/launcher/submitit_cpu_pace.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,17 @@
defaults:
- submitit_slurm

_target_: hydra_plugins.hydra_submitit_launcher.submitit_launcher.SlurmLauncher

# CPU-only PACE launcher for norm-stats / preprocessing jobs (no GPU).
name: ${hydra.job.name}
partition: "cpu-small" # PACE CPU partition — confirm before first submit
account: "gts-dxu345-rl2"
cpus_per_task: 24
nodes: 1
tasks_per_node: 1
qos: "inferno"
mem_per_cpu: 8G
timeout_min: 720 # 12h
additional_parameters:
requeue: true
5 changes: 5 additions & 0 deletions egomimic/hydra_configs/train_zarr_cartesian.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -15,6 +15,11 @@ description: test
ckpt_path: null
mode: train

# Optional second evaluator that runs against the train_viz dataloader.
# Set via override, e.g. `+evaluator@train_viz_evaluator=train_viz_pi`, together
# with a data config that defines `train_viz_datasets` and
# `train_viz_dataloader_params`.
train_viz_evaluator: null

hydra:
run:
# Dir should be experiment_name/description_{timestamp}
Expand Down
4 changes: 2 additions & 2 deletions egomimic/hydra_configs/trainer/ddp_pi.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -7,5 +7,5 @@ accelerator: gpu
devices: ${eval:'${launch_params.gpus_per_node} * ${launch_params.nodes}'}
num_nodes: ${launch_params.nodes}
sync_batchnorm: True
check_val_every_n_epoch: 200
num_sanity_val_steps: 0
check_val_every_n_epoch: 100
num_sanity_val_steps: 0
39 changes: 34 additions & 5 deletions egomimic/pl_utils/pl_data_utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -60,13 +60,23 @@ def __init__(
valid_datasets: dict,
train_dataloader_params: dict,
valid_dataloader_params: dict,
train_viz_datasets: dict | None = None,
train_viz_dataloader_params: dict | None = None,
):
"""
Args:
train_datasets: dictionary of train datasets
valid_datasets: dictionary of valid datasets
train_dataloader_params: dictionary of train dataloader parameters
valid_dataloader_params: dictionary of valid dataloader parameters
train_viz_datasets: optional dict of datasets iterated like a
second val loader. Used by TrainVizEvalVideo to visualize the
policy on training data alongside the canonical validation
pass. When set, ``val_dataloader()`` returns a list with the
train-viz CombinedLoader at index 1 so Lightning populates
``dataloader_idx=1`` on validation_step.
train_viz_dataloader_params: dict of per-dataset DataLoader kwargs
for the train-viz loader (parallels valid_dataloader_params).

Tokenization (sampling a prompt from per-sample annotation lists,
splicing in embodiment / control-mode / proprio blocks, and running
Expand All @@ -81,8 +91,12 @@ def __init__(
# dataset defined in a base (e.g. `aria_bimanual: null`).
self.train_datasets = {k: v for k, v in train_datasets.items() if v is not None}
self.valid_datasets = {k: v for k, v in valid_datasets.items() if v is not None}
self.train_viz_datasets = {
k: v for k, v in (train_viz_datasets or {}).items() if v is not None
}
self.train_dataloader_params = train_dataloader_params
self.valid_dataloader_params = valid_dataloader_params
self.train_viz_dataloader_params = train_viz_dataloader_params or {}
self.collate_fn = annotation_collate

def train_dataloader(self):
Expand All @@ -102,13 +116,13 @@ def train_dataloader(self):

return CombinedLoader(iterables, "max_size_cycle")

def val_dataloader(self):
def _build_val_style_loader(self, datasets: dict, params: dict, kind: str):
iterables = dict()
for dataset_name, dataset in self.valid_datasets.items():
dataset_params = self.valid_dataloader_params.get(dataset_name)
for dataset_name, dataset in datasets.items():
dataset_params = params.get(dataset_name)
if dataset_params is None or len(dataset_params) == 0:
raise ValueError(
f"No dataloader params found for dataset {dataset_name}. Please add {dataset_name} into your data config valid_dataloader_params."
f"No dataloader params found for dataset {dataset_name}. Please add {dataset_name} into your data config {kind}_dataloader_params."
)
dataset_params = dict(dataset_params)
shuffle = dataset_params.pop("shuffle", False)
Expand All @@ -118,9 +132,24 @@ def val_dataloader(self):
collate_fn=self.collate_fn,
**dataset_params,
)

return CombinedLoader(iterables, "max_size_cycle")

def val_dataloader(self):
    """Build the validation loader(s).

    Returns:
        The valid loader on its own when ``self.train_viz_datasets`` is
        empty; otherwise the list ``[valid_loader, train_viz_loader]``.
    """
    valid_loader = self._build_val_style_loader(
        self.valid_datasets, self.valid_dataloader_params, kind="valid"
    )
    if self.train_viz_datasets:
        # Returning a list lets Lightning populate dataloader_idx
        # (0=valid, 1=train_viz) so ModelWrapper can dispatch to
        # self.train_viz_evaluator.
        return [
            valid_loader,
            self._build_val_style_loader(
                self.train_viz_datasets,
                self.train_viz_dataloader_params,
                kind="train_viz",
            ),
        ]
    return valid_loader


class DualDataModuleWrapper(LightningDataModule):
"""
Expand Down
Loading
Loading