From ac7a9c99a6c5047826e289c9617a6d2cbc764109 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Tobias=20J=C3=BClg?=
Date: Fri, 16 Jan 2026 10:17:29 +0100
Subject: [PATCH 1/5] ci: fix pipeline

---
 .github/workflows/pipeline.yaml |  2 +-
 README.md                       | 14 +++++++-------
 2 files changed, 8 insertions(+), 8 deletions(-)

diff --git a/.github/workflows/pipeline.yaml b/.github/workflows/pipeline.yaml
index b08d04a..f05e77e 100644
--- a/.github/workflows/pipeline.yaml
+++ b/.github/workflows/pipeline.yaml
@@ -37,7 +37,7 @@ jobs:
           path: dist/
 
   upload_pypi:
-    needs: [build_dist]
+    needs: [pipeline]
     runs-on: ubuntu-latest
     # Upload to PyPI on release
     if: always() && github.event_name == 'release' && github.event.action == 'published'
diff --git a/README.md b/README.md
index b53d3cc..cbb11f2 100644
--- a/README.md
+++ b/README.md
@@ -13,17 +13,18 @@ The work also includes a section on related engineering challenges regarding jax
 
 ## Installation
 
+### Pip Installation (Recommended)
+```shell
+pip install vlagents
+```
+
 ### Local Installation
 ```shell
-git clone https://github.com/juelg/vlagents.git
+git clone https://github.com/RobotControlStack/vlagents.git
 cd vlagents
 pip install -ve .
 ```
 
-### Repo Installation
-```shell
-pip install git+https://github.com/juelg/vlagents.git
-```
 
 ### Environment and Policy Installation
 On top of vlagents you can then install a simulation environment where the agent acts.
@@ -190,9 +191,8 @@ If you find the agent useful for your work, please consider citing the original
 ```
 @inproceedings{juelg2025refinedpolicydistillationvla,
   title={{Refined Policy Distillation}: {F}rom {VLA} Generalists to {RL} Experts},
-  author={Tobias Jülg and Wolfram Burgard and Florian Walter},
+  author={Tobias J{\"u}lg and Wolfram Burgard and Florian Walter},
   year={2025},
   booktitle={Proc.~of the IEEE/RSJ Int.~Conf.~on Intelligent Robots and Systems (IROS)},
-  note={Accepted for publication.}
 }
 ```
\ No newline at end of file

From 3d4e14dad027cf8b11b6345c353c1ce7b6f0fd2c Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Tobias=20J=C3=BClg?=
Date: Fri, 16 Jan 2026 12:19:11 +0100
Subject: [PATCH 2/5] feat: added support for libero

---
 README.md                      | 18 ++++++++-
 src/vlagents/evaluator_envs.py | 73 +++++++++++++++++++++++++++++++---
 2 files changed, 84 insertions(+), 7 deletions(-)

diff --git a/README.md b/README.md
index cbb11f2..25abd37 100644
--- a/README.md
+++ b/README.md
@@ -28,9 +28,20 @@ pip install -ve .
 
 ### Environment and Policy Installation
 On top of vlagents you can then install a simulation environment where the agent acts.
-We currently support [maniskill](https://github.com/haosulab/ManiSkill) with more to come.
+We currently support the following environments:
+- [maniskill](https://github.com/haosulab/ManiSkill)
+- [robot control stack](https://github.com/RobotControlStack/robot-control-stack)
+- [libero](https://github.com/Lifelong-Robot-Learning/LIBERO)
+
+
 In order to avoid dependency conflicts, use a second conda/pip environment to install your policy.
-We currently support [octo](https://github.com/octo-models/octo) and [openvla](https://github.com/openvla/openvla). 
+We currently support the following policies: +- [octo](https://github.com/octo-models/octo) +- [openvla](https://github.com/openvla/openvla) +- [openpi](https://github.com/Physical-Intelligence/openpi) +- [vjepa2-ac](https://github.com/facebookresearch/vjepa2) +- [diffusion policy](https://github.com/real-stanford/diffusion_policy) + ### Octo To use Octo as an agent/policy you need to create a new conda environment: @@ -135,6 +146,9 @@ pip install -ve . ``` +### Diffusion Policy +Currently located on the branch `diffusion_policy`. + ## Usage To start an vlagents server use the `start-server` command where `kwargs` is a dictionary of the constructor arguments of the policy you want to start e.g. ```shell diff --git a/src/vlagents/evaluator_envs.py b/src/vlagents/evaluator_envs.py index e8306d4..fbf6c6f 100644 --- a/src/vlagents/evaluator_envs.py +++ b/src/vlagents/evaluator_envs.py @@ -132,11 +132,7 @@ def __init__(self, env_id, seed, **env_kwargs): # TODO: one could save only every nth episode by adding an episode counter which steps the record env only # when the counter is divisible by n otherwise steps the normal env logging.info(f"Creating ManiSkill env {env_id}") - if "video_dir" in env_kwargs: - output_dir = env_kwargs["video_dir"] - del env_kwargs["video_dir"] - else: - output_dir = None + output_dir = env_kwargs.pop("video_dir", None) super().__init__(env_id, seed, **env_kwargs) logging.info(f"Created ManiSkill env {env_id}") if "human_render_camera_configs" in env_kwargs: @@ -204,11 +200,78 @@ def do_import(): EvaluatorEnv.register("PokeCube-v1", ManiSkill) +class Libero(EvaluatorEnv): + + def __init__(self, env_id: str, seed: int, **env_kwargs) -> None: + logging.info("Creating Libero env") + self.env, self._language_instruction, self.task_name, self.task_suite, self.task_id, self.task = self._make_gym( + env_id, seed, **env_kwargs + ) + logging.info( + f"Created Libero env, task suite: {env_id}, task id: {self.task_id}, task name {self.task_name}, instruction: {self._language_instruction}" + ) + self.env_id = env_id + self.seed = seed + + def _make_gym(self, env_id, seed, **env_kwargs): + from libero.libero import benchmark, get_libero_path + from libero.libero.envs import OffScreenRenderEnv + + benchmark_dict = benchmark.get_benchmark_dict() + + task_suite = benchmark_dict[env_id]() + task_id = min(max(env_kwargs.pop("task_id", 0), 0), task_suite.n_tasks - 1) + task = task_suite.get_task(task_id) + + task_bddl_file = os.path.join(get_libero_path("bddl_files"), task.problem_folder, task.bddl_file) + env = OffScreenRenderEnv( + bddl_file_name=task_bddl_file, + **env_kwargs, + ) + env.seed(seed) + return env, task.language, task.name, task_suite, task_id, task + + def translate_obs(self, obs: dict[str, Any]) -> Obs: + return Obs( + cameras=dict(rgb_side=obs["agentview_image"]), + gripper=obs["robot0_gripper_qpos"] / 0.04, # normalize + ) + + def step(self, action: Act) -> tuple[Obs, float, bool, bool, dict]: + # change gripper to libero format (-1, 1) where -1 is open + action.action[-1] = (1 - action.action[-1]) * 2 - 1.0 + obs, reward, done, info = self.env.step(action.action) + return self.translate_obs(obs), reward, done, done, info + + def reset(self, seed: int | None = None, options: dict[str, Any] | None = None) -> tuple[Obs, dict[str, Any]]: + obs, info = self.env.reset() + init_states = self.task_suite.get_task_init_states( + self.task_id + ) # for benchmarking purpose, we fix the a set of initial states + init_state_id = 0 + 
self.env.set_init_state(init_states[init_state_id]) + + return self.translate_obs(obs), info + + @property + def language_instruction(self) -> str: + return self._language_instruction + + +EvaluatorEnv.register("libero_10", Libero) +EvaluatorEnv.register("libero_90", Libero) +EvaluatorEnv.register("libero_100", Libero) +EvaluatorEnv.register("libero_spatial", Libero) +EvaluatorEnv.register("libero_object", Libero) +EvaluatorEnv.register("libero_goal", Libero) + + @dataclass class EvalConfig: env_id: str env_kwargs: dict[str, Any] max_steps_per_episode: int = 100 + # TODO: add seed, on same machine and jpeg encoding @dataclass From c016f2e64bb3b0962b7fb7a1d901cfd3865831c0 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Tobias=20J=C3=BClg?= Date: Fri, 16 Jan 2026 16:45:36 +0100 Subject: [PATCH 3/5] feat: libero fixes - added relative and absolute control options for libero - corrected gripper translation - added wrist camera - added doc string for libero - tested libero - refactored eval run into two functions, to be callable in python --- src/vlagents/__main__.py | 293 ++++++++++----------------------- src/vlagents/evaluator_envs.py | 106 ++++++------ 2 files changed, 137 insertions(+), 262 deletions(-) diff --git a/src/vlagents/__main__.py b/src/vlagents/__main__.py index f3797d5..74727ce 100644 --- a/src/vlagents/__main__.py +++ b/src/vlagents/__main__.py @@ -82,12 +82,12 @@ def _per_process( @main_app.command() -def run_eval_post_training( - wandb_project: Annotated[str, typer.Option(help="weights and biases logging project.")], - wandb_entity: Annotated[str, typer.Option(help="weights and biases logging entity.")], - wandb_note: Annotated[str, typer.Option(help="weights and biases logging note.")], - wandb_name: Annotated[str, typer.Option(help="weights and biases logging name.")], +def run_eval( output_path: Annotated[str, typer.Option(help="Path to store the run results.")], + wandb_project: Annotated[str | None, typer.Option(help="weights and biases logging project.")] = None, + wandb_entity: Annotated[str | None, typer.Option(help="weights and biases logging entity.")] = None, + wandb_note: Annotated[str | None, typer.Option(help="weights and biases logging note.")] = None, + wandb_name: Annotated[str | None, typer.Option(help="weights and biases logging name.")] = None, wandb_group: Annotated[str | None, typer.Option(help="weights and biases logging name.")] = None, steps: Annotated[str | None, typer.Option(help="steps to evaluate.")] = None, episodes: Annotated[int, typer.Option(help="Number of episodes to run.")] = 100, @@ -104,203 +104,63 @@ def run_eval_post_training( post training eval which goes over all checkpoints - each checkpoint with many envs """ - if steps is None: - steps = [None] - else: - steps = json.loads(steps) - if wandb_group == "": - wandb_group = None - - wandb.init( - entity=wandb_entity, - resume="allow", - project=wandb_project, - # config=dict(agent_name=agent_name, agent_kwargs=json.loads(kwargs), eval_cfgs=json.loads(eval_cfgs)), - notes=wandb_note, - job_type="eval", - name=wandb_name, - group=wandb_group, - ) - wandb_log_git_diff(output_path) - wandb.run.log_code(".") - - wandb.define_metric( - "total/success", - step_metric="train_step", - overwrite=False, - step_sync=False, - hidden=False, - summary="max", - ) - wandb.define_metric( - "total/last_step_reward", - step_metric="train_step", - overwrite=False, - step_sync=False, - hidden=False, - summary="max", + eval_cfgs_ = [EvalConfig(**cfg) for cfg in json.loads(eval_cfgs)] + agent_cfg_ = 
AgentConfig(**json.loads(agent_cfg)) + _run_eval( + output_path=output_path, + eval_cfgs=eval_cfgs_, + agent_cfg=agent_cfg_, + wandb_project=wandb_project, + wandb_entity=wandb_entity, + wandb_note=wandb_note, + wandb_name=wandb_name, + wandb_group=wandb_group, + steps=steps, + episodes=episodes, + n_processes=n_processes, + n_gpus=n_gpus, ) - wandb.define_metric( - "total/total_steps", - step_metric="train_step", - overwrite=False, - step_sync=False, - hidden=False, - summary="min", - ) - wandb.define_metric( - "total/mean_reward", - step_metric="train_step", - overwrite=False, - step_sync=False, - hidden=False, - summary="max", - ) - eval_cfgs = [EvalConfig(**cfg) for cfg in json.loads(eval_cfgs)] - for idx, env in enumerate(eval_cfgs): - wandb.define_metric( - f"{env.env_id}/success", - step_metric="train_step", - overwrite=False, - step_sync=False, - hidden=False, - summary="max", - ) - wandb.define_metric( - f"{env.env_id}/last_step_reward", - step_metric="train_step", - overwrite=False, - step_sync=False, - hidden=False, - summary="max", - ) - wandb.define_metric( - f"{env.env_id}/total_steps", - step_metric="train_step", - overwrite=False, - step_sync=False, - hidden=False, - summary="min", - ) - wandb.define_metric( - f"{env.env_id}/mean_reward", - step_metric="train_step", - overwrite=False, - step_sync=False, - hidden=False, - summary="max", - ) - # distribute gpus equally - gpus_ids = [i % n_gpus for i in range(len(steps))] - # spawn n processes and run in parallel - - agent_cfgs = [AgentConfig(**json.loads(agent_cfg)) for _ in steps] - for idx in range(len(steps)): - agent_cfgs[idx].port += idx - with Pool(n_processes) as p: - args = [(step, agent_cfgs[idx], eval_cfgs, episodes, 1, gpus_ids[idx]) for idx, step in enumerate(steps)] - results = p.map(_per_process, args) - logging.info("Finished evaluation") - - for result in results: - per_env_results_last_reward, per_env_results_rewards, mean_rewards, step = result - step = step if step is not None else 0 - wandb_log_dict = { - "total/success": per_env_results_last_reward.mean(axis=(0, 1))[0], - "total/last_step_reward": per_env_results_last_reward.mean(axis=(0, 1))[1], - "total/total_steps": per_env_results_last_reward.mean(axis=(0, 1))[2], - "total/mean_reward": np.mean(mean_rewards), - "train_step": step, - } - # log for each env - for idx, env in enumerate(eval_cfgs): - wandb_log_dict.update( - { - f"{env.env_id}/success": per_env_results_last_reward[idx].mean(axis=0)[0], - f"{env.env_id}/last_step_reward": per_env_results_last_reward[idx].mean(axis=0)[1], - f"{env.env_id}/total_steps": per_env_results_last_reward[idx].mean(axis=0)[2], - f"{env.env_id}/mean_reward": mean_rewards[idx], - } - ) - wandb.log(wandb_log_dict, step=step, commit=True) - - path = write_results( - per_env_results_last_reward, - per_env_results_rewards, - eval_cfgs, - agent_cfg=agent_cfgs[0], - out=output_path, - ) - wandb.log_artifact(path, type="file", name="results", aliases=[f"step_{step}"]) - - -@main_app.command() -def run_eval_during_training( - wandb_id: Annotated[str, typer.Option(help="weights and biases logging id.")], - wandb_group: Annotated[str, typer.Option(help="weights and biases logging group.")], - wandb_project: Annotated[str, typer.Option(help="weights and biases logging project.")], - wandb_entity: Annotated[str, typer.Option(help="weights and biases logging entity.")], - wandb_note: Annotated[str, typer.Option(help="weights and biases logging note.")], - wandb_name: Annotated[str, typer.Option(help="weights and biases logging 
name.")], - output_path: Annotated[str, typer.Option(help="Path to store the run results.")], - wandb_first: Annotated[bool, typer.Option(help="whether its the first eval.")] = False, - episodes: Annotated[int, typer.Option(help="Number of episodes to run.")] = 100, - n_processes: Annotated[int | None, typer.Option(help="Number of processes to run.")] = None, - eval_cfgs: Annotated[ - str, typer.Option(help="Evaluation configurations.") - ] = '[{"env": "rcs/SimplePickUpSim-v0", "kwargs": {}}]', - agent_cfg: Annotated[ - str, typer.Option(help="Agent configuration.") - ] = '{"host": "localhost", "port": 8080, "agent_name": "Test", "agent_kwargs": {}, "python_path": "python"}', +def _run_eval( + output_path: str, + eval_cfgs: list[EvalConfig], + agent_cfg: AgentConfig, + wandb_project: str | None = None, + wandb_entity: str | None = None, + wandb_note: str | None = None, + wandb_name: str | None = None, + wandb_group: str | None = None, + steps: str | None = None, + episodes: int = 100, + n_processes: int | None = None, + n_gpus: int = 1, ): - """ - during training eval, all need to use the same id - - just for one model, but many envs - - can be new run but at least in the same project and same group as the training - """ - assert ( - agent_cfg["agent_name"] != "Test" - ), "agent_cfg needs to be passed as a json argument. See the default for an example." + if steps is None: + steps = [None] + else: + steps = json.loads(steps) + + agent_cfgs = [agent_cfg] * len(steps) + # TODO: make this a prober argument that is passed + os.environ["RUN_PATH"] = output_path - if wandb_first: + use_wandb = wandb_project is not None and wandb_entity is not None + if use_wandb: wandb.init( - id=wandb_id, entity=wandb_entity, resume="allow", - group=wandb_group, project=wandb_project, # config=dict(agent_name=agent_name, agent_kwargs=json.loads(kwargs), eval_cfgs=json.loads(eval_cfgs)), notes=wandb_note, job_type="eval", name=wandb_name, + group=wandb_group, ) - wandb_log_git_diff(output_path) wandb.run.log_code(".") - else: - wandb.init(id=wandb_id, entity=wandb_entity, resume="must", project=wandb_project) - eval_cfgs = [EvalConfig(**cfg) for cfg in json.loads(eval_cfgs)] - - agent_cfg = AgentConfig(**json.loads(agent_cfg)) - step = agent_cfg.agent_kwargs.get("checkpoint_step", 0) - step = step if step is not None else 0 - - per_env_results_last_reward, per_env_results_rewards = evaluation( - agent_cfg=agent_cfg, eval_cfgs=eval_cfgs, episodes=episodes, n_processes=n_processes - ) - - # return is [envs, episodes, 3(success, reward, steps)], [envs, episodes, rewards for all steps in the episode] - - flatten_rewards = [[item for sublist in env_rewards for item in sublist] for env_rewards in per_env_results_rewards] - mean_rewards = [np.mean(env_rewards) if env_rewards else 0.0 for env_rewards in flatten_rewards] - - # these new define metric to also not work with several jobs - # wandb says that logging can only be done in run groups - if wandb_first: wandb.define_metric( "total/success", step_metric="train_step", @@ -367,33 +227,50 @@ def run_eval_during_training( summary="max", ) - wandb_log_dict = { - "total/success": per_env_results_last_reward.mean(axis=(0, 1))[0], - "total/last_step_reward": per_env_results_last_reward.mean(axis=(0, 1))[1], - "total/total_steps": per_env_results_last_reward.mean(axis=(0, 1))[2], - "total/mean_reward": np.mean(mean_rewards), - "train_step": step, - } + # distribute gpus equally + gpus_ids = [i % n_gpus for i in range(len(steps))] + + # spawn n processes and run in parallel 
+ + for idx in range(len(steps)): + agent_cfgs[idx].port += idx + with Pool(n_processes) as p: + args = [(step, agent_cfgs[idx], eval_cfgs, episodes, 1, gpus_ids[idx]) for idx, step in enumerate(steps)] + results = p.map(_per_process, args) + logging.info("Finished evaluation") - # log for each env - for idx, env in enumerate(eval_cfgs): - wandb_log_dict.update( - { - f"{env.env_id}/success": per_env_results_last_reward[idx].mean(axis=0)[0], - f"{env.env_id}/last_step_reward": per_env_results_last_reward[idx].mean(axis=0)[1], - f"{env.env_id}/total_steps": per_env_results_last_reward[idx].mean(axis=0)[2], - f"{env.env_id}/mean_reward": mean_rewards[idx], + for result in results: + per_env_results_last_reward, per_env_results_rewards, mean_rewards, step = result + if use_wandb: + step = step if step is not None else 0 + wandb_log_dict = { + "total/success": per_env_results_last_reward.mean(axis=(0, 1))[0], + "total/last_step_reward": per_env_results_last_reward.mean(axis=(0, 1))[1], + "total/total_steps": per_env_results_last_reward.mean(axis=(0, 1))[2], + "total/mean_reward": np.mean(mean_rewards), + "train_step": step, } + # log for each env + for idx, env in enumerate(eval_cfgs): + wandb_log_dict.update( + { + f"{env.env_id}/success": per_env_results_last_reward[idx].mean(axis=0)[0], + f"{env.env_id}/last_step_reward": per_env_results_last_reward[idx].mean(axis=0)[1], + f"{env.env_id}/total_steps": per_env_results_last_reward[idx].mean(axis=0)[2], + f"{env.env_id}/mean_reward": mean_rewards[idx], + } + ) + wandb.log(wandb_log_dict, step=step, commit=True) + + path = write_results( + per_env_results_last_reward, + per_env_results_rewards, + eval_cfgs, + agent_cfg=agent_cfgs[0], + out=output_path, ) - wandb.log(wandb_log_dict, step=step, commit=True) - path = write_results( - per_env_results_last_reward, - per_env_results_rewards, - eval_cfgs, - agent_cfg=agent_cfg, - out=output_path, - ) - wandb.log_artifact(path, type="file", name="results", aliases=[f"step_{step}"]) + if use_wandb: + wandb.log_artifact(path, type="file", name="results", aliases=[f"step_{step}"]) if __name__ == "__main__": diff --git a/src/vlagents/evaluator_envs.py b/src/vlagents/evaluator_envs.py index fbf6c6f..ff7c470 100644 --- a/src/vlagents/evaluator_envs.py +++ b/src/vlagents/evaluator_envs.py @@ -202,10 +202,20 @@ def do_import(): class Libero(EvaluatorEnv): - def __init__(self, env_id: str, seed: int, **env_kwargs) -> None: + def __init__(self, env_id: str, seed: int, reset_steps: int = 14, **env_kwargs) -> None: + """ + For supported env_kwargs checkout ControlEnv class in libero. + We add the following env_kwargs on top: + - task_id (int): libero task id for given task suite. The number of tasks per task suite can checked with Libero.n_tasks(env_id). Defaults to 0. + - control_mode (str): either 'relative' or 'absolute'. Defaults to 'relative'. 
+ + """ logging.info("Creating Libero env") + self.env_kwargs = env_kwargs + self.reset_steps = reset_steps + self.control_mode = self.env_kwargs.pop("control_mode", "relative") self.env, self._language_instruction, self.task_name, self.task_suite, self.task_id, self.task = self._make_gym( - env_id, seed, **env_kwargs + env_id, seed, **self.env_kwargs ) logging.info( f"Created Libero env, task suite: {env_id}, task id: {self.task_id}, task name {self.task_name}, instruction: {self._language_instruction}" @@ -213,7 +223,16 @@ def __init__(self, env_id: str, seed: int, **env_kwargs) -> None: self.env_id = env_id self.seed = seed - def _make_gym(self, env_id, seed, **env_kwargs): + @staticmethod + def n_tasks(env_id: str) -> int: + from libero.libero import benchmark, get_libero_path + + benchmark_dict = benchmark.get_benchmark_dict() + task_suite = benchmark_dict[env_id]() + return task_suite.n_tasks + + @staticmethod + def _make_gym(env_id, seed, **env_kwargs): from libero.libero import benchmark, get_libero_path from libero.libero.envs import OffScreenRenderEnv @@ -229,29 +248,48 @@ def _make_gym(self, env_id, seed, **env_kwargs): **env_kwargs, ) env.seed(seed) + return env, task.language, task.name, task_suite, task_id, task def translate_obs(self, obs: dict[str, Any]) -> Obs: return Obs( - cameras=dict(rgb_side=obs["agentview_image"]), + cameras=dict(rgb_side=obs["agentview_image"][::-1], rgb_wrist=obs["robot0_eye_in_hand_image"][::-1]), gripper=obs["robot0_gripper_qpos"] / 0.04, # normalize ) def step(self, action: Act) -> tuple[Obs, float, bool, bool, dict]: # change gripper to libero format (-1, 1) where -1 is open - action.action[-1] = (1 - action.action[-1]) * 2 - 1.0 - obs, reward, done, info = self.env.step(action.action) - return self.translate_obs(obs), reward, done, done, info + act = np.copy(action.action) + act[-1] = (1 - act[-1]) * 2 - 1.0 + obs, reward, done, info = self.env.step(act) + success = self.env.check_success() + return self.translate_obs(obs), reward, success, done, info def reset(self, seed: int | None = None, options: dict[str, Any] | None = None) -> tuple[Obs, dict[str, Any]]: - obs, info = self.env.reset() + obs = self.env.reset() init_states = self.task_suite.get_task_init_states( self.task_id ) # for benchmarking purpose, we fix the a set of initial states init_state_id = 0 self.env.set_init_state(init_states[init_state_id]) - return self.translate_obs(obs), info + for robot in self.env.robots: + robot.controller.use_delta = True + for _ in range(self.reset_steps): + obs, _, _, _ = self.env.step( + np.zeros(8) if "JOINT" in self.env_kwargs.get("controller", "OSC_POSE") else np.zeros(7) + ) + + if self.control_mode == "absolute": + for robot in self.env.robots: + robot.controller.use_delta = False + elif self.control_mode == "relative": + for robot in self.env.robots: + robot.controller.use_delta = True + else: + raise ValueError(f"Invalid control mode: {self.control_mode}, use 'absolute' or 'relative'.") + + return self.translate_obs(obs), {} @property def language_instruction(self) -> str: @@ -285,11 +323,11 @@ class AgentConfig: def single_eval(env: EvaluatorEnv, agent: Agent, max_steps: int, i) -> tuple[list[float], list[float], list[float]]: - logging.debug(f"Starting evaluation of {env.env.unwrapped.spec.id}") + logging.debug(f"Starting evaluation") obs, _ = env.reset(options={}) - logging.debug(f"Reset env {env.env.unwrapped.spec.id}") + logging.debug(f"Reset env") agent.reset(obs, env.language_instruction) - logging.debug(f"Reset agent 
{env.env.unwrapped.spec.id}") + logging.debug(f"Reset agent") done = False truncated = False step = 0.0 @@ -320,9 +358,7 @@ def single_eval(env: EvaluatorEnv, agent: Agent, max_steps: int, i) -> tuple[lis ) env.reset(options={}) - logging.debug( - f"Finished evaluation of {env.env.unwrapped.spec.id} with {step} steps and reward {reward}, success {done}" - ) + logging.debug(f"Finished evaluation with {step} steps and reward {reward}, success {done}") # success, last reward and number of steps return done, rewards, step @@ -441,7 +477,6 @@ def evaluation( with start_server( agent_cfg.agent_name, agent_cfg.agent_kwargs, agent_cfg.port, agent_cfg.host, agent_cfg.python_path ): - sleep(30) res = multi_eval(agent_cfg, eval_cfgs, episodes, n_processes) except Exception: # Ensures you SEE the client's stack trace and any logged errors. @@ -458,44 +493,7 @@ def evaluation( return res -def run_eval_during_training( - agent_cfg: AgentConfig, - eval_cfgs: list[EvalConfig], - wandb_id: str, - wandb_entity: str, - wandb_group: str, - wandb_project: str, - wandb_note: str, - wandb_name: str, - slurm: Slurm, - output_path: str, - wandb_first: bool = False, - episodes: int = 100, - n_processes: int | None = None, - cmd=None, -): - if cmd is None: - cmd = ["python"] - cmd += [ - "-m", - "vlagents" "run-eval-during-training", - f"--agent-cfg={json.dumps(asdict(agent_cfg))}" f"--episodes={episodes}", - f"--n-processes={n_processes}", - f"--eval-cfgs={json.dumps([asdict(cfg) for cfg in eval_cfgs])}", - f"--wandb-id={wandb_id}", - f"--wandb-group={wandb_group.replace(':', '_')}", - f"--wandb-project={wandb_project}", - f"--wandb-entity={wandb_entity}", - f"--wandb-note={wandb_note}", - f"--wandb-name={wandb_name}", - f"--output-path={output_path}", - ] - if wandb_first: - cmd.append("--wandb-first") - slurm.sbatch(shlex.join(cmd)) - - -def run_eval_post_training( +def run_eval( agent_cfg: AgentConfig, eval_cfgs: list[EvalConfig], wandb_entity: str, From e56ed5e1e0343f55949d5e821b18fca39385666a Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Tobias=20J=C3=BClg?= Date: Fri, 16 Jan 2026 16:55:16 +0100 Subject: [PATCH 4/5] feat: shm and jpeg configurable --- src/tests/test_libero.py | 51 ++++++++++++++++++++++++++++++++++ src/vlagents/evaluator_envs.py | 15 ++++++++-- 2 files changed, 63 insertions(+), 3 deletions(-) create mode 100644 src/tests/test_libero.py diff --git a/src/tests/test_libero.py b/src/tests/test_libero.py new file mode 100644 index 0000000..67d422f --- /dev/null +++ b/src/tests/test_libero.py @@ -0,0 +1,51 @@ +if __name__ == "__main__": + import datetime + import os + + import numpy as np + from PIL import Image + + from lerobot.envs.libero import LiberoEnv + from vlagents.__main__ import _run_eval + from vlagents.evaluator_envs import AgentConfig, EvalConfig + + # main_app() + # test + os.environ["RUN_PATH"] = "test_output" + _run_eval( + output_path="test_output", + eval_cfgs=[ + EvalConfig( + env_id="libero_10", + env_kwargs={"controller": "OSC_POSE", "camera_heights": 256, "camera_widths": 256}, + max_steps_per_episode=100, + ) + ], + agent_cfg=AgentConfig(host="localhost", port=8080, agent_name="test", agent_kwargs={}), + n_processes=1, + n_gpus=1, + episodes=1, + ) + + # from libero.libero import benchmark, get_libero_path + # from libero.libero.envs import OffScreenRenderEnv + # benchmark_dict = benchmark.get_benchmark_dict() + + # task_suite = benchmark_dict["libero_10"]() + # env = LiberoEnv(task_suite, task_id=0, task_suite_name="libero_10") + # env.reset() + # im = [] + # im2 = [] + # for i 
in range(100): + # obs, reward, done, truncated, info = env.step(np.zeros(7)) + # im2.append(Image.fromarray(obs["pixels"]["image"])) + # im.append(Image.fromarray(obs["pixels"]["image2"])) + # env.close() + + # im[0].save( + # f"{str(datetime.datetime.now().strftime('%Y-%m-%d_%H-%M-%S'))}.gif", + # save_all=True, + # append_images=im[1:], + # duration=0.2 * 1000, + # loop=0, + # ) diff --git a/src/vlagents/evaluator_envs.py b/src/vlagents/evaluator_envs.py index ff7c470..d30c4bf 100644 --- a/src/vlagents/evaluator_envs.py +++ b/src/vlagents/evaluator_envs.py @@ -276,6 +276,7 @@ def reset(self, seed: int | None = None, options: dict[str, Any] | None = None) for robot in self.env.robots: robot.controller.use_delta = True for _ in range(self.reset_steps): + # steps the environment to filter out falling objects obs, _, _, _ = self.env.step( np.zeros(8) if "JOINT" in self.env_kwargs.get("controller", "OSC_POSE") else np.zeros(7) ) @@ -309,7 +310,9 @@ class EvalConfig: env_id: str env_kwargs: dict[str, Any] max_steps_per_episode: int = 100 - # TODO: add seed, on same machine and jpeg encoding + seed: int = 42 + same_machine: bool = False + jpeg_encoding: bool = False @dataclass @@ -373,7 +376,13 @@ def create_env_agent(agent_config: AgentConfig, cfg: EvalConfig, seed: int) -> t logging.info(f"env {cfg.env_id} not available, creating new env and agent") env = EvaluatorEnv.make(cfg.env_id, seed=seed, **cfg.env_kwargs) logging.info("done creating env") - agent = RemoteAgent(agent_config.host, agent_config.port, agent_config.agent_name) + agent = RemoteAgent( + agent_config.host, + agent_config.port, + agent_config.agent_name, + on_same_machine=cfg.same_machine, + jpeg_encoding=cfg.jpeg_encoding, + ) logging.info("done creating agent") per_process_cache[key] = (env, agent) return per_process_cache[key] @@ -402,7 +411,7 @@ def multi_eval( # single_results = p.map(run_episode, args) # without process - np.random.seed(42) + np.random.seed(cfgs[0].seed) args = [(i, cfgs, episodes, agent_cfg) for i in range(len(cfgs) * episodes)] single_results = [run_episode(arg) for arg in tqdm(args)] From a62b8b0bfefaea767bb09140bf5ba8d71a1f7b62 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Tobias=20J=C3=BClg?= Date: Fri, 16 Jan 2026 16:57:58 +0100 Subject: [PATCH 5/5] ci: only format py module --- Makefile | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/Makefile b/Makefile index 195c455..d4b6fb2 100644 --- a/Makefile +++ b/Makefile @@ -2,8 +2,8 @@ PYSRC = src # Python checkformat: - isort --check-only ${PYSRC} - black --check ${PYSRC} + isort --check-only ${PYSRC}/vlagents + black --check ${PYSRC}/vlagents format: isort ${PYSRC}
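
The patches above register the new `Libero` wrapper under the suite names `libero_10`, `libero_90`, `libero_100`, `libero_spatial`, `libero_object`, and `libero_goal`, but `src/tests/test_libero.py` only exercises it through the remote-agent server. Below is a minimal sketch of driving the wrapper directly, which can be handy for checking the camera and gripper conventions in isolation. It assumes LIBERO is installed, that `Act` can be imported from `vlagents.evaluator_envs` (the module itself uses it), and that `Act` accepts the raw action array as its `action` field; none of these details are spelled out verbatim in the patches, and the zero-action loop below stands in for a real policy.

```python
# Sketch only: drive the Libero evaluator env from patches 2-4 without the agent server.
# Assumptions (not shown in the patches): Act is importable from vlagents.evaluator_envs
# and takes the action array as its `action` field; the open-gripper zero action is a
# placeholder policy, not part of the patch series.
import numpy as np

from vlagents.evaluator_envs import Act, EvaluatorEnv


def main() -> None:
    # "libero_goal" is one of the registered suite names; task_id, control_mode and the
    # remaining kwargs mirror the env_kwargs used in src/tests/test_libero.py.
    env = EvaluatorEnv.make(
        "libero_goal",
        seed=42,
        task_id=0,
        control_mode="relative",
        controller="OSC_POSE",
        camera_heights=256,
        camera_widths=256,
    )
    obs, _ = env.reset()
    print("instruction:", env.language_instruction)
    print("cameras:", {name: img.shape for name, img in obs.cameras.items()})

    for _ in range(50):
        # 7-DoF OSC_POSE action; the last entry uses the vlagents gripper convention
        # (1 = open, 0 = closed), which Libero.step remaps to LIBERO's (-1, 1) range.
        action = np.zeros(7, dtype=np.float32)
        action[-1] = 1.0  # keep the gripper open
        obs, reward, success, done, info = env.step(Act(action=action))
        if success or done:
            break


if __name__ == "__main__":
    main()
```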
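
Patch 3 also makes the evaluation loop callable from Python through `_run_eval`, and patch 4 moves the seed, shared-memory, and JPEG options into `EvalConfig`; wandb logging is skipped entirely when no project and entity are given. The sketch below shows how a single evaluation might be launched programmatically under those patches. The agent name, `checkpoint_path` kwarg, and output path are illustrative placeholders rather than values defined by the patches.

```python
# Sketch only: programmatic use of _run_eval after patches 3-4. Import paths follow
# src/tests/test_libero.py; agent_name and agent_kwargs below are placeholders.
from vlagents.__main__ import _run_eval
from vlagents.evaluator_envs import AgentConfig, EvalConfig

eval_cfgs = [
    EvalConfig(
        env_id="libero_object",
        env_kwargs={"controller": "OSC_POSE", "camera_heights": 256, "camera_widths": 256, "task_id": 3},
        max_steps_per_episode=300,
        seed=7,               # evaluation seed, added in patch 4
        same_machine=True,    # client and server share the machine (patch 4)
        jpeg_encoding=False,  # send raw images instead of JPEG (patch 4)
    )
]
agent_cfg = AgentConfig(
    host="localhost",
    port=8080,
    agent_name="openvla",                               # placeholder policy name
    agent_kwargs={"checkpoint_path": "/path/to/ckpt"},  # placeholder kwargs
    python_path="python",
)

# No wandb_project/wandb_entity is passed, so results only go to output_path.
_run_eval(
    output_path="eval_output",
    eval_cfgs=eval_cfgs,
    agent_cfg=agent_cfg,
    episodes=50,
    n_processes=1,
    n_gpus=1,
)
```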