InternRobotics · kew6688 · Jan 21, 2026 · Jan 22, 2026
diff --git a/internnav/habitat_extensions/vln/habitat_vln_evaluator.py b/internnav/habitat_extensions/vln/habitat_vln_evaluator.py
@@ -13,6 +13,7 @@
 
 import cv2
 import habitat
+import imageio
 import numpy as np
 import quaternion
 import torch
@@ -105,6 +106,10 @@ def __init__(self, cfg: EvalCfg):
 
         # ------------------------------------- model ------------------------------------------
         self.model_args = argparse.Namespace(**cfg.agent.model_settings)
+        self.vis_debug = bool(getattr(self.model_args, "vis_debug", False))
+        self.vis_debug_path = getattr(
+            self.model_args, "vis_debug_path", os.path.join(self.output_path, "vis_debug")
+        )
 
         processor = AutoProcessor.from_pretrained(self.model_args.model_path)
         processor.tokenizer.padding_side = 'left'
@@ -288,9 +293,17 @@ def _run_eval_dual_system(self) -> tuple:
 
             vis_frames = []
             step_id = 0
+            vis_writer = None
 
             if self.save_video:
                 os.makedirs(os.path.join(self.output_path, f'vis_{self.epoch}', f'{scene_id}'), exist_ok=True)
+            if self.vis_debug:
+                debug_dir = os.path.join(self.vis_debug_path, f'epoch_{self.epoch}')
+                os.makedirs(debug_dir, exist_ok=True)
+                vis_writer = imageio.get_writer(
+                    os.path.join(debug_dir, f'{scene_id}_{episode_id:04d}.mp4'),
+                    fps=5,
+                )
 
             rgb_list = []
             action_seq = []
@@ -307,6 +320,7 @@ def _run_eval_dual_system(self) -> tuple:
 
             # ---------- 2. Episode step loop -----------
             while (not done) and (step_id <= self.max_steps_per_episode):
+                draw_pixel_goal = False
                 # refactor agent get action
                 rgb = observations["rgb"]
                 depth = observations["depth"]
@@ -422,6 +436,7 @@ def _run_eval_dual_system(self) -> tuple:
                         coord = [int(c) for c in re.findall(r'\d+', llm_outputs)]
 
                         pixel_goal = [int(coord[1]), int(coord[0])]
+                        draw_pixel_goal = True
 
                         # look down --> horizontal
                         self.env.step(action_code.LOOKUP)
@@ -526,6 +541,24 @@ def _run_eval_dual_system(self) -> tuple:
 
                 print("step_id", step_id, "action", action)
 
+                if vis_writer is not None:
+                    vis = np.asarray(save_raw_image).copy()
+                    vis = cv2.putText(
+                        vis,
+                        f"step {step_id} action {int(action)}",
+                        (20, 40),
+                        cv2.FONT_HERSHEY_SIMPLEX,
+                        1,
+                        (0, 255, 0),
+                        2,
+                    )
+                    if pixel_goal is not None:
+                        if draw_pixel_goal:
+                            cv2.circle(
+                                vis, (pixel_goal[0], pixel_goal[1]), radius=8, color=(255, 0, 0), thickness=-1
+                            )
+                    vis_writer.append_data(vis)
+
                 if action == action_code.LOOKDOWN:
                     self.env.step(action)
                     observations, _, done, _ = self.env.step(action)
@@ -586,6 +619,8 @@ def _run_eval_dual_system(self) -> tuple:
                     quality=9,
                 )
             vis_frames.clear()
+            if vis_writer is not None:
+                vis_writer.close()
 
         self.env.close()
 
@@ -643,9 +678,17 @@ def _run_eval_system2(self) -> tuple:
 
             vis_frames = []
             step_id = 0
+            vis_writer = None
 
             if self.save_video:
                 os.makedirs(os.path.join(self.output_path, f'vis_{self.epoch}', f'{scene_id}'), exist_ok=True)
+            if self.vis_debug:
+                debug_dir = os.path.join(self.vis_debug_path, f'epoch_{self.epoch}')
+                os.makedirs(debug_dir, exist_ok=True)
+                vis_writer = imageio.get_writer(
+                    os.path.join(debug_dir, f'{scene_id}_{episode_id:04d}.mp4'),
+                    fps=5,
+                )
             initial_height = self.env._env.sim.get_agent_state().position[1]
 
             rgb_list = []
@@ -662,6 +705,7 @@ def _run_eval_system2(self) -> tuple:
 
             # ---------- 2. Episode step loop -----------
             while (not done) and (step_id <= self.max_steps_per_episode):
+                draw_pixel_goal = False
                 # refactor agent get action
                 rgb = observations["rgb"]
                 depth = observations["depth"]
@@ -755,6 +799,7 @@ def _run_eval_system2(self) -> tuple:
                         coord = [int(c) for c in re.findall(r'\d+', llm_outputs)]
 
                         pixel_goal = [int(coord[1]), int(coord[0])]
+                        draw_pixel_goal = True
 
                         # look down --> horizontal
                         self.env.step(action_code.LOOKUP)
@@ -818,6 +863,21 @@ def _run_eval_system2(self) -> tuple:
 
                 print("step_id", step_id, "action", action)
 
+                if vis_writer is not None:
+                    vis = np.asarray(save_raw_image).copy()
+                    vis = cv2.putText(
+                        vis,
+                        f"step {step_id} action {int(action)}",
+                        (20, 40),
+                        cv2.FONT_HERSHEY_SIMPLEX,
+                        1,
+                        (0, 255, 0),
+                        2,
+                    )
+                    if draw_pixel_goal:
+                        cv2.circle(vis, (pixel_goal[0], pixel_goal[1]), radius=8, color=(255, 0, 0), thickness=-1)
+                    vis_writer.append_data(vis)
+
                 if action == action_code.LOOKDOWN:
                     self.env.step(action)
                     observations, _, done, _ = self.env.step(action)
@@ -875,6 +935,8 @@ def _run_eval_system2(self) -> tuple:
                     quality=9,
                 )
             vis_frames.clear()
+            if vis_writer is not None:
+                vis_writer.close()
 
         self.env.close()
 

diff --git a/scripts/eval/configs/habitat_dual_system_cfg.py b/scripts/eval/configs/habitat_dual_system_cfg.py
@@ -11,6 +11,8 @@
             "resize_w": 384,  # image resize width
             "resize_h": 384,  # image resize height
             "max_new_tokens": 1024,  # maximum number of tokens for generation
+            "vis_debug": True,  # when True, write one debug .mp4 per episode under vis_debug_path/epoch_<epoch>/
+            "vis_debug_path": "./logs/habitat/vis_debug",
         },
     ),
     env=EnvCfg(

diff --git a/scripts/eval/configs/habitat_s2_cfg.py b/scripts/eval/configs/habitat_s2_cfg.py
@@ -11,6 +11,8 @@
             "resize_w": 384,  # image resize width
             "resize_h": 384,  # image resize height
             "max_new_tokens": 1024,  # maximum number of tokens for generation
+            "vis_debug": False,  # when True, write one debug .mp4 per episode under vis_debug_path/epoch_<epoch>/
+            "vis_debug_path": "./logs/habitat/vis_debug",
         },
     ),
     env=EnvCfg(