From cf02a76c3ba101d1e99304bc7f3b8e22968cf04c Mon Sep 17 00:00:00 2001
From: wangyukai <wangyukai@pjlab.org.cn>
Date: Wed, 21 Jan 2026 18:17:49 +0800
Subject: [PATCH 1/2] Add optional per-episode debug video (vis_debug) to Habitat VLN evaluator

---
 .../vln/habitat_vln_evaluator.py              | 55 +++++++++++++++++++
 .../eval/configs/habitat_dual_system_cfg.py   |  2 +
 scripts/eval/configs/habitat_s2_cfg.py        |  2 +
 3 files changed, 59 insertions(+)

diff --git a/internnav/habitat_extensions/vln/habitat_vln_evaluator.py b/internnav/habitat_extensions/vln/habitat_vln_evaluator.py
index e31d45ee..67d6e264 100644
--- a/internnav/habitat_extensions/vln/habitat_vln_evaluator.py
+++ b/internnav/habitat_extensions/vln/habitat_vln_evaluator.py
@@ -13,6 +13,7 @@
 
 import cv2
 import habitat
+import imageio
 import numpy as np
 import quaternion
 import torch
@@ -105,6 +106,10 @@ def __init__(self, cfg: EvalCfg):
 
         # ------------------------------------- model ------------------------------------------
         self.model_args = argparse.Namespace(**cfg.agent.model_settings)
+        self.vis_debug = bool(getattr(self.model_args, "vis_debug", False))
+        self.vis_debug_path = getattr(
+            self.model_args, "vis_debug_path", os.path.join(self.output_path, "vis_debug")
+        )
 
         processor = AutoProcessor.from_pretrained(self.model_args.model_path)
         processor.tokenizer.padding_side = 'left'
@@ -288,9 +293,17 @@ def _run_eval_dual_system(self) -> tuple:
 
             vis_frames = []
             step_id = 0
+            vis_writer = None
 
             if self.save_video:
                 os.makedirs(os.path.join(self.output_path, f'vis_{self.epoch}', f'{scene_id}'), exist_ok=True)
+            if self.vis_debug:
+                debug_dir = os.path.join(self.vis_debug_path, f'epoch_{self.epoch}')
+                os.makedirs(debug_dir, exist_ok=True)
+                vis_writer = imageio.get_writer(
+                    os.path.join(debug_dir, f'{scene_id}_{episode_id:04d}.mp4'),
+                    fps=5,
+                )
 
             rgb_list = []
             action_seq = []
@@ -526,6 +539,21 @@ def _run_eval_dual_system(self) -> tuple:
 
                 print("step_id", step_id, "action", action)
 
+                if vis_writer is not None:
+                    vis = np.asarray(save_raw_image).copy()
+                    vis = cv2.putText(
+                        vis,
+                        f"step {step_id} action {int(action)}",
+                        (20, 40),
+                        cv2.FONT_HERSHEY_SIMPLEX,
+                        1,
+                        (0, 255, 0),
+                        2,
+                    )
+                    if pixel_goal is not None:
+                        cv2.circle(vis, (pixel_goal[0], pixel_goal[1]), radius=8, color=(255, 0, 0), thickness=-1)
+                    vis_writer.append_data(vis)
+
                 if action == action_code.LOOKDOWN:
                     self.env.step(action)
                     observations, _, done, _ = self.env.step(action)
@@ -586,6 +614,8 @@ def _run_eval_dual_system(self) -> tuple:
                     quality=9,
                 )
             vis_frames.clear()
+            if vis_writer is not None:
+                vis_writer.close()
 
         self.env.close()
 
@@ -643,9 +673,17 @@ def _run_eval_system2(self) -> tuple:
 
             vis_frames = []
             step_id = 0
+            vis_writer = None
 
             if self.save_video:
                 os.makedirs(os.path.join(self.output_path, f'vis_{self.epoch}', f'{scene_id}'), exist_ok=True)
+            if self.vis_debug:
+                debug_dir = os.path.join(self.vis_debug_path, f'epoch_{self.epoch}')
+                os.makedirs(debug_dir, exist_ok=True)
+                vis_writer = imageio.get_writer(
+                    os.path.join(debug_dir, f'{scene_id}_{episode_id:04d}.mp4'),
+                    fps=5,
+                )
             initial_height = self.env._env.sim.get_agent_state().position[1]
 
             rgb_list = []
@@ -818,6 +856,21 @@ def _run_eval_system2(self) -> tuple:
 
                 print("step_id", step_id, "action", action)
 
+                if vis_writer is not None:
+                    vis = np.asarray(save_raw_image).copy()
+                    vis = cv2.putText(
+                        vis,
+                        f"step {step_id} action {int(action)}",
+                        (20, 40),
+                        cv2.FONT_HERSHEY_SIMPLEX,
+                        1,
+                        (0, 255, 0),
+                        2,
+                    )
+                    if goal is not None:
+                        cv2.circle(vis, (pixel_goal[0], pixel_goal[1]), radius=8, color=(255, 0, 0), thickness=-1)
+                    vis_writer.append_data(vis)
+
                 if action == action_code.LOOKDOWN:
                     self.env.step(action)
                     observations, _, done, _ = self.env.step(action)
@@ -875,6 +928,8 @@ def _run_eval_system2(self) -> tuple:
                     quality=9,
                 )
             vis_frames.clear()
+            if vis_writer is not None:
+                vis_writer.close()
 
         self.env.close()
 
diff --git a/scripts/eval/configs/habitat_dual_system_cfg.py b/scripts/eval/configs/habitat_dual_system_cfg.py
index 8f967061..20651e8f 100644
--- a/scripts/eval/configs/habitat_dual_system_cfg.py
+++ b/scripts/eval/configs/habitat_dual_system_cfg.py
@@ -11,6 +11,8 @@
             "resize_w": 384,  # image resize width
             "resize_h": 384,  # image resize height
             "max_new_tokens": 1024,  # maximum number of tokens for generation
+            "vis_debug": True,  # If vis_debug=True, save debug videos per episode
+            "vis_debug_path": "./logs/habitat/vis_debug",
         },
     ),
     env=EnvCfg(
diff --git a/scripts/eval/configs/habitat_s2_cfg.py b/scripts/eval/configs/habitat_s2_cfg.py
index ddb5b72b..b8cdb3be 100644
--- a/scripts/eval/configs/habitat_s2_cfg.py
+++ b/scripts/eval/configs/habitat_s2_cfg.py
@@ -11,6 +11,8 @@
             "resize_w": 384,  # image resize width
             "resize_h": 384,  # image resize height
             "max_new_tokens": 1024,  # maximum number of tokens for generation
+            "vis_debug": False,  # If vis_debug=True, save debug videos per episode
+            "vis_debug_path": "./logs/habitat/vis_debug",
         },
     ),
     env=EnvCfg(

From 8b5de79cf7c87e2bfad508d67ab2db35b3a041a8 Mon Sep 17 00:00:00 2001
From: wangyukai <wangyukai@pjlab.org.cn>
Date: Thu, 22 Jan 2026 13:10:40 +0800
Subject: [PATCH 2/2] Fix vis_debug: draw pixel goal only on steps that predict one

---
 .../habitat_extensions/vln/habitat_vln_evaluator.py   | 11 +++++++++--
 1 file changed, 9 insertions(+), 2 deletions(-)

diff --git a/internnav/habitat_extensions/vln/habitat_vln_evaluator.py b/internnav/habitat_extensions/vln/habitat_vln_evaluator.py
index 67d6e264..966bebf5 100644
--- a/internnav/habitat_extensions/vln/habitat_vln_evaluator.py
+++ b/internnav/habitat_extensions/vln/habitat_vln_evaluator.py
@@ -320,6 +320,7 @@ def _run_eval_dual_system(self) -> tuple:
 
             # ---------- 2. Episode step loop -----------
             while (not done) and (step_id <= self.max_steps_per_episode):
+                draw_pixel_goal = False
                 # refactor agent get action
                 rgb = observations["rgb"]
                 depth = observations["depth"]
@@ -435,6 +436,7 @@ def _run_eval_dual_system(self) -> tuple:
                         coord = [int(c) for c in re.findall(r'\d+', llm_outputs)]
 
                         pixel_goal = [int(coord[1]), int(coord[0])]
+                        draw_pixel_goal = True
 
                         # look down --> horizontal
                         self.env.step(action_code.LOOKUP)
@@ -551,7 +553,10 @@ def _run_eval_dual_system(self) -> tuple:
                         2,
                     )
                     if pixel_goal is not None:
-                        cv2.circle(vis, (pixel_goal[0], pixel_goal[1]), radius=8, color=(255, 0, 0), thickness=-1)
+                        if draw_pixel_goal:
+                            cv2.circle(
+                                vis, (pixel_goal[0], pixel_goal[1]), radius=8, color=(255, 0, 0), thickness=-1
+                            )
                     vis_writer.append_data(vis)
 
                 if action == action_code.LOOKDOWN:
@@ -700,6 +705,7 @@ def _run_eval_system2(self) -> tuple:
 
             # ---------- 2. Episode step loop -----------
             while (not done) and (step_id <= self.max_steps_per_episode):
+                draw_pixel_goal = False
                 # refactor agent get action
                 rgb = observations["rgb"]
                 depth = observations["depth"]
@@ -793,6 +799,7 @@ def _run_eval_system2(self) -> tuple:
                         coord = [int(c) for c in re.findall(r'\d+', llm_outputs)]
 
                         pixel_goal = [int(coord[1]), int(coord[0])]
+                        draw_pixel_goal = True
 
                         # look down --> horizontal
                         self.env.step(action_code.LOOKUP)
@@ -867,7 +874,7 @@ def _run_eval_system2(self) -> tuple:
                         (0, 255, 0),
                         2,
                     )
-                    if goal is not None:
+                    if draw_pixel_goal:
                         cv2.circle(vis, (pixel_goal[0], pixel_goal[1]), radius=8, color=(255, 0, 0), thickness=-1)
                     vis_writer.append_data(vis)