From cf02a76c3ba101d1e99304bc7f3b8e22968cf04c Mon Sep 17 00:00:00 2001 From: wangyukai Date: Wed, 21 Jan 2026 18:17:49 +0800 Subject: [PATCH 1/2] add vis debug in habitat --- .../vln/habitat_vln_evaluator.py | 55 +++++++++++++++++++ .../eval/configs/habitat_dual_system_cfg.py | 2 + scripts/eval/configs/habitat_s2_cfg.py | 2 + 3 files changed, 59 insertions(+) diff --git a/internnav/habitat_extensions/vln/habitat_vln_evaluator.py b/internnav/habitat_extensions/vln/habitat_vln_evaluator.py index e31d45ee..67d6e264 100644 --- a/internnav/habitat_extensions/vln/habitat_vln_evaluator.py +++ b/internnav/habitat_extensions/vln/habitat_vln_evaluator.py @@ -13,6 +13,7 @@ import cv2 import habitat +import imageio import numpy as np import quaternion import torch @@ -105,6 +106,10 @@ def __init__(self, cfg: EvalCfg): # ------------------------------------- model ------------------------------------------ self.model_args = argparse.Namespace(**cfg.agent.model_settings) + self.vis_debug = bool(getattr(self.model_args, "vis_debug", False)) + self.vis_debug_path = getattr( + self.model_args, "vis_debug_path", os.path.join(self.output_path, "vis_debug") + ) processor = AutoProcessor.from_pretrained(self.model_args.model_path) processor.tokenizer.padding_side = 'left' @@ -288,9 +293,17 @@ def _run_eval_dual_system(self) -> tuple: vis_frames = [] step_id = 0 + vis_writer = None if self.save_video: os.makedirs(os.path.join(self.output_path, f'vis_{self.epoch}', f'{scene_id}'), exist_ok=True) + if self.vis_debug: + debug_dir = os.path.join(self.vis_debug_path, f'epoch_{self.epoch}') + os.makedirs(debug_dir, exist_ok=True) + vis_writer = imageio.get_writer( + os.path.join(debug_dir, f'{scene_id}_{episode_id:04d}.mp4'), + fps=5, + ) rgb_list = [] action_seq = [] @@ -526,6 +539,21 @@ def _run_eval_dual_system(self) -> tuple: print("step_id", step_id, "action", action) + if vis_writer is not None: + vis = np.asarray(save_raw_image).copy() + vis = cv2.putText( + vis, + f"step {step_id} action {int(action)}", + (20, 40), + cv2.FONT_HERSHEY_SIMPLEX, + 1, + (0, 255, 0), + 2, + ) + if pixel_goal is not None: + cv2.circle(vis, (pixel_goal[0], pixel_goal[1]), radius=8, color=(255, 0, 0), thickness=-1) + vis_writer.append_data(vis) + if action == action_code.LOOKDOWN: self.env.step(action) observations, _, done, _ = self.env.step(action) @@ -586,6 +614,8 @@ def _run_eval_dual_system(self) -> tuple: quality=9, ) vis_frames.clear() + if vis_writer is not None: + vis_writer.close() self.env.close() @@ -643,9 +673,17 @@ def _run_eval_system2(self) -> tuple: vis_frames = [] step_id = 0 + vis_writer = None if self.save_video: os.makedirs(os.path.join(self.output_path, f'vis_{self.epoch}', f'{scene_id}'), exist_ok=True) + if self.vis_debug: + debug_dir = os.path.join(self.vis_debug_path, f'epoch_{self.epoch}') + os.makedirs(debug_dir, exist_ok=True) + vis_writer = imageio.get_writer( + os.path.join(debug_dir, f'{scene_id}_{episode_id:04d}.mp4'), + fps=5, + ) initial_height = self.env._env.sim.get_agent_state().position[1] rgb_list = [] @@ -818,6 +856,21 @@ def _run_eval_system2(self) -> tuple: print("step_id", step_id, "action", action) + if vis_writer is not None: + vis = np.asarray(save_raw_image).copy() + vis = cv2.putText( + vis, + f"step {step_id} action {int(action)}", + (20, 40), + cv2.FONT_HERSHEY_SIMPLEX, + 1, + (0, 255, 0), + 2, + ) + if goal is not None: + cv2.circle(vis, (pixel_goal[0], pixel_goal[1]), radius=8, color=(255, 0, 0), thickness=-1) + vis_writer.append_data(vis) + if action == action_code.LOOKDOWN: self.env.step(action) observations, _, done, _ = self.env.step(action) @@ -875,6 +928,8 @@ def _run_eval_system2(self) -> tuple: quality=9, ) vis_frames.clear() + if vis_writer is not None: + vis_writer.close() self.env.close() diff --git a/scripts/eval/configs/habitat_dual_system_cfg.py b/scripts/eval/configs/habitat_dual_system_cfg.py index 8f967061..20651e8f 100644 --- a/scripts/eval/configs/habitat_dual_system_cfg.py +++ b/scripts/eval/configs/habitat_dual_system_cfg.py @@ -11,6 +11,8 @@ "resize_w": 384, # image resize width "resize_h": 384, # image resize height "max_new_tokens": 1024, # maximum number of tokens for generation + "vis_debug": True, # If vis_debug=True, save debug videos per episode + "vis_debug_path": "./logs/habitat/vis_debug", }, ), env=EnvCfg( diff --git a/scripts/eval/configs/habitat_s2_cfg.py b/scripts/eval/configs/habitat_s2_cfg.py index ddb5b72b..b8cdb3be 100644 --- a/scripts/eval/configs/habitat_s2_cfg.py +++ b/scripts/eval/configs/habitat_s2_cfg.py @@ -11,6 +11,8 @@ "resize_w": 384, # image resize width "resize_h": 384, # image resize height "max_new_tokens": 1024, # maximum number of tokens for generation + "vis_debug": False, # If vis_debug=True, save debug videos per episode + "vis_debug_path": "./logs/habitat/vis_debug", }, ), env=EnvCfg( From 8b5de79cf7c87e2bfad508d67ab2db35b3a041a8 Mon Sep 17 00:00:00 2001 From: wangyukai Date: Thu, 22 Jan 2026 13:10:40 +0800 Subject: [PATCH 2/2] fix draw pixel bug --- .../habitat_extensions/vln/habitat_vln_evaluator.py | 11 +++++++++-- 1 file changed, 9 insertions(+), 2 deletions(-) diff --git a/internnav/habitat_extensions/vln/habitat_vln_evaluator.py b/internnav/habitat_extensions/vln/habitat_vln_evaluator.py index 67d6e264..966bebf5 100644 --- a/internnav/habitat_extensions/vln/habitat_vln_evaluator.py +++ b/internnav/habitat_extensions/vln/habitat_vln_evaluator.py @@ -320,6 +320,7 @@ def _run_eval_dual_system(self) -> tuple: # ---------- 2. Episode step loop ----------- while (not done) and (step_id <= self.max_steps_per_episode): + draw_pixel_goal = False # refactor agent get action rgb = observations["rgb"] depth = observations["depth"] @@ -435,6 +436,7 @@ def _run_eval_dual_system(self) -> tuple: coord = [int(c) for c in re.findall(r'\d+', llm_outputs)] pixel_goal = [int(coord[1]), int(coord[0])] + draw_pixel_goal = True # look down --> horizontal self.env.step(action_code.LOOKUP) @@ -551,7 +553,10 @@ def _run_eval_dual_system(self) -> tuple: 2, ) if pixel_goal is not None: - cv2.circle(vis, (pixel_goal[0], pixel_goal[1]), radius=8, color=(255, 0, 0), thickness=-1) + if draw_pixel_goal: + cv2.circle( + vis, (pixel_goal[0], pixel_goal[1]), radius=8, color=(255, 0, 0), thickness=-1 + ) vis_writer.append_data(vis) if action == action_code.LOOKDOWN: @@ -700,6 +705,7 @@ def _run_eval_system2(self) -> tuple: # ---------- 2. Episode step loop ----------- while (not done) and (step_id <= self.max_steps_per_episode): + draw_pixel_goal = False # refactor agent get action rgb = observations["rgb"] depth = observations["depth"] @@ -793,6 +799,7 @@ def _run_eval_system2(self) -> tuple: coord = [int(c) for c in re.findall(r'\d+', llm_outputs)] pixel_goal = [int(coord[1]), int(coord[0])] + draw_pixel_goal = True # look down --> horizontal self.env.step(action_code.LOOKUP) @@ -867,7 +874,7 @@ def _run_eval_system2(self) -> tuple: (0, 255, 0), 2, ) - if goal is not None: + if draw_pixel_goal: cv2.circle(vis, (pixel_goal[0], pixel_goal[1]), radius=8, color=(255, 0, 0), thickness=-1) vis_writer.append_data(vis)