Skip to content
Draft
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
62 changes: 62 additions & 0 deletions internnav/habitat_extensions/vln/habitat_vln_evaluator.py
Original file line number Diff line number Diff line change
Expand Up @@ -13,6 +13,7 @@

import cv2
import habitat
import imageio
import numpy as np
import quaternion
import torch
Expand Down Expand Up @@ -105,6 +106,10 @@ def __init__(self, cfg: EvalCfg):

# ------------------------------------- model ------------------------------------------
self.model_args = argparse.Namespace(**cfg.agent.model_settings)
self.vis_debug = bool(getattr(self.model_args, "vis_debug", False))
self.vis_debug_path = getattr(
self.model_args, "vis_debug_path", os.path.join(self.output_path, "vis_debug")
)

processor = AutoProcessor.from_pretrained(self.model_args.model_path)
processor.tokenizer.padding_side = 'left'
Expand Down Expand Up @@ -288,9 +293,17 @@ def _run_eval_dual_system(self) -> tuple:

vis_frames = []
step_id = 0
vis_writer = None

if self.save_video:
os.makedirs(os.path.join(self.output_path, f'vis_{self.epoch}', f'{scene_id}'), exist_ok=True)
if self.vis_debug:
debug_dir = os.path.join(self.vis_debug_path, f'epoch_{self.epoch}')
os.makedirs(debug_dir, exist_ok=True)
vis_writer = imageio.get_writer(
os.path.join(debug_dir, f'{scene_id}_{episode_id:04d}.mp4'),
fps=5,
)

rgb_list = []
action_seq = []
Expand All @@ -307,6 +320,7 @@ def _run_eval_dual_system(self) -> tuple:

# ---------- 2. Episode step loop -----------
while (not done) and (step_id <= self.max_steps_per_episode):
draw_pixel_goal = False
# refactor agent get action
rgb = observations["rgb"]
depth = observations["depth"]
Expand Down Expand Up @@ -422,6 +436,7 @@ def _run_eval_dual_system(self) -> tuple:
coord = [int(c) for c in re.findall(r'\d+', llm_outputs)]

pixel_goal = [int(coord[1]), int(coord[0])]
draw_pixel_goal = True

# look down --> horizontal
self.env.step(action_code.LOOKUP)
Expand Down Expand Up @@ -526,6 +541,24 @@ def _run_eval_dual_system(self) -> tuple:

print("step_id", step_id, "action", action)

if vis_writer is not None:
vis = np.asarray(save_raw_image).copy()
vis = cv2.putText(
vis,
f"step {step_id} action {int(action)}",
(20, 40),
cv2.FONT_HERSHEY_SIMPLEX,
1,
(0, 255, 0),
2,
)
if pixel_goal is not None:
if draw_pixel_goal:
cv2.circle(
vis, (pixel_goal[0], pixel_goal[1]), radius=8, color=(255, 0, 0), thickness=-1
)
vis_writer.append_data(vis)

if action == action_code.LOOKDOWN:
self.env.step(action)
observations, _, done, _ = self.env.step(action)
Expand Down Expand Up @@ -586,6 +619,8 @@ def _run_eval_dual_system(self) -> tuple:
quality=9,
)
vis_frames.clear()
if vis_writer is not None:
vis_writer.close()

self.env.close()

Expand Down Expand Up @@ -643,9 +678,17 @@ def _run_eval_system2(self) -> tuple:

vis_frames = []
step_id = 0
vis_writer = None

if self.save_video:
os.makedirs(os.path.join(self.output_path, f'vis_{self.epoch}', f'{scene_id}'), exist_ok=True)
if self.vis_debug:
debug_dir = os.path.join(self.vis_debug_path, f'epoch_{self.epoch}')
os.makedirs(debug_dir, exist_ok=True)
vis_writer = imageio.get_writer(
os.path.join(debug_dir, f'{scene_id}_{episode_id:04d}.mp4'),
fps=5,
)
initial_height = self.env._env.sim.get_agent_state().position[1]

rgb_list = []
Expand All @@ -662,6 +705,7 @@ def _run_eval_system2(self) -> tuple:

# ---------- 2. Episode step loop -----------
while (not done) and (step_id <= self.max_steps_per_episode):
draw_pixel_goal = False
# refactor agent get action
rgb = observations["rgb"]
depth = observations["depth"]
Expand Down Expand Up @@ -755,6 +799,7 @@ def _run_eval_system2(self) -> tuple:
coord = [int(c) for c in re.findall(r'\d+', llm_outputs)]

pixel_goal = [int(coord[1]), int(coord[0])]
draw_pixel_goal = True

# look down --> horizontal
self.env.step(action_code.LOOKUP)
Expand Down Expand Up @@ -818,6 +863,21 @@ def _run_eval_system2(self) -> tuple:

print("step_id", step_id, "action", action)

if vis_writer is not None:
vis = np.asarray(save_raw_image).copy()
vis = cv2.putText(
vis,
f"step {step_id} action {int(action)}",
(20, 40),
cv2.FONT_HERSHEY_SIMPLEX,
1,
(0, 255, 0),
2,
)
if draw_pixel_goal:
cv2.circle(vis, (pixel_goal[0], pixel_goal[1]), radius=8, color=(255, 0, 0), thickness=-1)
vis_writer.append_data(vis)

if action == action_code.LOOKDOWN:
self.env.step(action)
observations, _, done, _ = self.env.step(action)
Expand Down Expand Up @@ -875,6 +935,8 @@ def _run_eval_system2(self) -> tuple:
quality=9,
)
vis_frames.clear()
if vis_writer is not None:
vis_writer.close()

self.env.close()

Expand Down
2 changes: 2 additions & 0 deletions scripts/eval/configs/habitat_dual_system_cfg.py
Original file line number Diff line number Diff line change
Expand Up @@ -11,6 +11,8 @@
"resize_w": 384, # image resize width
"resize_h": 384, # image resize height
"max_new_tokens": 1024, # maximum number of tokens for generation
"vis_debug": True, # If vis_debug=True, save debug videos per episode
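"vis_debug_path": "./logs/habitat/vis_debug",  # root directory for the debug videos; an epoch_<N>/ subfolder is created inside it (only used when vis_debug is True)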
},
),
env=EnvCfg(
Expand Down
2 changes: 2 additions & 0 deletions scripts/eval/configs/habitat_s2_cfg.py
Original file line number Diff line number Diff line change
Expand Up @@ -11,6 +11,8 @@
"resize_w": 384, # image resize width
"resize_h": 384, # image resize height
"max_new_tokens": 1024, # maximum number of tokens for generation
"vis_debug": False, # If vis_debug=True, save debug videos per episode
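"vis_debug_path": "./logs/habitat/vis_debug",  # root directory for the debug videos; an epoch_<N>/ subfolder is created inside it (only used when vis_debug is True)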
},
),
env=EnvCfg(
Expand Down
Loading