diff --git a/docs/source/overview/imitation-learning/humanoids_imitation.rst b/docs/source/overview/imitation-learning/humanoids_imitation.rst
index 67a5f66cdad..60032fd62f6 100644
--- a/docs/source/overview/imitation-learning/humanoids_imitation.rst
+++ b/docs/source/overview/imitation-learning/humanoids_imitation.rst
@@ -565,7 +565,8 @@ To generate the locomanipulation dataset, use the following command:
       --navigate_step 130 \
       --enable_pinocchio \
       --output_file ./datasets/generated_dataset_g1_locomanipulation_sdg.hdf5 \
-      --enable_cameras
+      --enable_cameras \
+      --visualizer kit
 
 .. note::
 
diff --git a/scripts/imitation_learning/locomanipulation_sdg/generate_data.py b/scripts/imitation_learning/locomanipulation_sdg/generate_data.py
index f00493bc0ae..0a2169745cc 100644
--- a/scripts/imitation_learning/locomanipulation_sdg/generate_data.py
+++ b/scripts/imitation_learning/locomanipulation_sdg/generate_data.py
@@ -119,6 +119,7 @@ import isaaclab.sim as sim_utils
 from isaaclab.utils import configclass
 from isaaclab.utils.datasets import EpisodeData, HDF5DatasetFileHandler
+from isaaclab.utils.math import convert_quat
 
 import isaaclab_mimic.locomanipulation_sdg.envs  # noqa: F401
 from isaaclab_mimic.locomanipulation_sdg.data_classes import LocomanipulationSDGOutputData
@@ -285,7 +286,8 @@ def setup_navigation_scene(
         parent=env.get_end_fixture(),
     )
     base_goal_approach = RelativePose(
-        relative_pose=torch.tensor([-approach_distance, 0.0, 0.0, 1.0, 0.0, 0.0, 0.0]), parent=base_goal
+        relative_pose=torch.tensor([-approach_distance, 0.0, 0.0, 0.0, 0.0, 0.0, 1.0], device=env.device),
+        parent=base_goal,
     )
 
     # Plan navigation path
@@ -321,7 +323,7 @@ def handle_grasp_state(
     # Set control targets - robot stays stationary during grasping
     output_data.data_generation_state = int(LocomanipulationSDGDataGenerationState.GRASP_OBJECT)
     output_data.recording_step = recording_step
-    output_data.base_velocity_target = torch.tensor([0.0, 0.0, 0.0])
+    output_data.base_velocity_target = torch.tensor([0.0, 0.0, 0.0], device=env.device)
 
     # Transform hand poses relative to object
     output_data.left_hand_pose_target = transform_relative_pose(
@@ -369,7 +371,7 @@ def handle_lift_state(
     # Set control targets - robot stays stationary during lifting
     output_data.data_generation_state = int(LocomanipulationSDGDataGenerationState.LIFT_OBJECT)
     output_data.recording_step = recording_step
-    output_data.base_velocity_target = torch.tensor([0.0, 0.0, 0.0])
+    output_data.base_velocity_target = torch.tensor([0.0, 0.0, 0.0], device=env.device)
 
     # Transform hand poses relative to base
     output_data.left_hand_pose_target = transform_relative_pose(
@@ -428,7 +430,7 @@ def handle_navigate_state(
     # Set control targets
     output_data.data_generation_state = int(LocomanipulationSDGDataGenerationState.NAVIGATE)
     output_data.recording_step = recording_step
-    output_data.base_velocity_target = torch.tensor([linear_velocity, 0.0, angular_velocity])
+    output_data.base_velocity_target = torch.tensor([linear_velocity, 0.0, angular_velocity], device=env.device)
 
     # Transform hand poses relative to base
     output_data.left_hand_pose_target = transform_relative_pose(
@@ -482,7 +484,7 @@ def handle_approach_state(
     # Set control targets
     output_data.data_generation_state = int(LocomanipulationSDGDataGenerationState.APPROACH)
     output_data.recording_step = recording_step
-    output_data.base_velocity_target = torch.tensor([linear_velocity, 0.0, angular_velocity])
+    output_data.base_velocity_target = torch.tensor([linear_velocity, 0.0, angular_velocity], device=env.device)
 
     # Transform hand poses relative to base
     output_data.left_hand_pose_target = transform_relative_pose(
@@ -543,7 +545,7 @@ def handle_drop_off_state(
     # Set control targets
     output_data.data_generation_state = int(LocomanipulationSDGDataGenerationState.DROP_OFF_OBJECT)
     output_data.recording_step = recording_step
-    output_data.base_velocity_target = torch.tensor([linear_velocity, 0.0, angular_velocity])
+    output_data.base_velocity_target = torch.tensor([linear_velocity, 0.0, angular_velocity], device=env.device)
 
     # Transform hand poses relative to end fixture
     output_data.left_hand_pose_target = transform_relative_pose(
@@ -640,7 +642,9 @@ def replay(
     """
 
     # Initialize environment to starting state
-    env.reset_to(state=input_episode_data.get_initial_state(), env_ids=torch.tensor([0]), is_relative=True)
+    env.reset_to(
+        state=input_episode_data.get_initial_state(), env_ids=torch.tensor([0], device=env.device), is_relative=True
+    )
 
     # Create navigation control configuration
     config = LocomanipulationSDGControlConfig(
@@ -742,6 +746,7 @@ def replay(
     # Load input data
     input_dataset_file_handler = HDF5DatasetFileHandler()
     input_dataset_file_handler.open(args_cli.dataset)
+    is_legacy_quat_format = input_dataset_file_handler.is_legacy_quaternion_format()
 
     for i in range(args_cli.num_runs):
         if args_cli.demo is None:
@@ -751,6 +756,14 @@ def replay(
 
         input_episode_data = input_dataset_file_handler.load_episode(demo, args_cli.device)
 
+        # Convert action quaternions from legacy WXYZ to XYZW format if needed.
+        # load_episode() auto-converts root_pose state keys, but not action data.
+        # The action format is: [left_pos(3), left_quat(4), right_pos(3), right_quat(4), joints(14)]
+        if is_legacy_quat_format and input_episode_data is not None and "actions" in input_episode_data.data:
+            actions = input_episode_data.data["actions"]
+            actions[:, 3:7] = convert_quat(actions[:, 3:7], to="xyzw")  # left hand quat
+            actions[:, 10:14] = convert_quat(actions[:, 10:14], to="xyzw")  # right hand quat
+
         replay(
             env=env,
             input_episode_data=input_episode_data,
diff --git a/source/isaaclab/isaaclab/scene/interactive_scene.py b/source/isaaclab/isaaclab/scene/interactive_scene.py
index 54fbcc068af..7ccd762eaf5 100644
--- a/source/isaaclab/isaaclab/scene/interactive_scene.py
+++ b/source/isaaclab/isaaclab/scene/interactive_scene.py
@@ -454,15 +454,15 @@ def reset_to(
         for asset_name, articulation in self._articulations.items():
             asset_state = state["articulation"][asset_name]
             # root state
-            root_pose = asset_state["root_pose"].clone()
+            root_pose = asset_state["root_pose"].clone().to(self.device)
             if is_relative:
                 root_pose[:, :3] += self.env_origins[env_ids]
-            root_velocity = asset_state["root_velocity"].clone()
+            root_velocity = asset_state["root_velocity"].clone().to(self.device)
             articulation.write_root_pose_to_sim(root_pose, env_ids=env_ids)
             articulation.write_root_velocity_to_sim(root_velocity, env_ids=env_ids)
             # joint state
-            joint_position = asset_state["joint_position"].clone()
-            joint_velocity = asset_state["joint_velocity"].clone()
+            joint_position = asset_state["joint_position"].clone().to(self.device)
+            joint_velocity = asset_state["joint_velocity"].clone().to(self.device)
             articulation.write_joint_state_to_sim(joint_position, joint_velocity, env_ids=env_ids)
             # FIXME: This is not generic as it assumes PD control over the joints.
             #   This assumption does not hold for effort controlled joints.
@@ -471,19 +471,19 @@ def reset_to(
         # deformable objects
         for asset_name, deformable_object in self._deformable_objects.items():
             asset_state = state["deformable_object"][asset_name]
-            nodal_position = asset_state["nodal_position"].clone()
+            nodal_position = asset_state["nodal_position"].clone().to(self.device)
             if is_relative:
                 nodal_position[:, :3] += self.env_origins[env_ids]
-            nodal_velocity = asset_state["nodal_velocity"].clone()
+            nodal_velocity = asset_state["nodal_velocity"].clone().to(self.device)
             deformable_object.write_nodal_pos_to_sim(nodal_position, env_ids=env_ids)
             deformable_object.write_nodal_velocity_to_sim(nodal_velocity, env_ids=env_ids)
         # rigid objects
         for asset_name, rigid_object in self._rigid_objects.items():
             asset_state = state["rigid_object"][asset_name]
-            root_pose = asset_state["root_pose"].clone()
+            root_pose = asset_state["root_pose"].clone().to(self.device)
             if is_relative:
                 root_pose[:, :3] += self.env_origins[env_ids]
-            root_velocity = asset_state["root_velocity"].clone()
+            root_velocity = asset_state["root_velocity"].clone().to(self.device)
             rigid_object.write_root_pose_to_sim(root_pose, env_ids=env_ids)
             rigid_object.write_root_velocity_to_sim(root_velocity, env_ids=env_ids)
         # surface grippers
diff --git a/source/isaaclab_mimic/isaaclab_mimic/locomanipulation_sdg/envs/g1_locomanipulation_sdg_env.py b/source/isaaclab_mimic/isaaclab_mimic/locomanipulation_sdg/envs/g1_locomanipulation_sdg_env.py
index c2ce6f0dcbe..f25f822288d 100644
--- a/source/isaaclab_mimic/isaaclab_mimic/locomanipulation_sdg/envs/g1_locomanipulation_sdg_env.py
+++ b/source/isaaclab_mimic/isaaclab_mimic/locomanipulation_sdg/envs/g1_locomanipulation_sdg_env.py
@@ -160,16 +160,17 @@ def load_input_data(self, episode_data: EpisodeData, step: int) -> Locomanipulat
 
         object_pose = dataset_state["rigid_object"]["object"]["root_pose"]
 
+        base_pose = episode_data.get_initial_state()["articulation"]["robot"]["root_pose"]
         data = LocomanipulationSDGInputData(
             left_hand_pose_target=dataset_action[0:7],
             right_hand_pose_target=dataset_action[7:14],
             left_hand_joint_positions_target=dataset_action[14:21],
             right_hand_joint_positions_target=dataset_action[21:28],
-            base_pose=episode_data.get_initial_state()["articulation"]["robot"]["root_pose"],
+            base_pose=base_pose,
             object_pose=object_pose,
             fixture_pose=torch.tensor(
-                [0.0, 0.55, -0.3, 1.0, 0.0, 0.0, 0.0]
-            ),  # Table pose is not recorded for this env.
+                [0.0, 0.55, -0.3, 0.0, 0.0, 0.0, 1.0], device=base_pose.device
+            ),  # Table pose is not recorded for this env. Quaternion in XYZW format (identity).
         )
 
         return data
diff --git a/source/isaaclab_mimic/isaaclab_mimic/locomanipulation_sdg/scene_utils.py b/source/isaaclab_mimic/isaaclab_mimic/locomanipulation_sdg/scene_utils.py
index dc9c969171c..38e353f5eb3 100644
--- a/source/isaaclab_mimic/isaaclab_mimic/locomanipulation_sdg/scene_utils.py
+++ b/source/isaaclab_mimic/isaaclab_mimic/locomanipulation_sdg/scene_utils.py
@@ -7,8 +7,10 @@
 
 import numpy as np
 import torch
+import warp as wp
 
 import isaaclab.utils.math as math_utils
+from isaaclab.sim.views import XformPrimView
 
 from .occupancy_map_utils import OccupancyMap, intersect_occupancy_maps
 from .transform_utils import transform_mul
@@ -83,7 +85,8 @@ def __init__(self, scene, entity_name: str, body_name: str):
 
     def get_pose(self):
         """Get the 3D pose of the entity."""
-        pose = self.scene[self.entity_name].data.body_link_state_w[
+        body_link_state_w = wp.to_torch(self.scene[self.entity_name].data.body_link_state_w)
+        pose = body_link_state_w[
             :,
             self.scene[self.entity_name].data.body_names.index(self.body_name),
             :7,
@@ -98,16 +101,25 @@ def __init__(self, scene, entity_name: str):
         self.scene = scene
         self.entity_name = entity_name
 
+    def _get_xform_view(self) -> XformPrimView:
+        """Return the XformPrimView for this asset, refreshing it if prims were not yet cloned."""
+        xform_prim = self.scene[self.entity_name]
+        if xform_prim.count == 0:
+            # The view was created before environment cloning; rebuild it now that prims exist.
+            xform_prim = XformPrimView(xform_prim._prim_path, device=xform_prim.device)
+            self.scene.extras[self.entity_name] = xform_prim
+        return xform_prim
+
     def get_pose(self):
         """Get the 3D pose of the entity."""
-        xform_prim = self.scene[self.entity_name]
+        xform_prim = self._get_xform_view()
         position, orientation = xform_prim.get_world_poses()
         pose = torch.cat([position, orientation], dim=-1)
         return pose
 
     def set_pose(self, pose: torch.Tensor):
         """Set the 3D pose of the entity."""
-        xform_prim = self.scene[self.entity_name]
+        xform_prim = self._get_xform_view()
         position = pose[..., :3]
         orientation = pose[..., 3:]
         xform_prim.set_world_poses(position, orientation, None)
@@ -124,8 +136,9 @@ def get_pose(self):
         """Get the 3D pose of the entity."""
 
         parent_pose = self.parent.get_pose()
+        relative_pose = self.relative_pose.to(parent_pose.device)
 
-        pose = transform_mul(parent_pose, self.relative_pose)
+        pose = transform_mul(parent_pose, relative_pose)
 
         return pose
 
@@ -146,6 +159,10 @@ def get_occupancy_map(self):
         )
 
         transform = self.get_transform_2d().detach().cpu().numpy()
+        # get_world_poses() may return a batched (num_envs, 3, 3) or empty (0, 3, 3) tensor.
+        # For a fixed background asset placed at the world origin, fall back to identity when empty.
+        if transform.ndim == 3:
+            transform = transform[0] if transform.shape[0] > 0 else np.eye(3)
 
         occupancy_map = local_occupancy_map.transformed(transform)
 
diff --git a/source/isaaclab_mimic/isaaclab_mimic/locomanipulation_sdg/transform_utils.py b/source/isaaclab_mimic/isaaclab_mimic/locomanipulation_sdg/transform_utils.py
index 73406a187ff..8d832656c23 100644
--- a/source/isaaclab_mimic/isaaclab_mimic/locomanipulation_sdg/transform_utils.py
+++ b/source/isaaclab_mimic/isaaclab_mimic/locomanipulation_sdg/transform_utils.py
@@ -44,5 +44,8 @@ def transform_inv(transform: torch.Tensor) -> torch.Tensor:
 
 def transform_relative_pose(world_pose: torch.Tensor, src_frame_pose: torch.Tensor, dst_frame_pose: torch.Tensor):
     """Compute the relative pose with respect to a source frame, and apply this relative pose to a destination frame."""
+    device = dst_frame_pose.device
+    world_pose = world_pose.to(device)
+    src_frame_pose = src_frame_pose.to(device)
     pose = transform_mul(dst_frame_pose, transform_mul(transform_inv(src_frame_pose), world_pose))
     return pose