diff --git a/.gitignore b/.gitignore index cbe06491..040955d9 100644 --- a/.gitignore +++ b/.gitignore @@ -179,7 +179,8 @@ materials embodichain/toolkits/outputs/* embodichain/toolkits/outputs/* -embodichain/database/* +embodichain/database/tmp/* +embodichain/database/train/* 3rdparty/ Log/ diff --git a/configs/gym/agent/pour_water_agent/agent_config.json b/configs/gym/agent/pour_water_agent/agent_config.json new file mode 100644 index 00000000..b54f90c1 --- /dev/null +++ b/configs/gym/agent/pour_water_agent/agent_config.json @@ -0,0 +1,31 @@ +{ "TaskAgent": { + "prompt_name": "one_stage_prompt" + }, + "CodeAgent": { + "prompt_name": "one_stage_prompt_according_to_task_plan" + }, + "Agent": { + "prompt_kwargs": { + "task_prompt": { + "type": "text", + "name": "task_prompt.txt" + }, + "basic_background": { + "type": "text", + "name": "basic_background.txt" + }, + "atom_actions": { + "type": "text", + "name": "atom_actions.txt" + }, + "code_prompt": { + "type": "text", + "name": "code_prompt.txt" + }, + "code_example": { + "type": "text", + "name": "code_example.txt" + } + } + } +} \ No newline at end of file diff --git a/configs/gym/agent/pour_water_agent/agent_config_dual.json b/configs/gym/agent/pour_water_agent/agent_config_dual.json new file mode 100644 index 00000000..b243bb65 --- /dev/null +++ b/configs/gym/agent/pour_water_agent/agent_config_dual.json @@ -0,0 +1,31 @@ +{ "TaskAgent": { + "prompt_name": "one_stage_prompt" + }, + "CodeAgent": { + "prompt_name": "one_stage_prompt_according_to_task_plan" + }, + "Agent": { + "prompt_kwargs": { + "task_prompt": { + "type": "text", + "name": "task_prompt_dual.txt" + }, + "basic_background": { + "type": "text", + "name": "basic_background.txt" + }, + "atom_actions": { + "type": "text", + "name": "atom_actions.txt" + }, + "code_prompt": { + "type": "text", + "name": "code_prompt.txt" + }, + "code_example": { + "type": "text", + "name": "code_example.txt" + } + } + } +} \ No newline at end of file diff --git a/configs/gym/agent/pour_water_agent/fast_gym_config.json b/configs/gym/agent/pour_water_agent/fast_gym_config.json new file mode 100644 index 00000000..10a3d9cf --- /dev/null +++ b/configs/gym/agent/pour_water_agent/fast_gym_config.json @@ -0,0 +1,392 @@ +{ + "id": "PourWaterAgent-v3", + "max_episodes": 5, + "env": { + "events": { + "init_table_pose": { + "func": "randomize_rigid_object_pose", + "mode": "reset", + "params": { + "entity_cfg": {"uid": "table"}, + "position_range": [[0.0, 0.0, -0.04], [0.0, 0.0, 0.04]], + "relative_position": true + } + }, + "init_bottle_pose": { + "func": "randomize_rigid_object_pose", + "mode": "reset", + "params": { + "entity_cfg": {"uid": "bottle"}, + "position_range": [[-0.08, -0.12, 0.0], [0.08, 0.04, 0.0]], + "relative_position": true + } + }, + "init_cup_pose": { + "func": "randomize_rigid_object_pose", + "mode": "reset", + "params": { + "entity_cfg": {"uid": "cup"}, + "position_range": [[-0.08, -0.04, 0.0], [0.04, 0.06, 0.0]], + "relative_position": true + } + }, + "prepare_extra_attr": { + "func": "prepare_extra_attr", + "mode": "reset", + "params": { + "attrs": [ + { + "name": "object_lengths", + "mode": "callable", + "entity_uids": "all_objects", + "func_name": "compute_object_length", + "func_kwargs": { + "is_svd_frame": true, + "sample_points": 5000 + } + }, + { + "name": "grasp_pose_object", + "mode": "static", + "entity_cfg": { + "uid": "bottle" + }, + "value": [[ + [0.32243, 0.03245, 0.94604, 0.025], + [0.00706, -0.99947, 0.03188, -0.0 ], + [0.94657, -0.0036 , -0.32249, 0.0 ], + [0.0 , 0.0 , 
0.0 , 1.0 ] + ]] + }, + { + "name": "grasp_pose_object", + "mode": "static", + "entity_cfg": { + "uid": "cup" + }, + "value": [[ + [ 0.32039, -0.03227, 0.94673, 0.0 ], + [ 0.00675, -0.99932, -0.03635, 0.0 ], + [ 0.94726, 0.01803, -0.31996, 0.0 ], + [ 0.0 , 0.0 , 0.0 , 1.0 ] + ]] + }, + { + "name": "left_arm_base_pose", + "mode": "callable", + "entity_cfg": { + "uid": "CobotMagic" + }, + "func_name": "get_link_pose", + "func_kwargs": { + "link_name": "left_arm_base", + "to_matrix": true + } + }, + { + "name": "right_arm_base_pose", + "mode": "callable", + "entity_cfg": { + "uid": "CobotMagic" + }, + "func_name": "get_link_pose", + "func_kwargs": { + "link_name": "right_arm_base", + "to_matrix": true + } + } + ] + } + }, + "register_info_to_env": { + "func": "register_info_to_env", + "mode": "reset", + "params": { + "registry": [ + { + "entity_cfg": { + "uid": "bottle" + }, + "pose_register_params": { + "compute_relative": false, + "compute_pose_object_to_arena": true, + "to_matrix": true + } + }, + { + "entity_cfg": { + "uid": "cup" + }, + "pose_register_params": { + "compute_relative": false, + "compute_pose_object_to_arena": true, + "to_matrix": true + } + }, + { + "entity_cfg": { + "uid": "CobotMagic", + "control_parts": ["left_arm"] + }, + "attrs": ["left_arm_base_pose"], + "pose_register_params": { + "compute_relative": "cup", + "compute_pose_object_to_arena": false, + "to_matrix": true + }, + "prefix": false + }, + { + "entity_cfg": { + "uid": "CobotMagic", + "control_parts": ["right_arm"] + }, + "attrs": ["right_arm_base_pose"], + "pose_register_params": { + "compute_relative": "bottle", + "compute_pose_object_to_arena": false, + "to_matrix": true + }, + "prefix": false + } + ], + "registration": "affordance_datas", + "sim_update": true + } + }, + "random_table_material": { + "func": "randomize_visual_material", + "mode": "reset", + "interval_step": 2, + "params": { + "entity_cfg": {"uid": "table"}, + "random_texture_prob": 1.0, + "texture_path": "DexsimMaterials/WoodTable" + } + }, + "random_bottle_material": { + "func": "randomize_visual_material", + "mode": "reset", + "interval_step": 2, + "params": { + "entity_cfg": {"uid": "bottle"}, + "random_texture_prob": 1.0, + "texture_path": "DexsimMaterials/Bottle" + } + }, + "random_cup_material": { + "func": "randomize_visual_material", + "mode": "reset", + "interval_step": 2, + "params": { + "entity_cfg": {"uid": "cup"}, + "random_texture_prob": 1.0, + "texture_path": "DexsimMaterials/Cup" + } + }, + "random_robot_init_eef_pose": { + "func": "randomize_robot_eef_pose", + "mode": "reset", + "params": { + "entity_cfg": {"uid": "CobotMagic", "control_parts": ["left_arm", "right_arm"]}, + "position_range": [[-0.01, -0.01, -0.01], [0.01, 0.01, 0]] + } + }, + "record_camera": { + "func": "record_camera_data", + "mode": "interval", + "interval_step": 1, + "params": { + "name": "cam1", + "resolution": [320, 240], + "eye": [2, 0, 2], + "target": [0.5, 0, 1] + } + } + }, + "dataset": { + "lerobot": { + "func": "embodichain.lab.gym.envs.managers.datasets:LeRobotRecorder", + "mode": "save", + "params": { + "robot_meta": { + "arm_dofs": 12, + "control_freq": 25, + "control_parts": ["left_arm", "left_eef", "right_arm", "right_eef"], + "observation": { + "vision": {}, + "states": ["qpos"] + }, + "min_len_steps": 5 + }, + "instruction": { + "lang": "Pour water from the bottle into the mug." 
+ } + } + } + }, + "success_params": { + "strict": false + } + }, + "robot": { + "uid": "CobotMagic", + "robot_type": "CobotMagic", + "init_pos": [0.0, 0.0, 0.7775], + "init_qpos": [-0.3,0.3,1.0,1.0,-1.2,-1.2,0.0,0.0,0.6,0.6,0.0,0.0,0.05,0.05,0.05,0.05] + }, + "sensor": [ + { + "sensor_type": "StereoCamera", + "uid": "cam_high", + "width": 960, + "height": 540, + "enable_mask": false, + "enable_depth": false, + "left_to_right_pos": [0.059684025824163614, 0, 0], + "intrinsics": [453.851402686215, 453.8347628855552, 469.827725021235, 258.6656181845155], + "intrinsics_right": [453.4536601653505, 453.3306024582175, 499.13697412367776, 297.7176248477935], + "extrinsics": { + "eye": [0.35368482807598, 0.014695524383058989, 1.4517046071614774], + "target": [0.7186357573287919, -0.054534732904795505, 0.5232553674540066], + "up": [0.9306678549330372, -0.0005600064212467153, 0.3658647703553347] + } + }, + { + "sensor_type": "Camera", + "uid": "cam_right_wrist", + "width": 640, + "height": 480, + "enable_mask": false, + "intrinsics": [488.1665344238281, 488.1665344238281, 322.7323303222656, 213.17434692382812], + "extrinsics": { + "parent": "right_link6", + "pos": [-0.08, 0.0, 0.04], + "quat": [0.15304635, 0.69034543, -0.69034543, -0.15304635] + } + }, + { + "sensor_type": "Camera", + "uid": "cam_left_wrist", + "width": 640, + "height": 480, + "enable_mask": false, + "intrinsics": [488.1665344238281, 488.1665344238281, 322.7323303222656, 213.17434692382812], + "extrinsics": { + "parent": "left_link6", + "pos": [-0.08, 0.0, 0.04], + "quat": [0.15304635, 0.69034543, -0.69034543, -0.15304635] + } + }, + { + "sensor_type": "Camera", + "uid": "valid_cam_1", + "width": 1280, + "height": 960, + "enable_mask": false, + "intrinsics": [1400, 1400, 640, 480], + "extrinsics": { + "eye": [1.7, 0, 2.3], + "target": [0.6, 0, 0.8] + } + }, + { + "sensor_type": "Camera", + "uid": "valid_cam_2", + "width": 1280, + "height": 960, + "enable_mask": false, + "intrinsics": [1400, 1400, 640, 480], + "extrinsics": { + "eye": [2.0, 0, 1.8], + "target": [0.7, 0, 0.9] + } + }, + { + "sensor_type": "Camera", + "uid": "valid_cam_3", + "width": 1280, + "height": 960, + "enable_mask": false, + "intrinsics": [1400, 1400, 640, 480], + "extrinsics": { + "eye": [2.0, 0, 1.3], + "target": [0.7, 0, 0.9] + } + } + ], + "light": { + "direct": [ + { + "uid": "light_1", + "light_type": "point", + "color": [1.0, 1.0, 1.0], + "intensity": 100.0, + "init_pos": [2, 0, 2], + "radius": 20.0 + } + ] + }, + "background": [ + { + "uid": "table", + "shape": { + "shape_type": "Mesh", + "fpath": "CircleTableSimple/circle_table_simple.ply", + "compute_uv": true + }, + "attrs" : { + "mass": 10.0, + "static_friction": 0.95, + "dynamic_friction": 0.9, + "restitution": 0.01 + }, + "body_scale": [1, 1, 1], + "body_type": "kinematic", + "init_pos": [0.725, 0.0, 0.825], + "init_rot": [0, 90, 0] + } + ], + "rigid_object": [ + { + "uid":"cup", + "shape": { + "shape_type": "Mesh", + "fpath": "PaperCup/paper_cup.ply", + "compute_uv": true + }, + "attrs" : { + "mass": 0.01, + "contact_offset": 0.003, + "rest_offset": 0.001, + "restitution": 0.01, + "max_depenetration_velocity": 1e1, + "min_position_iters": 32, + "min_velocity_iters":8 + }, + "init_pos": [0.75, 0.1, 0.9], + "body_scale":[0.75, 0.75, 1.0], + "max_convex_hull_num": 8 + }, + { + "uid":"bottle", + "shape": { + "shape_type": "Mesh", + "fpath": "ScannedBottle/kashijia_processed.ply", + "compute_uv": true + }, + "attrs" : { + "mass": 0.01, + "contact_offset": 0.003, + "rest_offset": 0.001, + "restitution": 
0.01, + "max_depenetration_velocity": 1e1, + "min_position_iters": 32, + "min_velocity_iters":8 + }, + "init_pos": [0.75, -0.1, 0.932], + "body_scale":[1, 1, 1], + "max_convex_hull_num": 8 + } + ] +} \ No newline at end of file diff --git a/configs/gym/agent/pour_water_agent/task_prompt.txt b/configs/gym/agent/pour_water_agent/task_prompt.txt new file mode 100644 index 00000000..be7eb3b1 --- /dev/null +++ b/configs/gym/agent/pour_water_agent/task_prompt.txt @@ -0,0 +1,5 @@ +Task: +Use a single robotic arm to pour water from the bottle into the cup. +Position the bottle at an offset of [0.05, −0.10, 0.125] relative to the cup’s position during pouring. +After completing the pour, return the bottle to the location [0.7, −0.1]. +The cup should remain stationary throughout the task. \ No newline at end of file diff --git a/configs/gym/agent/pour_water_agent/task_prompt_dual.txt b/configs/gym/agent/pour_water_agent/task_prompt_dual.txt new file mode 100644 index 00000000..6b2205e2 --- /dev/null +++ b/configs/gym/agent/pour_water_agent/task_prompt_dual.txt @@ -0,0 +1,5 @@ +Task: +Use both arms to pour water from the bottle into the cup. +First, grasp the bottle with right arm and the cup with left arm simultaneously. Then lift the cup by 0.10 m, and then move it to [0.55, 0.05] to prepare for pouring. +Then hold the bottle at a relative offset of [0.05, −0.10, 0.125] with respect to the cup for starting pouring. +After pouring, place the bottle to [0.7, −0.1] and place the cup at [0.6, 0.1] simultaneously. \ No newline at end of file diff --git a/configs/gym/agent/rearrangement_agent_v3/agent_config.json b/configs/gym/agent/rearrangement_agent_v3/agent_config.json new file mode 100644 index 00000000..6dbe2f5e --- /dev/null +++ b/configs/gym/agent/rearrangement_agent_v3/agent_config.json @@ -0,0 +1,31 @@ +{ "TaskAgent": { + "prompt_name": "one_stage_prompt" + }, + "CodeAgent": { + "prompt_name": "one_stage_prompt_according_to_task_plan" + }, + "Agent": { + "prompt_kwargs": { + "task_prompt": { + "type": "text", + "name": "task_prompt.txt" + }, + "basic_background": { + "type": "text", + "name": "basic_background.txt" + }, + "atom_actions": { + "type": "text", + "name": "atom_actions.txt" + }, + "code_prompt": { + "type": "text", + "name": "code_prompt.txt" + }, + "code_example": { + "type": "text", + "name": "code_example.txt" + } + } + } +} diff --git a/configs/gym/agent/rearrangement_agent_v3/fast_gym_config.json b/configs/gym/agent/rearrangement_agent_v3/fast_gym_config.json new file mode 100644 index 00000000..5cc6daad --- /dev/null +++ b/configs/gym/agent/rearrangement_agent_v3/fast_gym_config.json @@ -0,0 +1,384 @@ +{ + "id": "RearrangementAgent-v3", + "max_episodes": 10, + "env": { + "events": { + "init_table_pose": { + "func": "randomize_rigid_object_pose", + "mode": "reset", + "params": { + "entity_cfg": {"uid": "table"}, + "position_range": [[0.0, 0.0, -0.04], [0.0, 0.0, 0.04]], + "relative_position": true + } + }, + "init_fork_pose": { + "func": "randomize_rigid_object_pose", + "mode": "reset", + "params": { + "entity_cfg": {"uid": "fork"}, + "position_range": [[0.0, -0.05, 0.0], [0.1, 0.05, 0.0]], + "rotation_range": [[0, 0, -45], [0, 0, 45]], + "relative_position": true, + "relative_rotation": true + } + }, + "init_spoon_pose": { + "func": "randomize_rigid_object_pose", + "mode": "reset", + "params": { + "entity_cfg": {"uid": "spoon"}, + "position_range": [[0.0, -0.05, 0.0], [0.1, 0.05, 0.0]], + "rotation_range": [[0, 0, -45], [0, 0, 45]], + "relative_position": true, + 
"relative_rotation": true + } + }, + "prepare_extra_attr": { + "func": "prepare_extra_attr", + "mode": "reset", + "params": { + "attrs": [ + { + "name": "object_lengths", + "mode": "callable", + "entity_uids": "all_objects", + "func_name": "compute_object_length", + "func_kwargs": { + "is_svd_frame": true, + "sample_points": 5000 + } + }, + { + "name": "grasp_pose_object", + "mode": "static", + "entity_uids": ["fork", "spoon"], + "value": [[ + [1.0, 0.0, 0.0, 0.0], + [0.0, -1.0, 0.0, 0.0], + [0.0, 0.0, -1.0, 0.0], + [0.0, 0.0, 0.0, 1.0] + ]] + } + ] + } + }, + "register_info_to_env": { + "func": "register_info_to_env", + "mode": "reset", + "params": { + "registry": [ + { + "entity_cfg": { + "uid": "plate" + }, + "pose_register_params": { + "compute_relative": false, + "compute_pose_object_to_arena": true, + "to_matrix": true + } + }, + { + "entity_cfg": { + "uid": "fork" + }, + "pose_register_params": { + "compute_relative": false, + "compute_pose_object_to_arena": true, + "to_matrix": true + } + }, + { + "entity_cfg": { + "uid": "spoon" + }, + "pose_register_params": { + "compute_relative": false, + "compute_pose_object_to_arena": true, + "to_matrix": true + } + }, + { + "entity_cfg": { + "uid": "CobotMagic", + "control_parts": ["left_arm"] + }, + "attrs": ["left_arm_base_pose"], + "pose_register_params": { + "compute_relative": "spoon", + "compute_pose_object_to_arena": false, + "to_matrix": true + }, + "prefix": false + }, + { + "entity_cfg": { + "uid": "CobotMagic", + "control_parts": ["right_arm"] + }, + "attrs": ["right_arm_base_pose"], + "pose_register_params": { + "compute_relative": "fork", + "compute_pose_object_to_arena": false, + "to_matrix": true + }, + "prefix": false + } + ], + "registration": "affordance_datas", + "sim_update": true + } + }, + "random_table_material": { + "func": "randomize_visual_material", + "mode": "reset", + "interval_step": 2, + "params": { + "entity_cfg": {"uid": "table"}, + "random_texture_prob": 1.0, + "texture_path": "DexsimMaterials/WoodTable" + } + }, + "random_plate_material": { + "func": "randomize_visual_material", + "mode": "reset", + "interval_step": 2, + "params": { + "entity_cfg": {"uid": "plate"}, + "random_texture_prob": 1.0, + "texture_path": "DexsimMaterials/Plate" + } + }, + "random_fork_material": { + "func": "randomize_visual_material", + "mode": "reset", + "interval_step": 2, + "params": { + "entity_cfg": {"uid": "fork"}, + "random_texture_prob": 1.0, + "texture_path": "DexsimMaterials/Spoon" + } + }, + "random_spoon_material": { + "func": "randomize_visual_material", + "mode": "reset", + "interval_step": 2, + "params": { + "entity_cfg": {"uid": "spoon"}, + "random_texture_prob": 1.0, + "texture_path": "DexsimMaterials/Spoon" + } + }, + "random_robot_init_eef_pose": { + "func": "randomize_robot_eef_pose", + "mode": "reset", + "params": { + "entity_cfg": {"uid": "CobotMagic", "control_parts": ["left_arm", "right_arm"]}, + "position_range": [[-0.01, -0.01, -0.01], [0.01, 0.01, 0]] + } + }, + "record_camera": { + "func": "record_camera_data", + "mode": "interval", + "interval_step": 1, + "params": { + "name": "cam1", + "resolution": [320, 240], + "eye": [2, 0, 2], + "target": [0.5, 0, 1] + } + } + }, + "dataset": { + "lerobot": { + "func": "embodichain.lab.gym.envs.managers.datasets:LeRobotRecorder", + "mode": "save", + "params": { + "robot_meta": { + "arm_dofs": 12, + "control_freq": 25, + "control_parts": ["left_arm", "left_eef", "right_arm", "right_eef"], + "observation": { + "vision": {}, + "states": ["qpos"] + }, + 
"min_len_steps": 125 + }, + "instruction": { + "lang": "Place the spoon and fork neatly into the plate on the table." + } + } + } + }, + "success_params": { + "strict": false + } + }, + "robot": { + "uid": "CobotMagic", + "robot_type": "CobotMagic", + "init_pos": [0.0, 0.0, 0.7775], + "init_qpos": [-0.3,0.3,1.0,1.0,-1.2,-1.2,0.0,0.0,0.6,0.6,0.0,0.0,0.05,0.05,0.05,0.05] + }, + "sensor": [ + { + "sensor_type": "StereoCamera", + "uid": "cam_high", + "width": 960, + "height": 540, + "enable_mask": false, + "enable_depth": false, + "left_to_right_pos": [0.059684025824163614, 0, 0], + "intrinsics": [453.851402686215, 453.8347628855552, 469.827725021235, 258.6656181845155], + "intrinsics_right": [453.4536601653505, 453.3306024582175, 499.13697412367776, 297.7176248477935], + "extrinsics": { + "eye": [0.35368482807598, 0.014695524383058989, 1.4517046071614774], + "target": [0.7186357573287919, -0.054534732904795505, 0.5232553674540066], + "up": [0.9306678549330372, -0.0005600064212467153, 0.3658647703553347] + } + }, + { + "sensor_type": "Camera", + "uid": "cam_right_wrist", + "width": 640, + "height": 480, + "enable_mask": false, + "intrinsics": [488.1665344238281, 488.1665344238281, 322.7323303222656, 213.17434692382812], + "extrinsics": { + "parent": "right_link6", + "pos": [-0.08, 0.0, 0.04], + "quat": [0.15304635, 0.69034543, -0.69034543, -0.15304635] + } + }, + { + "sensor_type": "Camera", + "uid": "cam_left_wrist", + "width": 640, + "height": 480, + "enable_mask": false, + "intrinsics": [488.1665344238281, 488.1665344238281, 322.7323303222656, 213.17434692382812], + "extrinsics": { + "parent": "left_link6", + "pos": [-0.08, 0.0, 0.04], + "quat": [0.15304635, 0.69034543, -0.69034543, -0.15304635] + } + }, + { + "sensor_type": "Camera", + "uid": "valid_cam_1", + "width": 1280, + "height": 960, + "enable_mask": false, + "intrinsics": [1400, 1400, 640, 480], + "extrinsics": { + "eye": [1.7, 0, 2.3], + "target": [0.6, 0, 0.8] + } + }, + { + "sensor_type": "Camera", + "uid": "valid_cam_2", + "width": 1280, + "height": 960, + "enable_mask": false, + "intrinsics": [1400, 1400, 640, 480], + "extrinsics": { + "eye": [2.0, 0, 1.8], + "target": [0.7, 0, 0.9] + } + }, + { + "sensor_type": "Camera", + "uid": "valid_cam_3", + "width": 1280, + "height": 960, + "enable_mask": false, + "intrinsics": [1400, 1400, 640, 480], + "extrinsics": { + "eye": [2.0, 0, 1.3], + "target": [0.7, 0, 0.9] + } + } + ], + "light": { + "direct": [ + { + "uid": "light_1", + "light_type": "point", + "color": [1.0, 1.0, 1.0], + "intensity": 100.0, + "init_pos": [2, 0, 2], + "radius": 20.0 + } + ] + }, + "background": [ + { + "uid": "table", + "shape": { + "shape_type": "Mesh", + "fpath": "CircleTableSimple/circle_table_simple.ply", + "compute_uv": true + }, + "attrs" : { + "mass": 10.0, + "static_friction": 2.0, + "dynamic_friction": 1.5, + "restitution": 0.1 + }, + "body_scale": [1, 1, 1], + "body_type": "kinematic", + "init_pos": [0.725, 0.0, 0.691], + "init_rot": [0, 90, 0] + } + ], + "rigid_object": [ + { + "uid": "plate", + "shape": { + "shape_type": "Mesh", + "fpath": "TableWare/tableware/plate/3_center.ply", + "compute_uv": true + }, + "body_scale": [0.001, 0.001, 0.001], + "init_pos": [0.5, 0.0, 1.0], + "init_rot": [180, 0, 0] + }, + { + "uid": "fork", + "shape": { + "shape_type": "Mesh", + "fpath": "TableWare/tableware/fork/standard_fork_scale.ply", + "compute_uv": true + }, + "body_scale": [1.0, 1.0, 1.0], + "init_pos": [0.5, 0.21, 1.0], + "attrs" : { + "mass": 0.01, + "static_friction": 0.95, + "dynamic_friction": 0.9, 
+ "restitution": 0.05, + "min_position_iters": 16, + "min_velocity_iters": 4 + } + }, + { + "uid": "spoon", + "shape": { + "shape_type": "Mesh", + "fpath": "TableWare/tableware/spoon/standard_spoon_a_rescale.ply", + "compute_uv": true + }, + "body_scale": [1.0, 1.0, 1.0], + "init_pos": [0.5, -0.21, 1.0], + "attrs" : { + "mass": 0.01, + "static_friction": 0.95, + "dynamic_friction": 0.9, + "restitution": 0.05, + "min_position_iters": 16, + "min_velocity_iters": 4 + } + } + ] +} diff --git a/configs/gym/agent/rearrangement_agent_v3/task_prompt.txt b/configs/gym/agent/rearrangement_agent_v3/task_prompt.txt new file mode 100644 index 00000000..280bd227 --- /dev/null +++ b/configs/gym/agent/rearrangement_agent_v3/task_prompt.txt @@ -0,0 +1,8 @@ +Task: +Use both arms to rearrange a fork and a spoon on opposite sides of a plate. Perform the following steps **simultaneously**. + +1. Grasp the fork with the left arm and the spoon with the right arm. + +2. Reorient both end-effectors to a downward-facing pose. + +3. Place the fork at y = +0.16 and the spoon at y = −0.16 relative to the plate’s center. \ No newline at end of file diff --git a/docs/source/conf.py b/docs/source/conf.py index 3aac3af3..6e392b52 100644 --- a/docs/source/conf.py +++ b/docs/source/conf.py @@ -9,6 +9,10 @@ import os import sys +os.environ.setdefault( + "AZURE_OPENAI_ENDPOINT", "https://mock-endpoint.openai.azure.com/" +) +os.environ.setdefault("AZURE_OPENAI_API_KEY", "mock-api-key-for-docs-build") sys.path.insert(0, os.path.abspath("../..")) diff --git a/docs/source/features/agents.md b/docs/source/features/agents.md new file mode 100644 index 00000000..7cb2356d --- /dev/null +++ b/docs/source/features/agents.md @@ -0,0 +1,166 @@ +# EmbodiAgent + +EmbodiAgent is a hierarchical multi-agent system that enables robots to perform complex manipulation tasks through closed-loop planning, code generation, and validation. The system combines vision-language models (VLMs) and large language models (LLMs) to translate high-level goals into executable robot actions. + +## Quick Start + +### Prerequisites +Ensure you have access to Azure OpenAI or a compatible LLM endpoint. + +```bash +# Set environment variables +export AZURE_OPENAI_ENDPOINT="https://your-endpoint.openai.azure.com/" +export AZURE_OPENAI_API_KEY="your-api-key" +``` + +### Using Different LLM/VLM APIs + +The system uses LangChain's `AzureChatOpenAI` by default. To use different LLM/VLM providers, you can modify the `create_llm` function in `embodichain/agents/hierarchy/llm.py`. 
+ +#### Azure OpenAI +```bash +export AZURE_OPENAI_ENDPOINT="https://your-endpoint.openai.azure.com/" +export AZURE_OPENAI_API_KEY="your-api-key" +export OPENAI_API_VERSION="2024-10-21" # Optional, defaults to "2024-10-21" +``` + +#### OpenAI +To use OpenAI directly instead of Azure, modify `llm.py`: +```python +from langchain_openai import ChatOpenAI + +def create_llm(*, temperature=0.0, model="gpt-4o"): + return ChatOpenAI( + temperature=temperature, + model=model, + api_key=os.getenv("OPENAI_API_KEY"), + ) +``` + +Then set: +```bash +export OPENAI_API_KEY="your-api-key" +``` + +#### Other Providers +You can use other LangChain-compatible providers by modifying the `create_llm` function, for example: + +**Anthropic Claude:** +```python +from langchain_anthropic import ChatAnthropic + +def create_llm(*, temperature=0.0, model="claude-3-opus-20240229"): + return ChatAnthropic( + temperature=temperature, + model=model, + anthropic_api_key=os.getenv("ANTHROPIC_API_KEY"), + ) +``` + +**Google Gemini:** +```python +from langchain_google_genai import ChatGoogleGenerativeAI + +def create_llm(*, temperature=0.0, model="gemini-pro"): + return ChatGoogleGenerativeAI( + temperature=temperature, + model=model, + google_api_key=os.getenv("GOOGLE_API_KEY"), + ) +``` + +### Run the System + +Run the agent system with the following command: + +```bash +python embodichain/lab/scripts/run_agent.py \ + --task_name YourTask \ + --gym_config configs/gym/your_task/gym_config.json \ + --agent_config configs/gym/agent/your_agent/agent_config.json \ + --regenerate False +``` + +**Parameters:** +- `--task_name`: Name identifier for the task +- `--gym_config`: Path to the gym environment configuration file +- `--agent_config`: Path to the agent configuration file (defines prompts and agent behavior) +- `--regenerate`: If `True`, forces regeneration of plans/code even if cached + +## System Architecture + +The system operates on a closed-loop control cycle: + +- **Observe**: The `TaskAgent` perceives the environment via multi-view camera inputs. +- **Plan**: It decomposes the goal into natural language steps. +- **Code**: The `CodeAgent` translates steps into executable Python code using atomic actions. +- **Execute**: The code runs in the environment; runtime errors are caught immediately. +- **Validate**: The `ValidationAgent` analyzes the result images, selects the best camera angle, and judges success. +- **Refine**: If validation fails, feedback is sent back to the agents to regenerate the plan or code. + +--- + +## Core Components + +### TaskAgent +*Located in:* `embodichain/agents/hierarchy/task_agent.py` + +Responsible for high-level reasoning. It parses visual observations and outputs a structured plan. + +* For every step, it generates a specific condition (e.g., "The cup must be held by the gripper") which is used later by the ValidationAgent. +* Prompt Strategies: + * `one_stage_prompt`: Direct VLM-to-Plan generation. + * `two_stage_prompt`: Separates visual analysis from planning logic. + +### CodeAgent +*Located in:* `embodichain/agents/hierarchy/code_agent.py` + +Translates natural language plans into executable Python code using atomic actions from the action bank. 
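+The generated script may take either of the two forms that `act()` executes: a module-level sequence of `drive(...)` calls (the runtime `**kwargs` are injected into each call automatically), or a `create_agent_action_list(**kwargs)` function that returns the action list. A hypothetical example of the module-level form is sketched below; the real argument names come from the `atom_actions` documentation in the agent config, so the parameters shown here are purely illustrative:
+
+```python
+# Illustrative sketch only: drive() is provided by embodichain.toolkits.interfaces
+# at execution time, and its real parameters are defined by the atom_actions prompt.
+drive(action="pick", uid="bottle")
+drive(action="move", uid="bottle", offset=[0.05, -0.10, 0.125])
+drive(action="place", uid="bottle", target=[0.7, -0.1])
+```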
+ +* Generates Python code that follows strict coding guidelines (no loops, only provided APIs) +* Executes code in a sandboxed environment with immediate error detection +* Uses Abstract Syntax Tree (AST) parsing to ensure code safety and correctness +* Supports few-shot learning through code examples in the configuration + + +### ValidationAgent +*Located in:* `embodichain/agents/hierarchy/validation_agent.py` + +Closes the loop by verifying if the robot actually achieved what it planned. + +* Uses a specialized LLM call (`select_best_view_dir`) to analyze images from all cameras and pick the single best angle that proves the action's outcome, ignoring irrelevant views. +* If an error occurs (runtime or logic), it generates a detailed explanation which is fed back to the `TaskAgent` or `CodeAgent` for the next attempt. + +--- + +## Configuration Guide + +The `Agent` configuration block controls the context provided to the LLMs. Prompt files are resolved in the following order: + +1. **Config directory**: Task-specific prompt files in the same directory as the agent configuration file (e.g., `configs/gym/agent/pour_water_agent/`) +2. **Default prompts directory**: Reusable prompt templates in `embodichain/agents/prompts/` + +| Parameter | Description | Typical Use | +| :--- | :--- | :--- | +| `task_prompt` | Task-specific goal description | "Pour water from the red cup to the blue cup." | +| `basic_background` | Physical rules & constraints | World coordinate system definitions, safety rules. | +| `atom_actions` | API Documentation | List of available functions (e.g., `drive(action='pick', ...)`). | +| `code_prompt` | Coding guidelines | "Use provided APIs only. Do not use loops." | +| `code_example` | Few-shot examples | Previous successful code snippets to guide style. | + +--- + +## File Structure + +```text +embodichain/agents/ +├── hierarchy/ +│ ├── agent_base.py # Abstract base handling prompts & images +│ ├── task_agent.py # Plan generation logic +│ ├── code_agent.py # Code generation & AST execution engine +│ ├── validation_agent.py # Visual analysis & view selection +│ └── llm.py # LLM configuration and instances +├── mllm/ +│ └── prompt/ # Prompt templates (LangChain) +└── prompts/ # Agent prompt templates +``` diff --git a/docs/source/index.rst b/docs/source/index.rst index c0ac442e..f87cbd7d 100644 --- a/docs/source/index.rst +++ b/docs/source/index.rst @@ -36,6 +36,7 @@ Table of Contents :caption: Features :glob: + features/agents.md features/workspace_analyzer/index* .. toctree:: diff --git a/embodichain/agents/hierarchy/__init__.py b/embodichain/agents/hierarchy/__init__.py new file mode 100644 index 00000000..30c31bc9 --- /dev/null +++ b/embodichain/agents/hierarchy/__init__.py @@ -0,0 +1,19 @@ +# ---------------------------------------------------------------------------- +# Copyright (c) 2021-2025 DexForce Technology Co., Ltd. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+# ---------------------------------------------------------------------------- + +from langchain_openai import AzureChatOpenAI +from langchain_openai import ChatOpenAI +import os diff --git a/embodichain/agents/hierarchy/agent_base.py b/embodichain/agents/hierarchy/agent_base.py new file mode 100644 index 00000000..12ba2c1d --- /dev/null +++ b/embodichain/agents/hierarchy/agent_base.py @@ -0,0 +1,94 @@ +# ---------------------------------------------------------------------------- +# Copyright (c) 2021-2025 DexForce Technology Co., Ltd. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# ---------------------------------------------------------------------------- + +from abc import ABCMeta +import os +from pathlib import Path +from embodichain.utils.utility import load_txt +import embodichain.agents.mllm.prompt as mllm_prompt +from embodichain.data import database_2d_dir + + +def _resolve_prompt_path(file_name: str, config_dir: str = None) -> str: + # If absolute path, use directly + if os.path.isabs(file_name): + if os.path.exists(file_name): + return file_name + raise FileNotFoundError(f"Prompt file not found: {file_name}") + + # Try config directory first (for task-specific prompts) + if config_dir: + config_path = os.path.join(config_dir, file_name) + if os.path.exists(config_path): + return config_path + + # Try agents/prompts directory (for reusable prompts) + agents_prompts_dir = os.path.join( + os.path.dirname(os.path.dirname(os.path.abspath(__file__))), "prompts" + ) + agents_path = os.path.join(agents_prompts_dir, file_name) + if os.path.exists(agents_path): + return agents_path + + # If still not found, raise error with search paths + searched_paths = [] + if config_dir: + searched_paths.append(f" - {config_dir}/{file_name}") + searched_paths.append(f" - {agents_prompts_dir}/{file_name}") + + raise FileNotFoundError( + f"Prompt file not found: {file_name}\n" + f"Searched in:\n" + "\n".join(searched_paths) + ) + + +class AgentBase(metaclass=ABCMeta): + def __init__(self, **kwargs) -> None: + + assert ( + "prompt_kwargs" in kwargs.keys() + ), "Key prompt_kwargs must exist in config." + + for key, value in kwargs.items(): + setattr(self, key, value) + + # Get config directory if provided + config_dir = kwargs.get("config_dir", None) + if config_dir: + config_dir = os.path.dirname(os.path.abspath(config_dir)) + + # Preload and store prompt contents inside self.prompt_kwargs + for key, val in self.prompt_kwargs.items(): + if val["type"] == "text": + file_path = _resolve_prompt_path(val["name"], config_dir) + val["content"] = load_txt(file_path) # ← store content here + else: + raise ValueError( + f"Now only support `text` type but {val['type']} is given." 
+ ) + + def generate(self, *args, **kwargs): + pass + + def act(self, *args, **kwargs): + pass + + def get_composed_observations(self, **kwargs): + ret = {"observations": kwargs.get("env").get_obs_for_agent()} + for key, val in self.prompt_kwargs.items(): + ret[key] = val["content"] + ret.update(kwargs) + return ret diff --git a/embodichain/agents/hierarchy/code_agent.py b/embodichain/agents/hierarchy/code_agent.py new file mode 100644 index 00000000..3aa999a0 --- /dev/null +++ b/embodichain/agents/hierarchy/code_agent.py @@ -0,0 +1,288 @@ +# ---------------------------------------------------------------------------- +# Copyright (c) 2021-2025 DexForce Technology Co., Ltd. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# ---------------------------------------------------------------------------- + +from embodichain.agents.hierarchy.agent_base import AgentBase +from langchain_core.prompts import ChatPromptTemplate +import os +import numpy as np +from typing import Dict, Tuple +from embodichain.agents.mllm.prompt import CodePrompt +from embodichain.data import database_agent_prompt_dir +from pathlib import Path +import re +import importlib.util +from datetime import datetime + + +def format_execution_history(execution_history): + if not execution_history or len(execution_history) == 0: + return "None." + + return "\n\n".join(f"{i}. {entry}" for i, entry in enumerate(execution_history, 1)) + + +def extract_python_code_and_text(llm_response: str) -> Tuple[str, str]: + """ + Extract exactly ONE python code block from the LLM response, + and return: + - code: the content inside the python block + - text: all remaining explanation text (outside the code block) + + Raises ValueError if zero or multiple python blocks are found. + """ + + pattern = r"```python\s*(.*?)\s*```" + matches = list(re.finditer(pattern, llm_response, re.DOTALL)) + + if len(matches) == 0: + raise ValueError("No python code block found in LLM response.") + if len(matches) > 1: + raise ValueError("Multiple python code blocks found in LLM response.") + + match = matches[0] + code = match.group(1).strip() + + # Optional sanity check + if not code.startswith("#") and not code.startswith("drive("): + raise ValueError( + f"Invalid code block content. 
Expected `drive(...)` or `# TASK_COMPLETE`, got:\n{code}" + ) + + # Extract remaining text (before + after the code block) + text_before = llm_response[: match.start()].strip() + text_after = llm_response[match.end() :].strip() + + explanation_text = "\n\n".join(part for part in [text_before, text_after] if part) + + return code, explanation_text + + +def format_llm_response_md( + llm_analysis: str, # plain-text explanation (NO code) + extracted_code: str, # validated executable code + step_id: int = None, + execution_history: str = None, + obs_image_path: Path = None, + md_file_path: Path = None, +) -> str: + ts = datetime.now().strftime("%Y-%m-%d %H:%M:%S") + header = f"## Step: {step_id if step_id is not None else '-'} | {ts}\n\n" + + history_block = "" + if execution_history: + history_block = ( + "### Execution History (Input to LLM)\n\n" + "```\n" + f"{execution_history}\n" + "```\n\n" + ) + + image_block = "" + if obs_image_path is not None and md_file_path is not None: + try: + rel_path = obs_image_path.relative_to(md_file_path.parent) + except ValueError: + # Fallback: just use filename + rel_path = obs_image_path.name + + image_block = ( + "### Observation Image\n\n" f"![]({Path(rel_path).as_posix()})\n\n" + ) + + body = ( + image_block + history_block + "### LLM Analysis\n\n" + f"{llm_analysis.strip()}\n\n" + "### Executed Code\n\n" + "```python\n" + f"{extracted_code.strip()}\n" + "```\n\n" + "---\n\n" + ) + + return header + body + + +class CodeAgent(AgentBase): + query_prefix = "# " + query_suffix = "." + prompt: ChatPromptTemplate + prompt_kwargs: Dict[str, Dict] + + def __init__(self, llm, **kwargs) -> None: + super().__init__(**kwargs) + if llm is None: + raise ValueError( + "LLM is None. Please set the following environment variables:\n" + " - AZURE_OPENAI_ENDPOINT\n" + " - AZURE_OPENAI_API_KEY\n" + "Example:\n" + " export AZURE_OPENAI_ENDPOINT='https://your-endpoint.openai.azure.com/'\n" + " export AZURE_OPENAI_API_KEY='your-api-key'" + ) + self.llm = llm + + def generate(self, **kwargs): + log_dir = kwargs.get( + "log_dir", Path(database_agent_prompt_dir) / self.task_name + ) + file_path = log_dir / "agent_generated_code.py" + + # Check if the file already exists + if not kwargs.get("regenerate", False): + if file_path.exists(): + print(f"Code file already exists at {file_path}, skipping writing.") + return file_path, kwargs, None + + # Generate code via LLM + prompt = getattr(CodePrompt, self.prompt_name)( + **kwargs, + ) + + # insert feedback if exists + if len(kwargs.get("error_messages", [])) != 0: + # just use the last one + last_code = kwargs["generated_codes"][-1] + last_error = kwargs["error_messages"][-1] + last_observation = ( + kwargs.get("observation_feedbacks")[-1] + if kwargs.get("observation_feedbacks") + else None + ) + + # Add extra human message with feedback + feedback_msg = self.build_feedback_message( + last_code, last_error, last_observation + ) + prompt.messages.append(feedback_msg) + + llm_code = self.llm.invoke(prompt) + + # Normalize content + llm_code = getattr(llm_code, "content", str(llm_code)) + + print(f"\033[92m\nCode agent output:\n{llm_code}\n\033[0m") + + # Write the code to the file if it does not exist + match = re.search(r"```python\n(.*?)\n```", llm_code, re.DOTALL) + if match: + code_to_save = match.group(1).strip() + else: + code_to_save = llm_code.strip() + + file_path.parent.mkdir(parents=True, exist_ok=True) + with open(file_path, "w") as f: + f.write(code_to_save) + print(f"Generated function code saved to {file_path}") + + 
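+ # Return the saved code file path, the call kwargs, and the extracted code string.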
return file_path, kwargs, code_to_save + + def act(self, code_file_path, **kwargs): + """Execute generated code with proper execution environment. + + Supports two modes: + 1. If code defines 'create_agent_action_list' function, call it + 2. If code contains module-level drive() calls, execute them directly + """ + import ast + + # Read the generated code file + with open(code_file_path, "r") as f: + code_content = f.read() + + # Build execution namespace with necessary imports + ns = { + "__builtins__": __builtins__, + "__name__": "__main__", + "__file__": str(code_file_path), + "kwargs": kwargs, # Make kwargs available for injection + } + + # Import toolkit functions into namespace + try: + exec( + "from embodichain.toolkits.interfaces import *", + ns, + ns, + ) + except Exception as e: + raise RuntimeError( + "Failed to import embodichain.toolkits.interfaces" + ) from e + + # Parse code to check if it defines a function or contains module-level calls + tree = ast.parse(code_content) + + # Check if code defines create_agent_action_list function + has_function = any( + isinstance(node, ast.FunctionDef) + and node.name == "create_agent_action_list" + for node in tree.body + ) + + if has_function: + # Execute code (function will be defined in namespace) + exec(code_content, ns, ns) + + # Call the function if it exists + if "create_agent_action_list" in ns: + result = ns["create_agent_action_list"](**kwargs) + print("Function executed successfully.") + return result + else: + raise AttributeError( + "The function 'create_agent_action_list' was not found after execution." + ) + else: + # Code contains module-level drive() calls + # AST transformer to inject **kwargs into function calls + class InjectKwargs(ast.NodeTransformer): + def visit_Call(self, node): + self.generic_visit(node) + # Inject **kwargs if not present + has_kwargs = any( + kw.arg is None + and isinstance(kw.value, ast.Name) + and kw.value.id == "kwargs" + for kw in node.keywords + ) + if not has_kwargs: + node.keywords.append( + ast.keyword( + arg=None, value=ast.Name(id="kwargs", ctx=ast.Load()) + ) + ) + return node + + # Transform AST to inject kwargs + tree = InjectKwargs().visit(tree) + ast.fix_missing_locations(tree) + + # Compile and execute transformed code + compiled_code = compile(tree, filename=str(code_file_path), mode="exec") + exec(compiled_code, ns, ns) + + # Collect actions from drive() calls if they were executed + # drive() function stores actions in env._episode_action_list + if "env" in kwargs: + env = kwargs["env"] + if hasattr(env, "_episode_action_list") and env._episode_action_list: + print( + f"Collected {len(env._episode_action_list)} actions from module-level drive() calls." + ) + return env._episode_action_list + + print("Code executed successfully, but no actions were collected.") + return [] diff --git a/embodichain/agents/hierarchy/llm.py b/embodichain/agents/hierarchy/llm.py new file mode 100644 index 00000000..dce50532 --- /dev/null +++ b/embodichain/agents/hierarchy/llm.py @@ -0,0 +1,72 @@ +# ---------------------------------------------------------------------------- +# Copyright (c) 2021-2025 DexForce Technology Co., Ltd. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# ---------------------------------------------------------------------------- + +import os +from langchain_openai import AzureChatOpenAI + +# ------------------------------------------------------------------------------ +# Environment configuration +# ------------------------------------------------------------------------------ + +# Clear proxy if not needed (optional, can be set via environment variables) + +os.environ["ALL_PROXY"] = "" +os.environ["all_proxy"] = "" + +# Proxy configuration (optional, uncomment if needed) +# os.environ["HTTP_PROXY"] = "http://127.0.0.1:7890" +# os.environ["HTTPS_PROXY"] = "http://127.0.0.1:7890" + +# API version (optional, defaults to "2024-10-21" if not set) +# os.environ["OPENAI_API_VERSION"] = "2024-10-21" + +# Note: AZURE_OPENAI_ENDPOINT and AZURE_OPENAI_API_KEY must be set via environment variables +# Example in bash: +# export AZURE_OPENAI_ENDPOINT="https://your-endpoint.openai.azure.com/" +# export AZURE_OPENAI_API_KEY="your-api-key" + +# ------------------------------------------------------------------------------ +# LLM factory +# ------------------------------------------------------------------------------ + + +def create_llm(*, temperature=0.0, model="gpt-4o"): + return AzureChatOpenAI( + temperature=temperature, + model=model, + azure_endpoint=os.getenv("AZURE_OPENAI_ENDPOINT"), + api_key=os.getenv("AZURE_OPENAI_API_KEY"), + api_version=os.getenv("OPENAI_API_VERSION", "2024-10-21"), + ) + + +# ------------------------------------------------------------------------------ +# LLM instances +# ------------------------------------------------------------------------------ + + +# Initialize LLM instances, but handle errors gracefully for documentation builds +def _create_llm_safe(*, temperature=0.0, model="gpt-4o"): + try: + return create_llm(temperature=temperature, model=model) + except Exception: + return None + + +task_llm = _create_llm_safe(temperature=0.0, model="gpt-4o") +code_llm = _create_llm_safe(temperature=0.0, model="gpt-4o") +validation_llm = _create_llm_safe(temperature=0.0, model="gpt-4o") +view_selection_llm = _create_llm_safe(temperature=0.0, model="gpt-4o") diff --git a/embodichain/agents/hierarchy/task_agent.py b/embodichain/agents/hierarchy/task_agent.py new file mode 100644 index 00000000..5f7b5403 --- /dev/null +++ b/embodichain/agents/hierarchy/task_agent.py @@ -0,0 +1,157 @@ +# ---------------------------------------------------------------------------- +# Copyright (c) 2021-2025 DexForce Technology Co., Ltd. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+# ---------------------------------------------------------------------------- + +from typing import List, Dict, Tuple +from embodichain.agents.hierarchy.agent_base import AgentBase +from langchain_core.prompts import ChatPromptTemplate +from embodichain.data import database_2d_dir +from embodichain.utils.utility import load_txt +from embodichain.agents.mllm.prompt import TaskPrompt +from embodichain.data import database_agent_prompt_dir +from pathlib import Path +import numpy as np +import time +import re + +USEFUL_INFO = """The error may be caused by: +1. You did not follow the basic background information, especially the world coordinate system with its xyz directions. +2. You did not take into account the NOTE given in the atom actions or in the example functions. +3. You did not follow the steps of the task descriptions.\n +""" + + +def extract_plan_and_validation(text: str) -> Tuple[str, List[str], List[str]]: + def get_section(src: str, name: str, next_name) -> str: + if next_name: + pat = re.compile( + rf"\[{name}\]\s*:\s*(.*?)\s*(?=\[{next_name}\]\s*:|\Z)", + re.DOTALL | re.IGNORECASE, + ) + else: + pat = re.compile( + rf"\[{name}\]\s*:\s*(.*?)\s*\Z", + re.DOTALL | re.IGNORECASE, + ) + m = pat.search(src) + return m.group(1).strip() if m else "" + + step_re = re.compile( + r"Step\s*\d+\s*:.*?(?=Step\s*\d+\s*:|\Z)", + re.DOTALL | re.IGNORECASE, + ) + + # ---- plans ---- + plans_raw = get_section(text, "PLANS", "VALIDATION_CONDITIONS") + plan_steps = [m.group(0).rstrip() for m in step_re.finditer(plans_raw)] + plan_str = "\n".join(plan_steps) + + # normalized plan list (strip "Step k:") + plan_list = [] + for step in plan_steps: + content = re.sub(r"^Step\s*\d+\s*:\s*", "", step, flags=re.IGNORECASE).strip() + if content: + plan_list.append(content) + + # ---- validations ---- + vals_raw = get_section(text, "VALIDATION_CONDITIONS", None) + validation_list = [] + for m in step_re.finditer(vals_raw): + content = re.sub( + r"^Step\s*\d+\s*:\s*", "", m.group(0), flags=re.IGNORECASE + ).strip() + if content: + validation_list.append(content) + + return plan_str, plan_list, validation_list + + +class TaskAgent(AgentBase): + prompt: ChatPromptTemplate + object_list: List[str] + target: np.ndarray + prompt_name: str + prompt_kwargs: Dict[str, Dict] + + def __init__(self, llm, **kwargs) -> None: + super().__init__(**kwargs) + if llm is None: + raise ValueError( + "LLM is None. 
Please set the following environment variables:\n" + " - AZURE_OPENAI_ENDPOINT\n" + " - AZURE_OPENAI_API_KEY\n" + "Example:\n" + " export AZURE_OPENAI_ENDPOINT='https://your-endpoint.openai.azure.com/'\n" + " export AZURE_OPENAI_API_KEY='your-api-key'" + ) + self.llm = llm + + def generate(self, **kwargs) -> str: + log_dir = kwargs.get( + "log_dir", Path(database_agent_prompt_dir) / self.task_name + ) + file_path = log_dir / "agent_generated_plan.txt" + + # Check if the file already exists + if not kwargs.get("regenerate", False): + if file_path.exists(): + print(f"Plan file already exists at {file_path}, skipping writing.") + return load_txt(file_path) + + # Generate query via LLM + prompts_ = getattr(TaskPrompt, self.prompt_name)(**kwargs) + if isinstance(prompts_, list): + # TODO: support two-stage prompts with feedback + start_time = time.time() + response = self.llm.invoke(prompts_[0]) + query = response.content + print( + f"\033[92m\nSystem tasks output ({np.round(time.time()-start_time, 4)}s):\n{query}\n\033[0m" + ) + for prompt in prompts_[1:]: + temp = prompt["kwargs"] + temp.update({"query": query}) + start_time = time.time() + response = self.llm.invoke(prompt["prompt"].invoke(temp)) + query = response.content + print( + f"\033[92m\nSystem tasks output({np.round(time.time()-start_time, 4)}s):\n{query}\n\033[0m" + ) + else: + # insert feedback if exists + if len(kwargs.get("error_messages", [])) != 0: + # just use the last one + last_plan = kwargs["generated_plans"][-1] + last_code = kwargs["generated_codes"][-1] + last_error = kwargs["error_messages"][-1] + + # Add extra human message with feedback + feedback_msg = self.build_feedback_message( + last_plan, last_code, last_error + ) + prompts_.messages.append(feedback_msg) + + response = self.llm.invoke(prompts_) + print(f"\033[92m\nTask agent output:\n{response.content}\n\033[0m") + + file_path.parent.mkdir(parents=True, exist_ok=True) + with open(file_path, "w") as f: + f.write(response.content) + print(f"Generated task plan saved to {file_path}") + + return response.content + + def act(self, *args, **kwargs): + return super().act(*args, **kwargs) diff --git a/embodichain/agents/hierarchy/validation_agent.py b/embodichain/agents/hierarchy/validation_agent.py new file mode 100644 index 00000000..98e5f159 --- /dev/null +++ b/embodichain/agents/hierarchy/validation_agent.py @@ -0,0 +1,241 @@ +# ---------------------------------------------------------------------------- +# Copyright (c) 2021-2025 DexForce Technology Co., Ltd. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# ---------------------------------------------------------------------------- + +import os +from langchain_core.messages import SystemMessage, HumanMessage +from abc import ABCMeta +from embodichain.utils.utility import encode_image_from_path +import glob +from embodichain.agents.hierarchy.llm import view_selection_llm + + +def save_obs_image(obs_image, save_dir, step_id=None): + """ + Save observation image using encode_image() and return its file path. 
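+ Returns None when obs_image is None; otherwise returns the pathlib.Path of the saved PNG file.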
+ """ + import base64 + from embodichain.utils.utility import encode_image + + if obs_image is None: + return None + + if isinstance(save_dir, str): + from pathlib import Path + + save_dir = Path(save_dir) + + save_dir.mkdir(parents=True, exist_ok=True) + + name = f"obs_step_{step_id}.png" if step_id is not None else "obs.png" + img_path = save_dir / name + + # Encode to base64 + base64_image = encode_image(obs_image) + + # Decode base64 → bytes + img_bytes = base64.b64decode(base64_image) + + # Write to file + with open(img_path, "wb") as f: + f.write(img_bytes) + + return img_path + + +def get_obj_position_info(env): + import json + + position_info = {} + obj_uids = env.sim.get_rigid_object_uid_list() + for obj_name in obj_uids: + target_obj = env.sim.get_rigid_object(obj_name) + target_obj_pose = target_obj.get_local_pose(to_matrix=True).squeeze(0)[:3, 3] + position_info[obj_name] = target_obj_pose.tolist() + return json.dumps(position_info, indent=4) + + +class ValidationAgent(metaclass=ABCMeta): + + def __init__(self, llm, **kwargs) -> None: + super().__init__() + for key, value in kwargs.items(): + setattr(self, key, value) + if llm is None: + raise ValueError( + "LLM is None. Please set the following environment variables:\n" + " - AZURE_OPENAI_ENDPOINT\n" + " - AZURE_OPENAI_API_KEY\n" + "Example:\n" + " export AZURE_OPENAI_ENDPOINT='https://your-endpoint.openai.azure.com/'\n" + " export AZURE_OPENAI_API_KEY='your-api-key'" + ) + self.llm = llm + + def validate(self, step_names, problematic_code, error_message, image_files): + # Construct the prompt + prompt = f""" + Analyze the execution of the following robot task: + + Task name: {self.task_name} + Task description: {self.task_description} + Basic background knowledge: {self.basic_background} + + You will be given images showing each step of the execution. For the step sequence: + {', '.join(step_names)} + + Provide the following analysis: + 1. Decide whether the full task succeeded or failed. + 2. If the task failed, provide a precise and detailed explanation. + + Below is a potentially problematic piece of code and the corresponding execution error: + + ```python + {problematic_code} + # Execution error: + {error_message} + Explain whether (and how) this code contributed to the observed failure. + """ + + # Prepare message content for API call + user_content = [] + + # Add textual prompt + user_content.append({"type": "text", "text": prompt}) + + # Add images and step names + for img_path in image_files: + filename = os.path.basename(img_path) + first_underscore_pos = filename.find("_") + if first_underscore_pos != -1: + step_name = filename[first_underscore_pos + 1 :].rsplit(".", 1)[0] + else: + step_name = filename.rsplit(".", 1)[0] + + # Add step name + user_content.append({"type": "text", "text": f"Step: {step_name}"}) + + # Add image as base64 + base64_image = encode_image_from_path(img_path) + user_content.append( + { + "type": "image_url", + "image_url": {"url": f"data:image/png;base64,{base64_image}"}, + } + ) + + messages = [ + SystemMessage( + content="You are a robot task execution analysis expert. Please analyze the provided image sequence." 
+ ), + HumanMessage(content=user_content), + ] + + response = self.llm.invoke(messages) + return response.content + + def select_best_view_dir( + self, img_dirs: dict, action_description: str, valid_condition: str + ): + """ + img_dirs: { + "cam_1": Path, + "cam_2": Path, + "cam_3": Path + } + """ + + # --- collect final images --- + last_images = {} + for cam_id, cam_dir in img_dirs.items(): + imgs = sorted( + glob.glob(os.path.join(cam_dir, "obs_step_*.png")), + key=lambda p: int(os.path.basename(p).split("_")[-1].split(".")[0]), + ) + if imgs: + last_images[cam_id] = imgs[-1] + + if not last_images: + raise ValueError("No images found in any camera directory.") + + # --- system prompt --- + system_prompt = ( + "You are a robot perception assistant specialized in VIEW SELECTION.\n\n" + "TASK:\n" + "- You are given ONE final observation image from EACH camera view.\n" + "- Your job is NOT to judge success or failure.\n" + "- Your job is ONLY to select the SINGLE camera view that is MOST SUITABLE\n" + " for OBJECT-LEVEL validation of the action result.\n\n" + "ACTION CONTEXT:\n" + "- The robot has just executed ONE atomic action.\n" + "- You are given the action intention and the expected object-level outcome\n" + " ONLY to help you decide which view best reveals that outcome.\n\n" + "SELECTION CRITERIA (PRIORITY ORDER):\n" + "- Prefer views with:\n" + " * the clearest visibility of the relevant object(s)\n" + " * minimal occlusion by the arm or environment\n" + " * the clearest evidence related to the expected object-level result\n" + " (e.g., contact, separation, support, stability)\n\n" + "STRICT CONSTRAINTS:\n" + "- Do NOT judge robot motion quality or execution accuracy.\n" + "- Do NOT reason about numeric values (distance, angle, offset).\n" + "- Do NOT decide whether the action succeeded or failed.\n" + "- If multiple views are acceptable, choose the clearest overall view.\n\n" + "OUTPUT FORMAT (STRICT):\n" + "Output EXACTLY ONE of the following tokens:\n" + "- cam_1\n" + "- cam_2\n" + "- cam_3\n" + ) + + # --- human content --- + human_content = [ + { + "type": "text", + "text": ( + "Select the best camera view for object-level validation.\n\n" + "--------------------------------------------------\n" + "ACTION DESCRIPTION (INTENT ONLY):\n" + f"{action_description}\n\n" + "EXPECTED OBJECT-LEVEL RESULT (REFERENCE ONLY):\n" + f"{valid_condition}\n" + "--------------------------------------------------" + ), + } + ] + + for cam_id, img_path in last_images.items(): + img_b64 = encode_image_from_path(img_path) + human_content.extend( + [ + {"type": "text", "text": f"View candidate: {cam_id}"}, + { + "type": "image_url", + "image_url": {"url": f"data:image/png;base64,{img_b64}"}, + }, + ] + ) + + messages = [ + SystemMessage(content=system_prompt), + HumanMessage(content=human_content), + ] + + response = view_selection_llm.invoke(messages).content.strip() + + if response not in img_dirs: + raise ValueError(f"Invalid camera selection from LLM: {response}") + + return response diff --git a/embodichain/agents/mllm/prompt/__init__.py b/embodichain/agents/mllm/prompt/__init__.py new file mode 100644 index 00000000..55bc408b --- /dev/null +++ b/embodichain/agents/mllm/prompt/__init__.py @@ -0,0 +1,8 @@ +# ---------------------------------------------------------------------------- +# Copyright (c) 2021-2025 DexForce Technology Co., Ltd. +# +# All rights reserved. 
+# ---------------------------------------------------------------------------- + +from .task_prompt import TaskPrompt +from .code_prompt import CodePrompt diff --git a/embodichain/agents/mllm/prompt/code_prompt.py b/embodichain/agents/mllm/prompt/code_prompt.py new file mode 100644 index 00000000..845064e5 --- /dev/null +++ b/embodichain/agents/mllm/prompt/code_prompt.py @@ -0,0 +1,149 @@ +# ---------------------------------------------------------------------------- +# Copyright (c) 2021-2025 DexForce Technology Co., Ltd. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# ---------------------------------------------------------------------------- + +from langchain_core.messages import SystemMessage +from langchain_core.prompts import ( + ChatPromptTemplate, + HumanMessagePromptTemplate, +) +from embodichain.utils.utility import encode_image + + +class CodePrompt: + @staticmethod + def one_stage_prompt(**kwargs) -> ChatPromptTemplate: + prompt = ChatPromptTemplate.from_messages( + [ + SystemMessage( + content="You are an AI assistant that can generate python code to execute robot arms." + ), + HumanMessagePromptTemplate.from_template( + [ + { + "type": "text", + "text": ( + "Generate a Python code snippet that accomplishes the following task:\n" + "{query}\n\n" + "You must strictly follow the rules and available functions described below:\n" + "{code_prompt}\n\n" + "Here are some reference examples of the expected output code:\n" + "{code_example}\n\n" + ), + } + ] + ), + ] + ) + return prompt.invoke(kwargs) + + @staticmethod + def unified_prompt(observations, **kwargs): + """ + Unified Vision→Code prompt: + - Model observes the image + - Understands the scene and the task goal + - Generates final executable Python code using atomic robot APIs + """ + + # Encode the image + observation = observations["rgb"] + kwargs.update({"observation": encode_image(observation)}) + + prompt = ChatPromptTemplate.from_messages( + [ + SystemMessage( + content=( + "You are a reliable Vision-Language-Code robot assistant. " + "You observe an image, understand the scene and the task goal, " + "and generate correct Python code using ONLY the allowed atomic robot actions. " + "Your final output must be a single Python code block." 
+ ) + ), + HumanMessagePromptTemplate.from_template( + [ + { + "type": "image_url", + "image_url": { + "url": "data:image/png;base64,{observation}", + }, + }, + { + "type": "text", + "text": ( + "### Task Goal\n" + "{task_prompt}\n\n" + "### Environment Background\n" + "{basic_background}\n\n" + "### Allowed Atomic Actions\n" + "{atom_actions}\n\n" + "### Code Rules\n" + "{code_prompt}\n\n" + "### Reference Code Examples\n" + "{code_example}\n\n" + "### Final Instructions\n" + "Understand the scene from the image and generate final executable Python code " + "that performs the task using ONLY the allowed atomic actions.\n\n" + "Your entire response must be EXACTLY one Python code block:\n" + "```python\n" + "# your solution code here\n" + "```\n" + ), + }, + ] + ), + ] + ) + + return prompt.invoke(kwargs) + + @staticmethod + def one_stage_prompt_according_to_task_plan(**kwargs) -> ChatPromptTemplate: + prompt = ChatPromptTemplate.from_messages( + [ + SystemMessage( + content=( + "You are a reliable robot control code generator.\n" + "Your task is to generate Python code that executes robot arm actions.\n\n" + "CRITICAL RULES:\n" + "- The TASK PLAN defines the available atomic actions, rules, and execution logic.\n" + "- You MUST strictly follow the TASK PLAN.\n" + "- The CONSTRAINTS section contains additional global constraints you must obey.\n" + "- Do NOT invent new actions, functions, parameters, or control flow.\n" + "- You MAY include Python comments (# ...) inside the code.\n" + "- Your ENTIRE response MUST be a single Python code block.\n" + "- The code block MUST be directly executable without modification.\n" + "- Do NOT include any text, explanation, or markdown outside the Python code block.\n" + ) + ), + HumanMessagePromptTemplate.from_template( + [ + { + "type": "text", + "text": ( + "TASK PLAN (atomic actions, rules, and intended behavior):\n" + "{task_plan}\n\n" + "GLOBAL CONSTRAINTS (must be satisfied):\n" + "{code_prompt}\n\n" + "REFERENCE CODE (style and structure only; do NOT copy logic):\n" + "{code_example}\n\n" + "Generate the corrected Python code now." + ), + } + ] + ), + ] + ) + return prompt.invoke(kwargs) diff --git a/embodichain/agents/mllm/prompt/task_prompt.py b/embodichain/agents/mllm/prompt/task_prompt.py new file mode 100644 index 00000000..746e0f18 --- /dev/null +++ b/embodichain/agents/mllm/prompt/task_prompt.py @@ -0,0 +1,144 @@ +# ---------------------------------------------------------------------------- +# Copyright (c) 2021-2025 DexForce Technology Co., Ltd. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+# ---------------------------------------------------------------------------- + +import torch +from langchain_core.messages import SystemMessage +from langchain_core.prompts import ( + ChatPromptTemplate, + HumanMessagePromptTemplate, +) +from embodichain.utils.utility import encode_image + + +class TaskPrompt: + @staticmethod + def one_stage_prompt(observations, **kwargs): + """ + Hybrid one-pass prompt: + Step 1: VLM analyzes the image and extracts object IDs. + Step 2: LLM generates task instructions using only those IDs. + """ + # Encode image + observation = ( + observations["rgb"].cpu().numpy() + if isinstance(observations["rgb"], torch.Tensor) + else observations["rgb"] + ) + kwargs.update({"observation": encode_image(observation)}) + + # Build hybrid prompt + prompt = ChatPromptTemplate.from_messages( + [ + SystemMessage( + content=( + "You are a precise and reliable robotic manipulation planner. " + "Given a camera observation and a task description, you must generate " + "a clear, step-by-step task plan for a robotic arm. " + "All actions must strictly use the provided atomic API functions, " + "and the plan must be executable without ambiguity." + ) + ), + HumanMessagePromptTemplate.from_template( + [ + { + "type": "image_url", + "image_url": { + "url": "data:image/png;base64,{observation}", + }, + }, + { + "type": "text", + "text": ( + "Here is the latest camera observation.\n" + "First, analyze the scene in the image.\n" + "Then, using the context below, produce an actionable task plan.\n\n" + "**Environment background:** \n{basic_background}\n\n" + '**Task goal:** \n"{task_prompt}"\n\n' + "**Available atomic actions:** \n{atom_actions}\n" + ), + }, + ] + ), + ] + ) + + # Return the prompt template and kwargs to be executed by the caller + return prompt.invoke(kwargs) + + @staticmethod + def two_stage_prompt(observations, **kwargs): + # for VLM generate image descriptions + prompt = ChatPromptTemplate.from_messages( + [ + SystemMessage( + content="You are a helpful assistant to operate a robotic arm with a camera to generate task plans according to descriptions." + ), + HumanMessagePromptTemplate.from_template( + [ + { + "type": "image_url", + "image_url": { + "url": "data:image/jpg;base64,{observation}", + }, + }, + { + "type": "text", + "text": "What is in the image? Return answer with their potential effects.", + }, + ] + ), + ] + ) + + observation = ( + observations["rgb"].cpu().numpy() + if isinstance(observations["rgb"], torch.Tensor) + else observations["rgb"] + ) + kwargs.update({"observation": encode_image(observation)}) + # for LLM generate task descriptions + prompt_query = ChatPromptTemplate.from_messages( + [ + SystemMessage( + content="You are a helpful assistant to operate a robotic arm with a camera to generate task plans according to descriptions." 
+ ), + HumanMessagePromptTemplate.from_template( + [ + { + "type": "image_url", + "image_url": { + "url": "data:image/jpg;base64,{observation}", + }, + }, + { + "type": "text", + "text": "Here is analysis for this image: {query}.", + }, + { + "type": "text", + "text": ( + "Using the context below, produce an actionable task plan.\n\n" + "**Environment background:** \n{basic_background}\n\n" + '**Task goal:** \n"{task_prompt}"\n\n' + "**Available atomic actions:** \n{atom_actions}\n" + ), + }, + ] + ), + ] + ) + + return [prompt.invoke(kwargs), {"prompt": prompt_query, "kwargs": kwargs}] diff --git a/embodichain/agents/prompts/atom_actions.txt b/embodichain/agents/prompts/atom_actions.txt new file mode 100644 index 00000000..257464c5 --- /dev/null +++ b/embodichain/agents/prompts/atom_actions.txt @@ -0,0 +1,136 @@ +### Atom Functions for Robot Arm Control +Each atomic function returns a list of joint-space trajectories (list[np.ndarray]). +All functions support an optional argument: + +force_valid: bool + If True, the system will automatically correct an invalid target pose by + projecting it to the nearest valid pose. Use this option carefully: + enable it only for actions where small spatial deviations are acceptable + and will not compromise task correctness. Default is False. + +Use the following functions exactly as defined. Do not invent new APIs or parameters. + +"grasp": + def grasp(robot_name: str, obj_name: str, pre_grasp_dis: float, **kwargs) -> list[np.ndarray] + + Moves the specified arm to the target object’s affordance-based grasp pose and executes a grasp by closing the gripper. + + The function plans a two-stage trajectory: + (1) from the current pose to a pre-grasp pose offset from the object, and + (2) from the pre-grasp pose to the final grasp pose, followed by gripper closure. + + Upon completion, the gripper is closed and the target object is expected to be stably held by the gripper. + + Example: + grasp(robot_name='right_arm', obj_name='bottle', pre_grasp_dis=0.10) # Moves the right arm to a pre-grasp pose 10 cm from the bottle, then to the grasp pose and closes the gripper to grasp the bottle. + +"place_on_table": + def place_on_table(robot_name: str, obj_name: str, x: float, y: float, pre_place_dis: float, **kwargs) -> list[np.ndarray] + + Moves the specified robot arm with the target object to the desired [x, y] location on the table and opens the gripper to place the object. + The z-coordinate is automatically adjusted based on the table height and the object’s dimensions. + This function assumes that the robot is already holding the object and that the task is to place it on the table at the specified coordinates. + Remember that when you need to place some objects on the table at specific coordinates, use this function without using other movement atom actions. + Otherwise, **if you need to place some objects relative to some place, then use "move_relative_to_object" first to move to the desired position, then use "open_gripper" to release the object.** + + Example: + place_on_table(robot_name='right_arm', obj_name='bottle', x=0.1, y=0.5, pre_place_dis=0.08) # Moves the right arm to a pre-place position 8 cm from the table, then places the bottle at the specified [0.1, 0.5] location on the table and opens the gripper. 
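+
+    Combined example (illustrative; 'bottle' and the target coordinates are placeholders):
+        grasp(robot_name='right_arm', obj_name='bottle', pre_grasp_dis=0.10) # Grasp the bottle with the right arm.
+        place_on_table(robot_name='right_arm', obj_name='bottle', x=0.1, y=0.5, pre_place_dis=0.08, force_valid=True) # Place it at [0.1, 0.5]; force_valid=True projects a slightly unreachable place pose to the nearest valid pose, acceptable here because a small positional deviation does not affect success.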
+ +"move_relative_to_object": + def move_relative_to_object(robot_name: str, obj_name: str, + x_offset=0, y_offset=0, z_offset=0, + **kwargs) -> list[np.ndarray] + Moves the end-effector to a pose defined relative to the target object: + target = object_position + [x_offset, y_offset, z_offset] + Orientation is preserved. + Example: + move_relative_to_object(robot_name='right_arm', obj_name='cup', + x_offset=0.05, y_offset=0.10, z_offset=0.10) # Moves the right arm’s end-effector to a spot located 5 cm forward, 10 cm to the left, and 10 cm above the cup, while preserving the current gripper orientation. + move_relative_to_object(robot_name='right_arm', obj_name='cup', + x_offset=-0.05, y_offset=-0.10, z_offset=0.10) # Moves the right arm’s end-effector to a spot located 5 cm backward, 10 cm to the right, and 10 cm above the cup, while preserving the current gripper orientation. + +"move_to_absolute_position": + def move_to_absolute_position(robot_name: str, + x=None, y=None, z=None, + **kwargs) -> list[np.ndarray] + Moves the end-effector to an absolute (x, y, z) position in world coordinates. + Any coordinate set to None remains unchanged. + Orientation is preserved. + Example: + move_to_absolute_position(robot_name='right_arm', x=0.10, y=0.10, z=None) # Moves the end-effector to the absolute world position (x=0.10 m, y=0.10 m) while leaving z unchanged, and preserves the orientation. + +"move_by_relative_offset": + def move_by_relative_offset(robot_name: str, + dx=0.0, dy=0.0, dz=0.0, mode='extrinsic', + **kwargs) -> list[np.ndarray] + Moves the end-effector by a relative translation: + new_position = current_position + [dx, dy, dz] + The offset is applied along the specified axes using the given mode, while preserving the original end-effector orientation. + Mode can be 'extrinsic' (world frame) or 'intrinsic' (end-effector frame). If you want to move along the world axes, use 'extrinsic'. If you want to move along the end-effector’s local axes, use "intrinsic". + Example: + move_by_relative_offset(robot_name='right_arm', dx=0.05, dy=-0.10, dz=0.20, mode='extrinsic') # Translates the end-effector by +5 cm in x (front), −10 cm in y (right), +20 cm in z (above) in the world coordinate, relative to its current position, with orientation preserved. + move_by_relative_offset(robot_name='right_arm', dx=0, dy=0, dz=0.1, mode='intrinsic') # Translates the end-effector by +10 cm in z (forward) in the EEF coordinate, meaning that it moves forward relative to its current facing direction, with orientation preserved. + +"rotate_eef" + def rotate_eef(robot_name: str, degree: float, **kwargs) -> list[np.ndarray] + Rotates the wrist roll joint (joint index 5) of the specified arm by the + given number of degrees. End-effector position is preserved. + Example: + rotate_eef(robot_name='right_arm', degree=-90) # Rotates the right arm’s wrist-roll joint by −45° (counterclockwise), while keeping the end-effector position unchanged. This is a joint-level rotation, not a full orientation override. + rotate_eef(robot_name='right_arm', degree=90) # Rotates the right arm’s wrist-roll joint by 45° (clockwise), while keeping the end-effector position unchanged. This is a joint-level rotation, not a full orientation override. + Typical use cases: + Pouring or tilting a grasped object. + Rotating the gripper around its forward axis without translating the end effector. + After rotating, you typically need to apply an opposite rotation back to return to the original pose. 
+ Usage notes: + Rotation sign convention: negative = counterclockwise, positive = clockwise, viewed along the end-effector forward axis. + For pouring with the right arm, a common pattern is: first apply a negative rotation to start pouring, then apply a positive rotation to return. + For the left arm, the sign convention is typically reversed. + +"orient_eef": + def orient_eef(robot_name: str, + direction: str = 'front', # 'front' or 'down' + **kwargs) -> list[np.ndarray] + Reorients the end-effector to a predefined canonical orientation in the + WORLD coordinate frame, while keeping the EE’s current position fixed. + This function replaces the entire 3×3 orientation matrix of the current + end-effector pose. + Usage notes: + This function should only be used when you explicitly need to override the end-effector’s full orientation. + This differs from rotate_eef(). orient_eef performs a full orientation override of the end-effector, not a single-joint rotation. For tasks like pouring, no need to use it. + For general wrist rotation, prefer using rotate_eef instead. + For aligning tasks, use "front" or "down" orientations as needed. + Supported orientations: + • 'front' : Align the end-effector so its direction faces forward. + • 'down' : Align the end-effector so its direction faces downward. + Example: + orient_eef(robot_name='right_arm', direction='front') # Reorients the right arm’s end-effector so it faces forward + +"back_to_initial_pose": + def back_to_initial_pose(robot_name: str, **kwargs) -> list[np.ndarray] + Returns the specified arm to its predefined initial joint configuration + stored in the environment. + Example: + back_to_initial_pose(robot_name='right_arm') # Returns the right arm back to its predefined initial joint configuration stored in the environment, regardless of its current pose. + +"close_gripper": + def close_gripper(robot_name: str, **kwargs) -> list[np.ndarray] + Closes the arm’s gripper using a short (10-step) gripper-only trajectory. + Example: + close_gripper(robot_name='right_arm') # Closes the right gripper using a short, smooth 10-step gripper-only trajectory. + +"open_gripper": + def open_gripper(robot_name: str, **kwargs) -> list[np.ndarray] + Opens the arm’s gripper using a short (10-step) gripper-only trajectory. + Example: + open_gripper(robot_name='right_arm') # Opens the right gripper using a 10-step gripper-only trajectory. + +### Drive Function (Trajectory Synchronization) +"drive": + def drive(left_arm_action=None, right_arm_action=None, **kwargs) -> list[torch.Tensor] + Wraps one or two arm trajectories into synchronized full-robot actions. + • If only one arm action is provided, the other arm stays idle. + • If both are provided, they are temporally aligned and executed together. + • The actions are obtained from the output of the above functions. + Example: + drive(left_arm_action=left_actions, right_arm_action=right_actions) \ No newline at end of file diff --git a/embodichain/agents/prompts/basic_background.txt b/embodichain/agents/prompts/basic_background.txt new file mode 100644 index 00000000..dc6d1c30 --- /dev/null +++ b/embodichain/agents/prompts/basic_background.txt @@ -0,0 +1,42 @@ +The environment uses a right-handed world coordinate system, where 1 unit equals 1 meter. +All robot poses are represented as 4×4 homogeneous transformation matrices. + +The robot base coordinate frame is the ONLY authoritative frame for all spatial reasoning, planning, and action generation. 
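+
+For reference, a pose matrix T follows the standard homogeneous layout (illustrative notation):
+
+    T = [[ r11 r12 r13 tx ],
+         [ r21 r22 r23 ty ],
+         [ r31 r32 r33 tz ],
+         [  0   0   0   1 ]]
+
+where the upper-left 3×3 block is the rotation and the last column holds the position [tx, ty, tz] in meters, so the position of an object or end-effector is T[:3, 3].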
+ +CAMERA AND IMAGE INTERPRETATION + +The camera is positioned in front of the robot, facing the robot arm and looking toward the robot base. +Because of this viewpoint, the rendered image is horizontally mirrored relative to the robot base frame. +This mirroring affects LEFT–RIGHT only. There is NO vertical or depth inversion. + +Mirror mapping (image → robot base frame): + +* Image left corresponds to robot right +* Image right corresponds to robot left +* Image up corresponds to robot up +* Image down corresponds to robot down + +REQUIRED REASONING PERSPECTIVE (NON-NEGOTIABLE) + +You must ignore the camera and rendered image orientation when reasoning. +All spatial reasoning must be performed as if you are physically located at the robot base, looking outward along the robot’s +x (forward) direction. + +Do NOT reason from the camera viewpoint. +Do NOT trust left/right as shown in the image. +Always remap image left/right before reasoning. + +ROBOT BASE COORDINATE DEFINITIONS + +All directions below are defined strictly in the robot base frame: + +* Moving forward increases x +* Moving backward decreases x +* Moving left increases y (appears as right in the image) +* Moving right decreases y (appears as left in the image) +* Moving up increases z +* Moving down decreases z + +ROBOT INITIALIZATION AND TERMINATION + +Both robot arms start in predefined initial configurations with their end-effectors open. +At task completion, both arms must be returned to their initial poses. \ No newline at end of file diff --git a/embodichain/agents/prompts/code_example.txt b/embodichain/agents/prompts/code_example.txt new file mode 100644 index 00000000..c2952fed --- /dev/null +++ b/embodichain/agents/prompts/code_example.txt @@ -0,0 +1,35 @@ +# Python scripts +# Use the right arm to grasp bottle, move to the target location (x=0.2, y=0.1), and then open the gripper to release the object. + +```python +# Step 1 — Reach and grasp the bottle +drive( + right_arm_action=grasp( + robot_name="right_arm", + obj_name="bottle", + ), +) + +# Step 2 — Move to target location +drive( + right_arm_action=move_to_absolute_position( + robot_name="right_arm", + x=0.2, + y=0.1, + ), +) + +# Step 3 — Open gripper to release the object +drive( + right_arm_action=open_gripper( + robot_name="right_arm", + ), +) + +# Step 4 — Return the arm to the initial pose +drive( + right_arm_action=back_to_initial_pose( + robot_name="right_arm", + ), +) +``` \ No newline at end of file diff --git a/embodichain/agents/prompts/code_prompt.txt b/embodichain/agents/prompts/code_prompt.txt new file mode 100644 index 00000000..3fadf1c9 --- /dev/null +++ b/embodichain/agents/prompts/code_prompt.txt @@ -0,0 +1,7 @@ +Constraints: +- Every atomic action MUST be executed via a single drive(...) call. +- Each drive(...) call must directly contain the atomic action(s); do NOT define actions separately and then pass them into drive. +- For single-arm execution: specify the active arm’s action and explicitly set the unused arm to None within the same drive(...) call. +- For dual-arm execution: both arms’ actions MUST be specified within the same drive(...) call. +- Use exactly one drive(...) call per step; no exceptions. +- Output MUST be executable Python code only: no explanations, no comments, no markdown, and no extra text. 
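+
+Illustrative single-arm step that satisfies the constraints above ('bottle' is a placeholder object):
+    drive(left_arm_action=None, right_arm_action=grasp(robot_name="right_arm", obj_name="bottle", pre_grasp_dis=0.10))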
\ No newline at end of file diff --git a/embodichain/data/__init__.py b/embodichain/data/__init__.py index 9e152ab9..fccbbc24 100644 --- a/embodichain/data/__init__.py +++ b/embodichain/data/__init__.py @@ -14,5 +14,13 @@ # limitations under the License. # ---------------------------------------------------------------------------- +import os + + +database_dir = os.path.dirname(os.path.abspath(__file__)).replace("data", "database") +database_2d_dir = os.path.join(database_dir, "2dasset") +database_agent_prompt_dir = os.path.join(database_dir, "agent_prompt") +database_demo_dir = os.path.join(database_dir, "demostration") + from . import assets from .dataset import * diff --git a/embodichain/data/enum.py b/embodichain/data/enum.py index cf758192..edf84c3c 100644 --- a/embodichain/data/enum.py +++ b/embodichain/data/enum.py @@ -34,6 +34,18 @@ class SemanticMask(IntEnum): ROBOT = 2 +class Modality(Enum): + STATES = "states" + STATE_INDICATOR = "state_indicator" + ACTIONS = "actions" + ACTION_INDICATOR = "action_indicator" + IMAGES = "images" + LANG = "lang" + LANG_INDICATOR = "lang_indicator" + GEOMAP = "geomap" # e.g., depth, point cloud, etc. + VISION_LANGUAGE = "vision_language" # e.g., image + lang + + class EndEffector(Enum): GRIPPER = "gripper" DEXTROUSHAND = "hand" @@ -53,6 +65,16 @@ class ControlParts(Enum): WAIST = "waist" +class ControlPartsMappingW1(Enum): + ANKLE_IN_TORSO = 0 + KNEE_IN_TORSO = 1 + BUTTOCK_IN_TORSO = 2 + WAIST_IN_TORSO = 3 + + NECK1_IN_HEAD = 0 + NECK2_IN_HEAD = 1 + + class Hints(Enum): EEF = ( ControlParts.LEFT_EEF.value, @@ -74,3 +96,33 @@ class EefType(Enum): class ActionMode(Enum): ABSOLUTE = "" RELATIVE = "delta_" # This indicates the action is relative change with respect to last state. + + +class PrivilegeType(Enum): + EXTEROCEPTION = "exteroception" + MASK = "mask" + STATE = "state" + PROGRESS = "progress" + + +SUPPORTED_PROPRIO_TYPES = [ + ControlParts.LEFT_ARM.value + EefType.POSE.value, + ControlParts.RIGHT_ARM.value + EefType.POSE.value, + ControlParts.LEFT_ARM.value + JointType.QPOS.value, + ControlParts.RIGHT_ARM.value + JointType.QPOS.value, + ControlParts.LEFT_EEF.value + EndEffector.DEXTROUSHAND.value, + ControlParts.RIGHT_EEF.value + EndEffector.DEXTROUSHAND.value, + ControlParts.LEFT_EEF.value + EndEffector.GRIPPER.value, + ControlParts.RIGHT_EEF.value + EndEffector.GRIPPER.value, + ControlParts.HEAD.value + JointType.QPOS.value, + ControlParts.WAIST.value + JointType.QPOS.value, +] +SUPPORTED_ACTION_TYPES = SUPPORTED_PROPRIO_TYPES + [ + ControlParts.LEFT_ARM.value + ActionMode.RELATIVE.value + JointType.QPOS.value, + ControlParts.RIGHT_ARM.value + ActionMode.RELATIVE.value + JointType.QPOS.value, +] +SUPPORTED_EXTRA_VISION_TYPES = [ + Modality.GEOMAP.value, + PrivilegeType.EXTEROCEPTION.value, + PrivilegeType.MASK.value, +] diff --git a/embodichain/lab/gym/envs/__init__.py b/embodichain/lab/gym/envs/__init__.py index 88257690..e4a16daa 100644 --- a/embodichain/lab/gym/envs/__init__.py +++ b/embodichain/lab/gym/envs/__init__.py @@ -26,6 +26,14 @@ PourWaterEnv, ) from embodichain.lab.gym.envs.tasks.tableware.scoop_ice import ScoopIce +from embodichain.lab.gym.envs.tasks.tableware.pour_water_v3 import ( + PourWaterEnv3, + PourWaterAgentEnv3, +) +from embodichain.lab.gym.envs.tasks.tableware.rearrangement_v3 import ( + RearrangementEnv3, + RearrangementAgentEnv3, +) # Reinforcement learning environments from embodichain.lab.gym.envs.tasks.rl.push_cube import PushCubeEnv diff --git a/embodichain/lab/gym/envs/action_bank/configurable_action.py 
b/embodichain/lab/gym/envs/action_bank/configurable_action.py index 3dd649e9..37c5117e 100644 --- a/embodichain/lab/gym/envs/action_bank/configurable_action.py +++ b/embodichain/lab/gym/envs/action_bank/configurable_action.py @@ -30,7 +30,7 @@ from embodichain.lab.sim.cfg import MarkerCfg from embodichain.lab.gym.utils.misc import resolve_env_params, data_key_to_control_part from embodichain.data.enum import Hints, EefExecute -from .utils import generate_affordance_from_src, get_init_affordance +from .utils import generate_affordance_from_src # https://stackoverflow.com/questions/41834530/how-to-make-python-decorators-work-like-a-tag-to-make-function-calls-by-tag @@ -580,10 +580,10 @@ def initialize_with_given_qpos(action_list, executor, executor_init_info, env): if given_qpos is None: log_warning( "No given_qpos is provided for initialize_with_given_qpos. Using {}.".format( - get_init_affordance(executor) + "{}_{}_qpos".format(executor, "init") ) ) - given_qpos = env.affordance_datas[get_init_affordance(executor)] + given_qpos = env.affordance_datas["{}_{}_qpos".format(executor, "init")] executor_qpos_dim = action_list[executor].shape[0] given_qpos = np.asarray(given_qpos) @@ -914,7 +914,7 @@ def _pad_or_trim_last(x, new_T): # After node initialization, check if env.affordance_datas contains updated initial value for each executor. for executor in scope.keys(): - init_node_name = get_init_affordance(executor) + init_node_name = "{}_{}_qpos".format(executor, "init") if ( not hasattr(env, "affordance_datas") or init_node_name not in env.affordance_datas diff --git a/embodichain/lab/gym/envs/action_bank/utils.py b/embodichain/lab/gym/envs/action_bank/utils.py index 255e5b8f..7ba44527 100644 --- a/embodichain/lab/gym/envs/action_bank/utils.py +++ b/embodichain/lab/gym/envs/action_bank/utils.py @@ -25,8 +25,26 @@ """Node Generation Utils""" -def get_init_affordance(scope: str, tag: str = "init") -> str: - return "{}_{}_qpos".format(scope, tag) +def get_control_part_joint_ids(env, key: str) -> List[int]: + from embodichain.data.enum import ( + ControlParts, + ControlPartsMappingW1, + ) + from embodichain.lab.gym.envs.action_bank.configurable_action import ( + get_control_part, + ) + + control_part = get_control_part(env, key) + if control_part == ControlParts.WAIST.value: + waist_joint_id = env.robot.get_joint_ids(name=ControlParts.TORSO.value)[ + ControlPartsMappingW1.WAIST_IN_TORSO.value + ] + if not isinstance(waist_joint_id, (list)): + return [waist_joint_id] + return waist_joint_id + + else: + return env.robot.get_joint_ids(name=control_part, remove_mimic=True) def generate_affordance_from_src( diff --git a/embodichain/lab/gym/envs/managers/dataset_manager.py b/embodichain/lab/gym/envs/managers/dataset_manager.py index a0ca1688..d1ea3049 100644 --- a/embodichain/lab/gym/envs/managers/dataset_manager.py +++ b/embodichain/lab/gym/envs/managers/dataset_manager.py @@ -87,20 +87,28 @@ def __init__(self, cfg: object, env: EmbodiedEnv): super().__init__(cfg, env) ## TODO: fix configurable_action.py to avoid getting env.metadata['dataset'] - # Extract robot_meta from first functor and add to env.metadata for backward compatibility + # Extract robot_meta and instruction from first functor and add to env.metadata for backward compatibility # This allows legacy code (like action_bank) to access robot_meta via env.metadata["dataset"]["robot_meta"] for mode_cfgs in self._mode_functor_cfgs.values(): for functor_cfg in mode_cfgs: - if "robot_meta" in functor_cfg.params: + if ( + "robot_meta" in 
functor_cfg.params + or "instruction" in functor_cfg.params + ): if not hasattr(env, "metadata"): env.metadata = {} if "dataset" not in env.metadata: env.metadata["dataset"] = {} - env.metadata["dataset"]["robot_meta"] = functor_cfg.params[ - "robot_meta" - ] + if "robot_meta" in functor_cfg.params: + env.metadata["dataset"]["robot_meta"] = functor_cfg.params[ + "robot_meta" + ] + if "instruction" in functor_cfg.params: + env.metadata["dataset"]["instruction"] = functor_cfg.params[ + "instruction" + ] logger.log_info( - "Added robot_meta to env.metadata for backward compatibility" + "Added robot_meta and instruction to env.metadata for backward compatibility" ) break else: diff --git a/embodichain/lab/gym/envs/tasks/tableware/base_agent_env.py b/embodichain/lab/gym/envs/tasks/tableware/base_agent_env.py new file mode 100644 index 00000000..e17557de --- /dev/null +++ b/embodichain/lab/gym/envs/tasks/tableware/base_agent_env.py @@ -0,0 +1,198 @@ +import torch +from embodichain.utils import logger + + +class BaseAgentEnv: + + def _init_agents(self, agent_config, task_name, agent_config_path=None): + from embodichain.agents.hierarchy.task_agent import TaskAgent + from embodichain.agents.hierarchy.code_agent import CodeAgent + from embodichain.agents.hierarchy.validation_agent import ValidationAgent + from embodichain.agents.hierarchy.llm import ( + task_llm, + code_llm, + validation_llm, + ) + + if agent_config.get("TaskAgent") is not None: + self.task_agent = TaskAgent( + task_llm, + **agent_config["Agent"], + **agent_config["TaskAgent"], + task_name=task_name, + config_dir=agent_config_path, + ) + self.code_agent = CodeAgent( + code_llm, + **agent_config["Agent"], + **agent_config.get("CodeAgent"), + task_name=task_name, + config_dir=agent_config_path, + ) + self.validation_agent = ValidationAgent( + validation_llm, + task_name=task_name, + task_description=self.code_agent.prompt_kwargs.get("task_prompt")[ + "content" + ], + basic_background=self.code_agent.prompt_kwargs.get("basic_background")[ + "content" + ], + atom_actions=self.code_agent.prompt_kwargs.get("atom_actions")["content"], + ) + + def get_states(self): + # TODO: only support num_env = 1 for now + # store robot states in each env.reset + self.init_qpos = self.robot.get_qpos().squeeze(0) + + self.left_arm_joints = self.robot.get_joint_ids(name="left_arm") + self.right_arm_joints = self.robot.get_joint_ids(name="right_arm") + self.left_eef_joints = self.robot.get_joint_ids(name="left_eef") + self.right_eef_joints = self.robot.get_joint_ids(name="right_eef") + + self.left_arm_init_qpos = self.init_qpos[self.left_arm_joints] + self.right_arm_init_qpos = self.init_qpos[self.right_arm_joints] + + self.left_arm_init_xpos = self.robot.compute_fk( + self.left_arm_init_qpos, name="left_arm", to_matrix=True + ).squeeze(0) + self.right_arm_init_xpos = self.robot.compute_fk( + self.right_arm_init_qpos, name="right_arm", to_matrix=True + ).squeeze(0) + + self.left_arm_current_qpos = self.left_arm_init_qpos + self.right_arm_current_qpos = self.right_arm_init_qpos + + self.left_arm_current_xpos = self.left_arm_init_xpos + self.right_arm_current_xpos = self.right_arm_init_xpos + + self.left_arm_base_pose = self.robot.get_control_part_base_pose( + "left_arm", to_matrix=True + ).squeeze(0) + self.right_arm_base_pose = self.robot.get_control_part_base_pose( + "right_arm", to_matrix=True + ).squeeze(0) + + self.open_state = torch.tensor([0.05]) + self.close_state = torch.tensor([0.0]) + self.left_arm_current_gripper_state = self.open_state + 
self.right_arm_current_gripper_state = self.open_state + + # store some useful obj information + init_obj_info = {} + obj_uids = self.sim.get_rigid_object_uid_list() + for obj_name in obj_uids: + obj = self.sim.get_rigid_object(obj_name) + obj_pose = obj.get_local_pose(to_matrix=True).squeeze(0) + obj_height = obj_pose[2, 3] # Extract the height (z-coordinate) + obj_grasp_pose = self.affordance_datas.get( + f"{obj_name}_grasp_pose_object", None + ) + init_obj_info[obj_name] = { + "pose": obj_pose, # Store the full pose (4x4 matrix) + "height": obj_height, # Store the height (z-coordinate) + "grasp_pose_obj": ( + obj_grasp_pose.squeeze(0) if obj_grasp_pose is not None else None + ), # Store the grasp pose if available + } + self.init_obj_info = init_obj_info + + # -------------------- Common getters / setters -------------------- + + def get_obs_for_agent(self): + obs = self.get_obs(get_valid_sensor_data=True) + rgb = obs["sensor"]["cam_high"]["color"].squeeze(0) + valid_rgb_1 = obs["sensor"]["valid_cam_1"]["color"].squeeze(0) + valid_rgb_2 = obs["sensor"]["valid_cam_2"]["color"].squeeze(0) + valid_rgb_3 = obs["sensor"]["valid_cam_3"]["color"].squeeze(0) + + # obs_image_path = save_obs_image(obs_image=self.get_obs_for_agent()["rgb_1"], save_dir='./', step_id=0) + + return { + "rgb": rgb, + "valid_rgb_1": valid_rgb_1, + "valid_rgb_2": valid_rgb_2, + "valid_rgb_3": valid_rgb_3, + } + + # depth = obs["sensor"]["cam_high"]["depth"].squeeze(0) + # mask = obs["sensor"]["cam_high"]["mask"].squeeze(0) + # semantic_mask = obs["sensor"]["cam_high"]["semantic_mask_l"].squeeze(0) + # return {"rgb": rgb, "depth": depth, "mask": mask, "semantic_mask": semantic_mask} + + def get_current_qpos_agent(self): + return self.left_arm_current_qpos, self.right_arm_current_qpos + + def set_current_qpos_agent(self, arm_qpos, is_left): + if is_left: + self.left_arm_current_qpos = arm_qpos + else: + self.right_arm_current_qpos = arm_qpos + + def get_current_xpos_agent(self): + return self.left_arm_current_xpos, self.right_arm_current_xpos + + def set_current_xpos_agent(self, arm_xpos, is_left): + if is_left: + self.left_arm_current_xpos = arm_xpos + else: + self.right_arm_current_xpos = arm_xpos + + def get_current_gripper_state_agent(self): + return self.left_arm_current_gripper_state, self.right_arm_current_gripper_state + + def set_current_gripper_state_agent(self, arm_gripper_state, is_left): + if is_left: + self.left_arm_current_gripper_state = arm_gripper_state + else: + self.right_arm_current_gripper_state = arm_gripper_state + + # -------------------- IK / FK -------------------- + def get_arm_ik(self, target_xpos, is_left, qpos_seed=None): + control_part = "left_arm" if is_left else "right_arm" + ret, qpos = self.robot.compute_ik( + name=control_part, pose=target_xpos, joint_seed=qpos_seed + ) + return ret.all().item(), qpos.squeeze(0) + + def get_arm_fk(self, qpos, is_left): + control_part = "left_arm" if is_left else "right_arm" + xpos = self.robot.compute_fk( + name=control_part, qpos=torch.as_tensor(qpos), to_matrix=True + ) + return xpos.squeeze(0) + + # -------------------- get only code for action list -------------------- + def generate_code_for_actions(self, regenerate=False, **kwargs): + logger.log_info( + f"Generate code for creating action list for {self.code_agent.task_name}.", + color="green", + ) + + # Task planning + print(f"\033[92m\nStart task planning.\n\033[0m") + + task_agent_input = self.task_agent.get_composed_observations( + env=self, regenerate=regenerate, **kwargs + ) + task_plan = 
self.task_agent.generate(**task_agent_input) + + # Code generation + print(f"\033[94m\nStart code generation.\n\033[0m") + code_agent_input = self.code_agent.get_composed_observations( + env=self, regenerate=regenerate, **kwargs + ) + code_agent_input["task_plan"] = task_plan + + code_file_path, kwargs, code = self.code_agent.generate(**code_agent_input) + return code_file_path, kwargs, code + + # -------------------- get action list -------------------- + def create_demo_action_list(self, regenerate=False): + code_file_path, kwargs, _ = self.generate_code_for_actions( + regenerate=regenerate + ) + action_list = self.code_agent.act(code_file_path, **kwargs) + self.action_length = len(action_list) + return action_list diff --git a/embodichain/lab/gym/envs/tasks/tableware/pour_water_v3.py b/embodichain/lab/gym/envs/tasks/tableware/pour_water_v3.py new file mode 100644 index 00000000..c4b469b6 --- /dev/null +++ b/embodichain/lab/gym/envs/tasks/tableware/pour_water_v3.py @@ -0,0 +1,78 @@ +# ---------------------------------------------------------------------------- +# Copyright (c) 2021-2025 DexForce Technology Co., Ltd. +# +# All rights reserved. +# ---------------------------------------------------------------------------- + +import os +import torch +import numpy as np +from copy import deepcopy +from typing import Dict, Union, Optional, Sequence, Tuple, List + +from embodichain.lab.gym.envs import EmbodiedEnv, EmbodiedEnvCfg +from embodichain.lab.gym.utils.registration import register_env +from embodichain.utils import configclass, logger + +from embodichain.lab.gym.envs.tasks.tableware.base_agent_env import BaseAgentEnv + +__all__ = ["PourWaterEnv3"] + + +@register_env("PourWater-v3", max_episode_steps=600) +class PourWaterEnv3(EmbodiedEnv): + def __init__(self, cfg: EmbodiedEnvCfg = None, **kwargs): + super().__init__(cfg, **kwargs) + + action_config = kwargs.get("action_config", None) + if action_config is not None: + self.action_config = action_config + + def is_task_success(self, **kwargs) -> torch.Tensor: + """Determine if the task is successfully completed. This is mainly used in the data generation process + of the imitation learning. + + Args: + **kwargs: Additional arguments for task-specific success criteria. + + Returns: + torch.Tensor: A boolean tensor indicating success for each environment in the batch. 
+ """ + + bottle = self.sim.get_rigid_object("bottle") + cup = self.sim.get_rigid_object("cup") + + bottle_final_xpos = bottle.get_local_pose(to_matrix=True) + cup_final_xpos = cup.get_local_pose(to_matrix=True) + + bottle_ret = self._is_fall(bottle_final_xpos) + cup_ret = self._is_fall(cup_final_xpos) + + return ~(bottle_ret | cup_ret) + + def _is_fall(self, pose: torch.Tensor) -> torch.Tensor: + # Extract z-axis from rotation matrix (last column, first 3 elements) + pose_rz = pose[:, :3, 2] + world_z_axis = torch.tensor([0, 0, 1], dtype=pose.dtype, device=pose.device) + + # Compute dot product for each batch element + dot_product = torch.sum(pose_rz * world_z_axis, dim=-1) # Shape: (batch_size,) + + # Clamp to avoid numerical issues with arccos + dot_product = torch.clamp(dot_product, -1.0, 1.0) + + # Compute angle and check if fallen + angle = torch.arccos(dot_product) + return angle >= torch.pi / 4 + + +@register_env("PourWaterAgent-v3", max_episode_steps=600) +class PourWaterAgentEnv3(BaseAgentEnv, PourWaterEnv3): + def __init__(self, cfg: EmbodiedEnvCfg = None, **kwargs): + super().__init__(cfg, **kwargs) + super()._init_agents(**kwargs) + + def reset(self, seed: Optional[int] = None, options: Optional[Dict] = None): + obs, info = super().reset(seed=seed, options=options) + super().get_states() + return obs, info diff --git a/embodichain/lab/gym/envs/tasks/tableware/rearrangement_v3.py b/embodichain/lab/gym/envs/tasks/tableware/rearrangement_v3.py new file mode 100644 index 00000000..c666fc36 --- /dev/null +++ b/embodichain/lab/gym/envs/tasks/tableware/rearrangement_v3.py @@ -0,0 +1,97 @@ +# ---------------------------------------------------------------------------- +# Copyright (c) 2021-2025 DexForce Technology Co., Ltd. +# +# All rights reserved. +# ---------------------------------------------------------------------------- + +import os +import torch +import numpy as np +from copy import deepcopy +from typing import Dict, Union, Optional, Sequence, Tuple, List + +from embodichain.lab.gym.envs import EmbodiedEnv, EmbodiedEnvCfg +from embodichain.lab.gym.utils.registration import register_env +from embodichain.utils import configclass, logger + +from embodichain.lab.gym.envs.tasks.tableware.base_agent_env import BaseAgentEnv + +__all__ = ["RearrangementEnv3"] + + +@register_env("Rearrangement-v3", max_episode_steps=600) +class RearrangementEnv3(EmbodiedEnv): + def __init__(self, cfg: EmbodiedEnvCfg = None, **kwargs): + super().__init__(cfg, **kwargs) + + action_config = kwargs.get("action_config", None) + if action_config is not None: + self.action_config = action_config + + def is_task_success(self) -> bool: + fork = self.sim.get_rigid_object("fork") + spoon = self.sim.get_rigid_object("spoon") + plate = self.sim.get_rigid_object("plate") + plate_pose = plate.get_local_pose(to_matrix=True) + # TODO: now only for 1 env + ( + spoon_place_target_x, + spoon_place_target_y, + spoon_place_target_z, + ) = self.affordance_datas["spoon_place_pose"][:3, 3] + ( + fork_place_target_x, + fork_place_target_y, + fork_place_target_z, + ) = self.affordance_datas["fork_place_pose"][:3, 3] + + spoon_pose = spoon.get_local_pose(to_matrix=True) + spoon_x, spoon_y, spoon_z = spoon_pose[0, :3, 3] + + fork_pose = fork.get_local_pose(to_matrix=True) + fork_x, fork_y, fork_z = fork_pose[0, :3, 3] + + tolerance = self.metadata.get("success_params", {}).get("tolerance", 0.02) + + # spoon and fork should with the x y range of tolerance related to plate. 
+ return ~( + abs(spoon_x - spoon_place_target_x) > tolerance + or abs(spoon_y - spoon_place_target_y) > tolerance + or abs(fork_x - fork_place_target_x) > tolerance + or abs(fork_y - fork_place_target_y) > tolerance + ) + + +@register_env("RearrangementAgent-v3", max_episode_steps=600) +class RearrangementAgentEnv3(BaseAgentEnv, RearrangementEnv3): + def __init__(self, cfg: EmbodiedEnvCfg = None, **kwargs): + super().__init__(cfg, **kwargs) + super()._init_agents(**kwargs) + + def reset(self, seed: Optional[int] = None, options: Optional[Dict] = None): + obs, info = super().reset(seed=seed, options=options) + super().get_states() + return obs, info + + def is_task_success(self): + fork = self.sim.get_rigid_object("fork") + spoon = self.sim.get_rigid_object("spoon") + plate = self.sim.get_rigid_object("plate") + + plate_pose = plate.get_local_pose(to_matrix=True) + spoon_place_target_y = plate_pose[0, 1, 3] - 0.16 + fork_place_target_y = plate_pose[0, 1, 3] + 0.16 + + spoon_pose = spoon.get_local_pose(to_matrix=True) + spoon_y = spoon_pose[0, 1, 3] + + fork_pose = fork.get_local_pose(to_matrix=True) + fork_y = fork_pose[0, 1, 3] + + tolerance = self.metadata.get("success_params", {}).get("tolerance", 0.02) + + # spoon and fork should with the y range of tolerance related to plate. + return ( + abs(spoon_y - spoon_place_target_y) <= tolerance + and abs(fork_y - fork_place_target_y) <= tolerance + ) diff --git a/embodichain/lab/gym/utils/gym_utils.py b/embodichain/lab/gym/utils/gym_utils.py index 40ced87f..4a199b70 100644 --- a/embodichain/lab/gym/utils/gym_utils.py +++ b/embodichain/lab/gym/utils/gym_utils.py @@ -437,6 +437,9 @@ class ComponentCfg: env_cfg.sim_steps_per_control = config["env"].get("sim_steps_per_control", 4) env_cfg.extensions = deepcopy(config.get("env", {}).get("extensions", {})) + # load dataset config + env_cfg.dataset = config["env"].get("dataset", None) + # TODO: support more env events, eg, grasp pose generation, mesh preprocessing, etc. 
env_cfg.dataset = ComponentCfg() @@ -449,9 +452,14 @@ class ComponentCfg: for dataset_name, dataset_params in config["env"]["dataset"].items(): dataset_params_modified = deepcopy(dataset_params) + # Extract function name if format is "module:ClassName" + func_name = dataset_params["func"] + if ":" in func_name: + func_name = func_name.split(":")[-1] + # Find the function from multiple modules using the utility function dataset_func = find_function_from_modules( - dataset_params["func"], + func_name, dataset_modules, raise_if_not_found=True, ) @@ -471,6 +479,7 @@ class ComponentCfg: "embodichain.lab.gym.envs.managers.randomization", "embodichain.lab.gym.envs.managers.record", "embodichain.lab.gym.envs.managers.events", + "embodichain.lab.gym.envs.managers.real2sim", ] # parser env events config diff --git a/embodichain/lab/gym/utils/misc.py b/embodichain/lab/gym/utils/misc.py index b75b70af..9f839491 100644 --- a/embodichain/lab/gym/utils/misc.py +++ b/embodichain/lab/gym/utils/misc.py @@ -28,7 +28,7 @@ from collections import OrderedDict from importlib import import_module from scipy.spatial.transform import Rotation as R -from typing import Any, Dict, List, Tuple, Union, Sequence, Callable, Mapping +from typing import Any, Dict, List, Tuple, Union, Sequence, Callable, Mapping, Optional import numpy as np @@ -757,6 +757,13 @@ def resolve_env_attr(obj: Any, env: Any) -> Any: _EXPR = re.compile(r"\$\{([^}]+)\}") # For searching ${...} marker +def is_binocularcam(sensor): + from dexsim.sensor import BinocularCam + from embodichain.lab.sim.sensors import StereoCamera + + return isinstance(sensor, BinocularCam) or isinstance(sensor, StereoCamera) + + def resolve_formatted_string(obj, local_vars=None, global_vars=None): """Given a dict carrys "${...}"-like strings , `eval` the "${...}$" values while keep the dict structure. @@ -1382,3 +1389,43 @@ def is_stereocam(sensor) -> bool: from embodichain.lab.sim.sensors import StereoCamera return isinstance(sensor, StereoCamera) + + +def _data_key_to_control_part(robot, control_parts, data_key: str) -> Optional[str]: + # TODO: Temporary workaround, should be removed after refactoring data dict extractor. + # @lru_cache(max_size=None) # NOTE: no way to pass a hashable parameter + def is_eef_hand_func(robot, control_parts) -> bool: + # TODO: This is a temporary workaround, should be used a more general method to check + # whether the end-effector is a hand. 
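+        # Heuristic used below: an end-effector control part that keeps two or more
+        # independent (non-mimic) joints is treated as a dexterous hand; a single
+        # remaining joint is treated as a simple gripper.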
+ for part in control_parts: + if "eef" in part: + joint_ids = robot.get_joint_ids(part, remove_mimic=True) + return len(joint_ids) >= 2 + return False + + from embodichain.data.enum import ( + ControlParts, + EndEffector, + ActionMode, + JointType, + EefType, + ) + + is_eef_hand = is_eef_hand_func(robot, control_parts) + + for control_part in ControlParts: + if EndEffector.DEXTROUSHAND.value in data_key and is_eef_hand: + return data_key.replace(EndEffector.DEXTROUSHAND.value, "") + elif EndEffector.DEXTROUSHAND.value in data_key and not is_eef_hand: + continue + elif EndEffector.GRIPPER.value in data_key and not is_eef_hand: + return data_key.replace(EndEffector.GRIPPER.value, "") + elif EndEffector.GRIPPER.value in data_key and is_eef_hand: + continue + elif ActionMode.RELATIVE.value + JointType.QPOS.value in data_key: + continue + elif EefType.POSE.value in data_key: + continue + elif control_part.value in data_key: + return control_part.value + return None diff --git a/embodichain/lab/scripts/run_agent.py b/embodichain/lab/scripts/run_agent.py new file mode 100644 index 00000000..e6546a59 --- /dev/null +++ b/embodichain/lab/scripts/run_agent.py @@ -0,0 +1,146 @@ +# ---------------------------------------------------------------------------- +# Copyright (c) 2021-2025 DexForce Technology Co., Ltd. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+# ---------------------------------------------------------------------------- + +import gymnasium +import numpy as np +import argparse +import torch + +from embodichain.utils.utility import load_json +from embodichain.lab.sim import SimulationManagerCfg +from embodichain.lab.gym.envs import EmbodiedEnvCfg +from embodichain.lab.gym.utils.gym_utils import ( + config_to_cfg, +) +from embodichain.utils.logger import log_warning, log_info, log_error +from .run_env import main + + +if __name__ == "__main__": + np.set_printoptions(5, suppress=True) + torch.set_printoptions(precision=5, sci_mode=False) + + parser = argparse.ArgumentParser() + parser.add_argument( + "--num_envs", + help="The number of environments to run in parallel.", + default=1, + type=int, + ) + parser.add_argument( + "--device", + type=str, + default="cpu", + help="device to run the environment on, e.g., 'cpu' or 'cuda'", + ) + parser.add_argument( + "--headless", + help="Whether to perform the simulation in headless mode.", + default=False, + action="store_true", + ) + parser.add_argument( + "--enable_rt", + help="Whether to use RTX rendering backend for the simulation.", + default=False, + action="store_true", + ) + parser.add_argument( + "--render_backend", + help="The rendering backend to use for the simulation.", + default="egl", + type=str, + ) + parser.add_argument( + "--gpu_id", + help="The GPU ID to use for the simulation.", + default=0, + type=int, + ) + parser.add_argument( + "--debug_mode", + help="Enable debug mode.", + default=False, + action="store_true", + ) + parser.add_argument( + "--filter_visual_rand", + help="Whether to filter out visual randomization.", + default=False, + action="store_true", + ) + + parser.add_argument( + "--gym_config", + type=str, + help="Path to the gym configuration file.", + required=True, + ) + parser.add_argument( + "--task_name", + type=str, + help="Name of the task.", + required=True, + ) + parser.add_argument( + "--agent_config", + type=str, + help="Path to the agent configuration file.", + required=True, + ) + parser.add_argument( + "--regenerate", + action="store_true", + help="Whether to regenerate code if already existed.", + default=False, + ) + + args = parser.parse_args() + + # Validate arguments + if args.num_envs != 1: + log_error(f"Currently only support num_envs=1, but got {args.num_envs}.") + exit(1) + + # Load configurations + gym_config = load_json(args.gym_config) + agent_config = load_json(args.agent_config) + + # Build environment configuration + cfg: EmbodiedEnvCfg = config_to_cfg(gym_config) + cfg.filter_visual_rand = args.filter_visual_rand + cfg.num_envs = args.num_envs + cfg.sim_cfg = SimulationManagerCfg( + headless=args.headless, + sim_device=args.device, + enable_rt=args.enable_rt, + gpu_id=args.gpu_id, + ) + + # Create environment + env = gymnasium.make( + id=gym_config["id"], + cfg=cfg, + agent_config=agent_config, + agent_config_path=args.agent_config, + task_name=args.task_name, + ) + + # Run main function + main(args, env, gym_config) + + if args.headless: + env.reset(options={"final": True}) diff --git a/embodichain/lab/scripts/run_env.py b/embodichain/lab/scripts/run_env.py index 2268be0f..c9759409 100644 --- a/embodichain/lab/scripts/run_env.py +++ b/embodichain/lab/scripts/run_env.py @@ -32,9 +32,11 @@ from embodichain.utils.logger import log_warning, log_info, log_error -def generate_and_execute_action_list(env, idx, debug_mode): +def generate_and_execute_action_list(env, idx, debug_mode, **kwargs): - action_list = 
env.get_wrapper_attr("create_demo_action_list")(action_sentence=idx) + action_list = env.get_wrapper_attr("create_demo_action_list")( + action_sentence=idx, **kwargs + ) if action_list is None or len(action_list) == 0: log_warning("Action is invalid. Skip to next generation.") @@ -86,7 +88,9 @@ def generate_function( while True: ret = [] for trajectory_idx in range(num_traj): - valid = generate_and_execute_action_list(env, trajectory_idx, debug_mode) + valid = generate_and_execute_action_list( + env, trajectory_idx, debug_mode, **kwargs + ) if not valid: _, _ = env.reset() @@ -124,14 +128,15 @@ def main(args, env, gym_config): log_info("Start offline data generation.", color="green") # TODO: Support multiple trajectories per episode generation. num_traj = 1 - for i in range(gym_config["max_episodes"]): + for i in range(gym_config.get("max_episodes", 1)): generate_function( env, num_traj, i, - save_path=args.save_path, - save_video=args.save_video, + save_path=getattr(args, "save_path", ""), + save_video=getattr(args, "save_video", False), debug_mode=args.debug_mode, + regenerate=getattr(args, "regenerate", False), ) diff --git a/embodichain/lab/sim/objects/soft_object.py b/embodichain/lab/sim/objects/soft_object.py index 116925e7..52f1f0c1 100644 --- a/embodichain/lab/sim/objects/soft_object.py +++ b/embodichain/lab/sim/objects/soft_object.py @@ -64,18 +64,18 @@ def __init__( self.ps = ps self.num_instances = len(entities) - softbodies = [ + self.soft_bodies = [ self.entities[i].get_physical_body() for i in range(self.num_instances) ] - self.n_collision_vertices = softbodies[0].get_num_vertices() - self.n_sim_vertices = softbodies[0].get_num_sim_vertices() + self.n_collision_vertices = self.soft_bodies[0].get_num_vertices() + self.n_sim_vertices = self.soft_bodies[0].get_num_sim_vertices() self._rest_position_buffer = torch.empty( (self.num_instances, self.n_collision_vertices, 4), device=self.device, dtype=torch.float32, ) - for i, softbody in enumerate(softbodies): + for i, softbody in enumerate(self.soft_bodies): self._rest_position_buffer[i] = softbody.get_position_inv_mass_buffer() self._rest_sim_position_buffer = torch.empty( @@ -84,7 +84,7 @@ def __init__( dtype=torch.float32, ) - for i, softbody in enumerate(softbodies): + for i, softbody in enumerate(self.soft_bodies): self._rest_sim_position_buffer[i] = ( softbody.get_sim_position_inv_mass_buffer() ) @@ -148,9 +148,7 @@ def sim_vertex_position_buffer(self): def sim_vertex_velocity_buffer(self): """Get the current vertex velocity buffer of the soft bodies.""" for i, softbody in enumerate(self.soft_bodies): - self._sim_vertex_velocity_buffer[i] = ( - softbody.get_sim_position_inv_mass_buffer() - ) + self._sim_vertex_velocity_buffer[i] = softbody.get_sim_velocity_buffer() return self._sim_vertex_velocity_buffer.clone() diff --git a/embodichain/toolkits/code_generation.py b/embodichain/toolkits/code_generation.py new file mode 100644 index 00000000..360e9e90 --- /dev/null +++ b/embodichain/toolkits/code_generation.py @@ -0,0 +1,88 @@ +# ---------------------------------------------------------------------------- +# Copyright (c) 2021-2025 DexForce Technology Co., Ltd. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# ---------------------------------------------------------------------------- + +from typing import Dict, Tuple +from langchain_core.output_parsers import BaseOutputParser + +import numpy as np + + +def merge_dicts(dicts: Dict): + return {k: v for d in dicts for k, v in d.items()} + + +def get_executable_code_str(input_string, language="python"): + start_marker = f"```{language}" + end_marker = f"```" + if input_string.find(start_marker) >= 0: + + start_index = input_string.find(start_marker) + len(start_marker) + end_index = input_string.rfind(end_marker) + + code_string = input_string[start_index:end_index].strip() + else: + code_string = input_string + + return code_string + + +class OutputFormatting: + @staticmethod + def flatten_dict(output: Dict[str, Dict]) -> Dict[str, np.ndarray]: + ret = {} + for _, val in output.items(): + ret.update(val) + return ret + + +class ExecutableOutputParser(BaseOutputParser): + # https://python.langchain.com/v0.1/docs/modules/model_io/output_parsers/custom/ + + _fixed_vars = {"np": np} + variable_vars = {} + + def update_vars(self, variable_vars: Dict): + self.variable_vars = variable_vars + + def parse(self, text: str) -> Tuple[str, Dict, Dict]: + code_str = get_executable_code_str(text) + # if self._cfg["include_context"] and context != "": + # to_exec = f"{context}\n{code_str}" + # to_log = f"{context}\n{use_query}\n{code_str}" + # else: + # to_exec = code_str + # to_log = f"{use_query}\n{to_exec}" + + # to_log_pretty = highlight(to_log, PythonLexer(), TerminalFormatter()) + # print( + # f"\033[34m====================================================\nLMP {self._name} exec:\033[0m\n\n{to_log_pretty}\n\n\033[34m====================================================\n\033[0m" + # ) + + # generate new functions + # new_fs = self._lmp_fgen.create_new_fs_from_code(code_str) + # self._variable_vars.update(new_fs) + + gvars = merge_dicts([self._fixed_vars, self.variable_vars]) + lvars = None + + if gvars is None: + gvars = {} + if lvars is None: + lvars = {} + empty_fn = lambda *args, **kwargs: None + custom_gvars = merge_dicts([gvars, {"exec": empty_fn, "eval": empty_fn}]) + + return code_str, custom_gvars, lvars diff --git a/embodichain/toolkits/interfaces.py b/embodichain/toolkits/interfaces.py new file mode 100644 index 00000000..5d6d967f --- /dev/null +++ b/embodichain/toolkits/interfaces.py @@ -0,0 +1,1252 @@ +# ---------------------------------------------------------------------------- +# Copyright (c) 2021-2025 DexForce Technology Co., Ltd. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+# ---------------------------------------------------------------------------- + +import numpy as np +from embodichain.utils.logger import log_info, log_warning, log_error +from copy import deepcopy +from embodichain.lab.gym.utils.misc import ( + mul_linear_expand, + get_rotation_replaced_pose, +) +import torch +from tqdm import tqdm +from embodichain.lab.sim.planners.motion_generator import MotionGenerator +from scipy.spatial.transform import Rotation as R +from embodichain.utils.utility import encode_image +import ast + +""" +--------------------------------------------Some useful functions---------------------------------------------------- +--------------------------------------------Some useful functions---------------------------------------------------- +--------------------------------------------Some useful functions---------------------------------------------------- +""" + + +def draw_axis(env, pose): + from embodichain.lab.sim.cfg import MarkerCfg + + marker_cfg = MarkerCfg( + name="test", + marker_type="axis", + axis_xpos=pose, + axis_size=0.01, + axis_len=0.2, + arena_index=-1, # All arenas + ) + env.sim.draw_marker(cfg=marker_cfg) + env.sim.update() + + +def get_arm_states(env, robot_name): + + left_arm_current_qpos, right_arm_current_qpos = env.get_current_qpos_agent() + left_arm_current_pose, right_arm_current_pose = env.get_current_xpos_agent() + left_arm_current_gripper_state, right_arm_current_gripper_state = ( + env.get_current_gripper_state_agent() + ) + + side = "right" if "right" in robot_name else "left" + is_left = True if side == "left" else False + select_arm = "left_arm" if is_left else "right_arm" + + arms = { + "left": ( + left_arm_current_qpos, + left_arm_current_pose, + left_arm_current_gripper_state, + ), + "right": ( + right_arm_current_qpos, + right_arm_current_pose, + right_arm_current_gripper_state, + ), + } + ( + select_arm_current_qpos, + select_arm_current_pose, + select_arm_current_gripper_state, + ) = arms[side] + + return ( + is_left, + select_arm, + select_arm_current_qpos, + select_arm_current_pose, + select_arm_current_gripper_state, + ) + + +def find_nearest_valid_pose(env, select_arm, pose, xpos_resolution=0.1): + # use the validator to choose the nearest valid pose + # delete the cache every time + if isinstance(pose, torch.Tensor): + pose = pose.detach().cpu().numpy() + ret, _ = env.robot.compute_xpos_reachability( + select_arm, + pose, + xpos_resolution=xpos_resolution, + qpos_resolution=np.radians(60), + cache_mode="disk", + use_cached=False, + visualize=False, + ) + ret = np.stack(ret, axis=0) + # find the nearest valid pose + xyz = pose[:3, 3] + ts = np.stack([M[:3, 3] for M in ret], axis=0) # shape (N,3) + dists = np.linalg.norm(ts - xyz[None, :], axis=1) + best_idx = np.argmin(dists) + nearest_valid_pose = ret[best_idx] + return torch.from_numpy(nearest_valid_pose) + + +def get_qpos(env, is_left, select_arm, pose, qpos_seed, force_valid=False, name=""): + if force_valid: + try: + ret, qpos = env.get_arm_ik(pose, is_left=is_left, qpos_seed=qpos_seed) + if not ret: + log_error(f"Generate {name} qpos failed.\n") + except Exception as e: + log_warning( + f"Original {name} pose invalid, using nearest valid pose. 
({e})\n" + ) + pose = find_nearest_valid_pose(env, select_arm, pose) + + ret, qpos = env.get_arm_ik(pose, is_left=is_left, qpos_seed=qpos_seed) + else: + ret, qpos = env.get_arm_ik(pose, is_left=is_left, qpos_seed=qpos_seed) + if not ret: + log_error(f"Generate {name} qpos failed.\n") + + return qpos + + +def get_offset_pose( + pose_to_change: torch.Tensor, + offset_value: float, + direction: str = "z", + mode: str = "intrinsic", +) -> torch.Tensor: + + device = pose_to_change.device + dtype = pose_to_change.dtype + + if isinstance(direction, str): + if direction == "x": + direction_vec = torch.tensor([1.0, 0.0, 0.0], device=device, dtype=dtype) + elif direction == "y": + direction_vec = torch.tensor([0.0, 1.0, 0.0], device=device, dtype=dtype) + elif direction == "z": + direction_vec = torch.tensor([0.0, 0.0, 1.0], device=device, dtype=dtype) + else: + log_error(f"Invalid direction '{direction}'. Must be 'x', 'y', or 'z'.") + return pose_to_change + else: + direction_vec = torch.as_tensor(direction, device=device, dtype=dtype) + + direction_vec = direction_vec / torch.linalg.norm(direction_vec) + offset_matrix = torch.eye(4, device=device, dtype=dtype) + offset_matrix[:3, 3] = offset_value * direction_vec + + if mode == "extrinsic": + offset_pose = offset_matrix @ pose_to_change + elif mode == "intrinsic": + offset_pose = pose_to_change @ offset_matrix + else: + log_error(f"Invalid mode '{mode}'. Must be 'extrinsic' or 'intrinsic'.") + return pose_to_change + + return offset_pose + + +def plan_trajectory( + env, + select_arm, + qpos_list, + sample_num, + select_arm_current_gripper_state, + select_qpos_traj, + ee_state_list_select, +): + motion_generator = MotionGenerator( + robot=env.robot, + uid=select_arm, + **getattr(env, "planning_config", {}), + ) + traj_list, _ = motion_generator.create_discrete_trajectory( + qpos_list=qpos_list, + sample_num=sample_num, + qpos_seed=qpos_list[0], + is_use_current_qpos=False, + **getattr(env, "planning_config", {}), + ) + + select_qpos_traj.extend(traj_list) + ee_state_list_select.extend([select_arm_current_gripper_state] * len(traj_list)) + + +def plan_gripper_trajectory( + env, + is_left, + sample_num, + execute_open, + select_arm_current_qpos, + select_qpos_traj, + ee_state_list_select, +): + open_state = env.open_state + close_state = env.close_state + + if execute_open: + ee_state_expand_select = np.array([close_state, open_state]) + env.set_current_gripper_state_agent(open_state, is_left=is_left) + else: + ee_state_expand_select = np.array([open_state, close_state]) + env.set_current_gripper_state_agent(close_state, is_left=is_left) + + ee_state_expand_select = mul_linear_expand(ee_state_expand_select, [sample_num]) + + select_qpos_traj.extend([select_arm_current_qpos] * sample_num) + ee_state_list_select.extend(ee_state_expand_select) + + +def finalize_actions(select_qpos_traj, ee_state_list_select): + # mimic eef state + actions = np.concatenate( + [ + np.array(select_qpos_traj), + np.array(ee_state_list_select), + np.array(ee_state_list_select), + ], + axis=-1, + ) + return actions + + +def extract_drive_calls(code_str: str) -> list[str]: + tree = ast.parse(code_str) + lines = code_str.splitlines() + + drive_blocks = [] + + for node in tree.body: + # Match: drive(...) 
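+        # Only bare, top-level expression statements whose callee is the name
+        # "drive" are collected; drive(...) calls nested inside assignments or
+        # other expressions are ignored.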
+ if ( + isinstance(node, ast.Expr) + and isinstance(node.value, ast.Call) + and isinstance(node.value.func, ast.Name) + and node.value.func.id == "drive" + ): + # AST line numbers are 1-based + start = node.lineno - 1 + end = node.end_lineno + block = "\n".join(lines[start:end]) + drive_blocks.append(block) + + return drive_blocks + + +""" +--------------------------------------------Atom action functions---------------------------------------------------- +--------------------------------------------Atom action functions---------------------------------------------------- +--------------------------------------------Atom action functions---------------------------------------------------- +""" + + +# TODO: write a move_to_pose atom action, the use this action to form other atom actions +def grasp( + robot_name: str, + obj_name: str, + pre_grasp_dis: float = 0.05, + env=None, + force_valid=False, + **kwargs, +): + # Get target object + obj_uids = env.sim.get_rigid_object_uid_list() + if obj_name in obj_uids: + target_obj = env.sim.get_rigid_object(obj_name) + else: + log_error(f"No matched object {obj_uids}.") + target_obj_pose = target_obj.get_local_pose(to_matrix=True).squeeze(0) + + # Open the gripper if currently closed + actions = None + select_arm_current_gripper_state = ( + env.left_arm_current_gripper_state + if "left" in robot_name + else env.right_arm_current_gripper_state + ) + if select_arm_current_gripper_state <= env.open_state - 0.01: + actions = open_gripper(robot_name, env, **kwargs) + + # Retract the end-effector to avoid collision + ( + is_left, + select_arm, + select_arm_current_qpos, + select_arm_current_pose, + select_arm_current_gripper_state, + ) = get_arm_states(env, robot_name) + select_arm_base_pose = ( + env.left_arm_base_pose if is_left else env.right_arm_base_pose + ) + base_to_eef_xy_dis = torch.norm( + select_arm_base_pose[:2, 3] - select_arm_current_pose[:2, 3] + ) + base_to_obj_xy_dis = torch.norm( + select_arm_base_pose[:2, 3] - target_obj_pose[:2, 3] + ) + dis_eps = kwargs.get("dis_eps", 0.05) + select_arm_init_pose = ( + env.left_arm_init_xpos if is_left else env.right_arm_init_xpos + ) + if base_to_eef_xy_dis > base_to_obj_xy_dis and not torch.allclose( + select_arm_current_pose, select_arm_init_pose, rtol=1e-5, atol=1e-8 + ): + delta = base_to_eef_xy_dis - (base_to_obj_xy_dis - dis_eps) + back_actions = move_by_relative_offset( + robot_name=robot_name, + dx=0.0, + dy=0.0, + dz=-delta, + env=env, + force_valid=force_valid, + mode="intrinsic", + sample_num=15, + **kwargs, + ) + actions = ( + np.concatenate([actions, back_actions], axis=0) + if actions is not None + else back_actions + ) + + # ---------------------------------------- Prepare ---------------------------------------- + select_qpos_traj = [] + ee_state_list_select = [] + + ( + is_left, + select_arm, + select_arm_current_qpos, + select_arm_current_pose, + select_arm_current_gripper_state, + ) = get_arm_states(env, robot_name) + + # ---------------------------------------- Pose ---------------------------------------- + # Move the end-effector to a good place for starting grasping to avoid bad poses + select_arm_retract_pose = deepcopy( + env.left_arm_init_xpos if is_left else env.right_arm_init_xpos + ) + select_arm_retract_pose = get_offset_pose( + select_arm_retract_pose, 0.15, "z", "intrinsic" + ) + select_arm_retract_qpos = get_qpos( + env, + is_left, + select_arm, + select_arm_retract_pose, + env.left_arm_init_qpos if is_left else env.right_arm_init_qpos, + force_valid=force_valid, + 
name="retract_to_good_pose", + ) + qpos_list_back_to_retract = [select_arm_current_qpos, select_arm_retract_qpos] + sample_num = 30 + + plan_trajectory( + env, + select_arm, + qpos_list_back_to_retract, + sample_num, + select_arm_current_gripper_state, + select_qpos_traj, + ee_state_list_select, + ) + + select_arm_current_qpos = select_arm_retract_qpos + select_arm_current_pose = select_arm_retract_pose + + # Rotate the arm base to face the object for better grasping + delta_xy = target_obj_pose[:2, 3] - select_arm_base_pose[:2, 3] + dx, dy = delta_xy[0], delta_xy[1] + aim_horizontal_angle = np.arctan2(dy, dx) + select_arm_aim_qpos = deepcopy(select_arm_current_qpos) + select_arm_aim_qpos[0] = aim_horizontal_angle + + # Get best grasp pose from affordance data + grasp_pose_object = env.init_obj_info.get(obj_name)["grasp_pose_obj"] + if ( + grasp_pose_object[0, 2] > 0.5 + ): # whether towards x direction TODO: make it robust + # Align the object pose's z-axis with the arm's aiming direction + target_obj_pose = torch.tensor( + get_rotation_replaced_pose( + np.array(target_obj_pose), + float(select_arm_aim_qpos[0]), + "z", + "intrinsic", + ) + ) + best_pickpose = target_obj_pose @ grasp_pose_object + grasp_pose = deepcopy(best_pickpose) + grasp_pose_pre1 = deepcopy(grasp_pose) + grasp_pose_pre1 = get_offset_pose(grasp_pose_pre1, -pre_grasp_dis, "z", "intrinsic") + + # Solve IK for pre-grasp and grasp poses + grasp_qpos_pre1 = get_qpos( + env, + is_left, + select_arm, + grasp_pose_pre1, + select_arm_aim_qpos, + force_valid=force_valid, + name="grasp pre1", + ) + grasp_qpos = get_qpos( + env, + is_left, + select_arm, + grasp_pose, + grasp_qpos_pre1, + force_valid=force_valid, + name="grasp", + ) + + # Update env state to final grasp pose + env.set_current_qpos_agent(grasp_qpos, is_left=is_left) + env.set_current_xpos_agent(grasp_pose, is_left=is_left) + + # ------------------------------------ Traj 0: init → aim ------------------------------------ + qpos_list_init_to_aim = [select_arm_current_qpos, select_arm_aim_qpos] + # base_sample_num = 10 + # base_angle = 0.08 + # sample_num = max(int(delta_angle / base_angle * base_sample_num), 2) + + sample_num = 10 + + plan_trajectory( + env, + select_arm, + qpos_list_init_to_aim, + sample_num, + select_arm_current_gripper_state, + select_qpos_traj, + ee_state_list_select, + ) + + # ------------------------------------ Traj 1: aim → pre-grasp ------------------------------------ + qpos_list_aim_to_pre1 = [select_arm_aim_qpos, grasp_qpos_pre1] + sample_num = kwargs.get("sample_num", 30) + + plan_trajectory( + env, + select_arm, + qpos_list_aim_to_pre1, + sample_num, + select_arm_current_gripper_state, + select_qpos_traj, + ee_state_list_select, + ) + + # ------------------------------------ Traj 2: pre-grasp → grasp ------------------------------------ + qpos_list_pre1_to_grasp = [grasp_qpos_pre1, grasp_qpos] + sample_num = kwargs.get("sample_num", 20) + + plan_trajectory( + env, + select_arm, + qpos_list_pre1_to_grasp, + sample_num, + select_arm_current_gripper_state, + select_qpos_traj, + ee_state_list_select, + ) + + # ---------------------------------------- Final ---------------------------------------- + traj_actions = finalize_actions(select_qpos_traj, ee_state_list_select) + actions = ( + traj_actions + if actions is None + else np.concatenate([actions, traj_actions], axis=0) + ) + + # ------------------------------------ Close gripper ------------------------------------ + close_gripper_actions = close_gripper(robot_name, env, **kwargs) + 
actions = np.concatenate([actions, close_gripper_actions], axis=0) + + log_info( + f"Total generated trajectory number for grasp: {len(actions)}.", color="green" + ) + + return actions + + +# def place_on_table( +# robot_name: str, +# obj_name: str, +# x: float = None, +# y: float = None, +# pre_place_dis: float = 0.08, +# env=None, +# force_valid=False, +# **kwargs +# ): +# +# # ---------------------------------------- Prepare ---------------------------------------- +# select_qpos_traj = [] +# ee_state_list_select = [] +# +# is_left, select_arm, select_arm_current_qpos, select_arm_current_pose, \ +# select_arm_current_gripper_state = get_arm_states(env, robot_name) +# +# grasp_pose_object = env.init_obj_info.get(obj_name).get('grasp_pose_obj') +# init_obj_pose = env.init_obj_info.get(obj_name).get('pose') +# +# select_arm_base_pose = env.left_arm_base_pose if is_left else env.right_arm_base_pose +# delta_xy = init_obj_pose[:2, 3] - select_arm_base_pose[:2, 3] +# aim_horizontal_angle = np.arctan2(delta_xy[1], delta_xy[0]) +# +# # Align the object pose's z-axis with the arm's aiming direction +# init_obj_pose = torch.tensor(get_rotation_replaced_pose(np.array(init_obj_pose), float(aim_horizontal_angle), "z","intrinsic")) +# +# place_pose = init_obj_pose @ grasp_pose_object +# place_pose[0, 3] = x +# place_pose[1, 3] = y +# place_pose[2, 3] += kwargs.get('eps', 0.02) +# +# pre_place_pose = deepcopy(place_pose) +# pre_place_pose[2, 3] += pre_place_dis +# +# # Solve IK for pre-place and place poses +# place_qpos_pre1 = get_qpos(env, is_left, select_arm, pre_place_pose, select_arm_current_qpos, force_valid=force_valid, name='place pre1') +# place_qpos = get_qpos(env, is_left, select_arm, place_pose, place_qpos_pre1, force_valid=force_valid, name='place') +# +# # Update env states +# env.set_current_qpos_agent(place_qpos, is_left=is_left) +# env.set_current_xpos_agent(place_pose, is_left=is_left) +# +# # ------------------------------------ Traj 0: current → pre-place ------------------------------------ +# qpos_list_current_to_preplace = [select_arm_current_qpos, place_qpos_pre1] +# sample_num = 30 +# +# plan_trajectory( +# env, +# select_arm, +# qpos_list_current_to_preplace, +# sample_num, +# select_arm_current_gripper_state, +# select_qpos_traj, +# ee_state_list_select +# ) +# +# # ------------------------------------ Traj 1: pre-place → place ------------------------------------ +# qpos_list_preplace_to_place = [place_qpos_pre1, place_qpos] +# sample_num = 20 +# +# plan_trajectory( +# env, +# select_arm, +# qpos_list_preplace_to_place, +# sample_num, +# select_arm_current_gripper_state, +# select_qpos_traj, +# ee_state_list_select +# ) +# +# # ---------------------------------------- Final ---------------------------------------- +# traj_actions = finalize_actions(select_qpos_traj, ee_state_list_select) +# +# open_actions = open_gripper(robot_name, env, **kwargs) +# +# actions = np.concatenate([traj_actions, open_actions], axis=0) +# +# log_info(f"Total generated trajectory number for place on table: {len(actions)}.", color="green") +# +# return actions + + +def place_on_table( + robot_name: str, + obj_name: str, + x: float = None, + y: float = None, + pre_place_dis: float = 0.08, + env=None, + force_valid=False, + **kwargs, +): + + init_obj_height = env.init_obj_info.get(obj_name).get("height") + height = init_obj_height + kwargs.get("eps", 0.03) + + traj_actions = move_to_absolute_position( + robot_name, x=x, y=y, z=height, env=env, force_valid=force_valid, **kwargs + ) + open_actions = 
open_gripper(robot_name, env, **kwargs) + + actions = np.concatenate([traj_actions, open_actions], axis=0) + + log_info( + f"Total generated trajectory number for place on table: {len(actions)}.", + color="green", + ) + + return actions + + +def move_relative_to_object( + robot_name: str, + obj_name: str, + x_offset: float = 0, + y_offset: float = 0, + z_offset: float = 0, + env=None, + force_valid=False, + **kwargs, +): + + # ---------------------------------------- Prepare ---------------------------------------- + select_qpos_traj = [] + ee_state_list_select = [] + + ( + is_left, + select_arm, + select_arm_current_qpos, + select_arm_current_pose, + select_arm_current_gripper_state, + ) = get_arm_states(env, robot_name) + + # ---------------------------------------- Pose ---------------------------------------- + # Resolve target object + obj_uids = env.sim.get_rigid_object_uid_list() + if obj_name in obj_uids: + target_obj = env.sim.get_rigid_object(obj_name) + else: + log_error("No matched object.") + + # Get object base pose (4x4 matrix) + target_obj_pose = target_obj.get_local_pose(to_matrix=True).squeeze(0) + + # Construct target pose (preserve orientation) + move_target_pose = deepcopy(select_arm_current_pose) + move_target_pose[:3, 3] = target_obj_pose[:3, 3] + move_target_pose[0, 3] += x_offset + move_target_pose[1, 3] += y_offset + move_target_pose[2, 3] += z_offset + + # Solve IK for target pose + move_target_qpos = get_qpos( + env, + is_left, + select_arm, + move_target_pose, + select_arm_current_qpos, + force_valid=force_valid, + name="move relative to object", + ) + + # Update env states + env.set_current_qpos_agent(move_target_qpos, is_left=is_left) + env.set_current_xpos_agent(move_target_pose, is_left=is_left) + + # ------------------------------------ Traj 1: init → target ------------------------------------ + qpos_list_init_to_target = [select_arm_current_qpos, move_target_qpos] + sample_num = kwargs.get("sample_num", 30) + + plan_trajectory( + env, + select_arm, + qpos_list_init_to_target, + sample_num, + select_arm_current_gripper_state, + select_qpos_traj, + ee_state_list_select, + ) + + # ---------------------------------------- Final ---------------------------------------- + actions = finalize_actions(select_qpos_traj, ee_state_list_select) + + log_info( + f"Total generated trajectory number for move relative to object: {len(actions)}.", + color="green", + ) + + return actions + + +def move_to_absolute_position( + robot_name: str, + x: float = None, + y: float = None, + z: float = None, + env=None, + force_valid=False, + **kwargs, +): + + # ---------------------------------------- Prepare ---------------------------------------- + select_qpos_traj = [] + ee_state_list_select = [] + + ( + is_left, + select_arm, + select_arm_current_qpos, + select_arm_current_pose, + select_arm_current_gripper_state, + ) = get_arm_states(env, robot_name) + + # ---------------------------------------- Pose ---------------------------------------- + # Start from current pose, then selectively update xyz + move_pose = deepcopy(select_arm_current_pose) + + current_xyz = move_pose[:3, 3].clone() + + target_xyz = current_xyz.clone() + if x is not None: + target_xyz[0] = x + if y is not None: + target_xyz[1] = y + if z is not None: + target_xyz[2] = z + + move_pose[:3, 3] = target_xyz + + # Try IK on target pose + move_qpos = get_qpos( + env, + is_left, + select_arm, + move_pose, + select_arm_current_qpos, + force_valid=force_valid, + name="move to absolute position", + ) + + # Update env 
states + env.set_current_qpos_agent(move_qpos, is_left=is_left) + env.set_current_xpos_agent(move_pose, is_left=is_left) + + # ------------------------------------ Traj: init → target ------------------------------------ + qpos_list_init_to_move = [select_arm_current_qpos, move_qpos] + sample_num = kwargs.get("sample_num", 30) + + plan_trajectory( + env, + select_arm, + qpos_list_init_to_move, + sample_num, + select_arm_current_gripper_state, + select_qpos_traj, + ee_state_list_select, + ) + + # ---------------------------------------- Final ---------------------------------------- + actions = finalize_actions(select_qpos_traj, ee_state_list_select) + + log_info( + f"Total generated trajectory number for move to absolute position: {len(actions)}.", + color="green", + ) + + return actions + + +def move_by_relative_offset( + robot_name: str, + dx: float = 0.0, + dy: float = 0.0, + dz: float = 0.0, + mode: str = "extrinsic", + env=None, + force_valid=False, + **kwargs, +): + + # ---------------------------------------- Prepare ---------------------------------------- + select_qpos_traj = [] + ee_state_list_select = [] + + ( + is_left, + select_arm, + select_arm_current_qpos, + select_arm_current_pose, + select_arm_current_gripper_state, + ) = get_arm_states(env, robot_name) + + # ---------------------------------------- Pose ---------------------------------------- + move_pose = deepcopy(select_arm_current_pose) + + # Apply relative offsets (dx, dy, dz always floats) + move_pose = get_offset_pose(move_pose, dx, "x", mode) + move_pose = get_offset_pose(move_pose, dy, "y", mode) + move_pose = get_offset_pose(move_pose, dz, "z", mode) + + # Solve IK + move_qpos = get_qpos( + env, + is_left, + select_arm, + move_pose, + select_arm_current_qpos, + force_valid=force_valid, + name="move by relative offset", + ) + + # Update environment states + env.set_current_qpos_agent(move_qpos, is_left=is_left) + env.set_current_xpos_agent(move_pose, is_left=is_left) + + # ------------------------------------ Traj: init → target ------------------------------------ + qpos_list_init_to_move = [select_arm_current_qpos, move_qpos] + sample_num = kwargs.get("sample_num", 20) + + plan_trajectory( + env, + select_arm, + qpos_list_init_to_move, + sample_num, + select_arm_current_gripper_state, + select_qpos_traj, + ee_state_list_select, + ) + + # ---------------------------------------- Final ---------------------------------------- + actions = finalize_actions(select_qpos_traj, ee_state_list_select) + + log_info( + f"Total generated trajectory number for move by relative offset: {len(actions)}.", + color="green", + ) + + return actions + + +def back_to_initial_pose(robot_name: str, env=None, **kwargs): + + # ---------------------------------------- Prepare ---------------------------------------- + select_qpos_traj = [] + ee_state_list_select = [] + + # Get arm states + ( + is_left, + select_arm, + select_arm_current_qpos, + select_arm_current_pose, + select_arm_current_gripper_state, + ) = get_arm_states(env, robot_name) + + # Retrieve the initial joint configuration of this arm + target_qpos = env.left_arm_init_qpos if is_left else env.right_arm_init_qpos + target_qpos = torch.as_tensor(target_qpos, dtype=select_arm_current_qpos.dtype) + + # ---------------------------------------- Pose ---------------------------------------- + # Pre-back pose: move along tool z by a small offset (use intrinsic frame) + pre_back_pose = deepcopy(select_arm_current_pose) + pre_back_pose = get_offset_pose(pre_back_pose, -0.08, "z", 
"intrinsic") + + # IK for pre-back + pre_back_qpos = get_qpos( + env, + is_left, + select_arm, + pre_back_pose, + select_arm_current_qpos, + force_valid=kwargs.get("force_valid", False), + name="pre back pose", + ) + + # Update env states (move to target pose) + target_pose = env.get_arm_fk(qpos=target_qpos, is_left=is_left) + env.set_current_qpos_agent(target_qpos, is_left=is_left) + env.set_current_xpos_agent(target_pose, is_left=is_left) + + # ------------------------------------ Traj: init → pre back_pose ------------------------------------ + qpos_list_init_to_preback = [select_arm_current_qpos, pre_back_qpos] + sample_num = 20 + + plan_trajectory( + env, + select_arm, + qpos_list_init_to_preback, + sample_num, + select_arm_current_gripper_state, + select_qpos_traj, + ee_state_list_select, + ) + + # ------------------------------------ Traj: init → initial_pose ------------------------------------ + qpos_list_preback_to_target = [pre_back_qpos, target_qpos] + sample_num = kwargs.get("sample_num", 30) + + plan_trajectory( + env, + select_arm, + qpos_list_preback_to_target, + sample_num, + select_arm_current_gripper_state, + select_qpos_traj, + ee_state_list_select, + ) + + # ---------------------------------------- Final ---------------------------------------- + actions = finalize_actions(select_qpos_traj, ee_state_list_select) + + log_info( + f"Total generated trajectory number for back to initial pose: {len(actions)}.", + color="green", + ) + + return actions + + +def rotate_eef(robot_name: str, degree: float = 0, env=None, **kwargs): + + # ---------------------------------------- Prepare ---------------------------------------- + select_qpos_traj = [] + ee_state_list_select = [] + + ( + is_left, + select_arm, + select_arm_current_qpos, + select_arm_current_pose, + select_arm_current_gripper_state, + ) = get_arm_states(env, robot_name) + + # ---------------------------------------- Pose ---------------------------------------- + # Compute new joint positions + rotated_qpos = deepcopy(select_arm_current_qpos) + rotated_qpos[5] += np.deg2rad(degree) + + # Optional: limit checking (commented out by default) + # joint5_limit = env.get_joint_limits(select_arm)[5] + # if rotated_qpos[5] < joint5_limit[0] or rotated_qpos[5] > joint5_limit[1]: + # log_warning("Rotated qpos exceeds joint limits.\n") + + # Compute FK for new pose + rotated_pose = env.get_arm_fk( + qpos=rotated_qpos, + is_left=is_left, + ) + + # Update environment state + env.set_current_qpos_agent(rotated_qpos, is_left=is_left) + env.set_current_xpos_agent(rotated_pose, is_left=is_left) + + # ------------------------------------ Traj 1: init → rotated ------------------------------------ + qpos_list_init_to_rotated = [select_arm_current_qpos, rotated_qpos] + sample_num = kwargs.get("sample_num", 20) + + plan_trajectory( + env, + select_arm, + qpos_list_init_to_rotated, + sample_num, + select_arm_current_gripper_state, + select_qpos_traj, + ee_state_list_select, + ) + + # ---------------------------------------- Final ---------------------------------------- + actions = finalize_actions(select_qpos_traj, ee_state_list_select) + + log_info( + f"Total generated trajectory number for rotate eef: {len(actions)}.", + color="green", + ) + + return actions + + +def orient_eef( + robot_name: str, + direction: str = "front", # 'front' or 'down' + env=None, + force_valid=False, + **kwargs, +): + + # ---------------------------------------- Prepare ---------------------------------------- + select_qpos_traj = [] + ee_state_list_select = 
[] + + # Get arm state + ( + is_left, + select_arm, + select_arm_current_qpos, + select_arm_current_pose, + select_arm_current_gripper_state, + ) = get_arm_states(env, robot_name) + + # ---------------------------------------- Pose ---------------------------------------- + # Generate replacement rotation matrix + replaced_rotation_matrix = np.eye(4) + if direction == "front": + rotation_matrix = R.from_euler("xyz", [180, -90, 0], degrees=True).as_matrix() + replaced_rotation_matrix[:3, :3] = ( + rotation_matrix @ replaced_rotation_matrix[:3, :3] + ) + elif direction == "down": + rotation_matrix = R.from_euler("x", 180, degrees=True).as_matrix() + replaced_rotation_matrix[:3, :3] = ( + rotation_matrix @ replaced_rotation_matrix[:3, :3] + ) + else: + log_error("Rotation direction must be 'front' or 'down'.") + + rotation_replaced_pose = deepcopy(select_arm_current_pose) + rot_torch = torch.as_tensor( + replaced_rotation_matrix[:3, :3], + dtype=rotation_replaced_pose.dtype, + device=rotation_replaced_pose.device, + ) + rotation_replaced_pose[:3, :3] = rot_torch + + # Solve IK for the new pose + replace_target_qpos = get_qpos( + env, + is_left, + select_arm, + rotation_replaced_pose, + select_arm_current_qpos, + force_valid=force_valid, + name="replaced-rotation", + ) + + # ---------------------------------------- Update env ---------------------------------------- + env.set_current_qpos_agent(replace_target_qpos, is_left=is_left) + env.set_current_xpos_agent(rotation_replaced_pose, is_left=is_left) + + # ------------------------------------ Traj: init → target ------------------------------------ + qpos_list_init_to_rotated = [select_arm_current_qpos, replace_target_qpos] + sample_num = kwargs.get("sample_num", 20) + + plan_trajectory( + env, + select_arm, + qpos_list_init_to_rotated, + sample_num, + select_arm_current_gripper_state, + select_qpos_traj, + ee_state_list_select, + ) + + # ---------------------------------------- Final ---------------------------------------- + actions = finalize_actions(select_qpos_traj, ee_state_list_select) + + log_info( + f"Total generated trajectory number for orient eef: {len(actions)}.", + color="green", + ) + + return actions + + +def close_gripper(robot_name: str, env=None, **kwargs): + + # ---------------------------------------- Prepare ---------------------------------------- + select_qpos_traj = [] + ee_state_list_select = [] + + ( + is_left, + select_arm, + select_arm_current_qpos, + select_arm_current_pose, + select_arm_current_gripper_state, + ) = get_arm_states(env, robot_name) + + # ---------------------------------------- Traj ---------------------------------------- + sample_num = kwargs.get("sample_num", 15) + execute_open = False # False → closing motion + + plan_gripper_trajectory( + env, + is_left, + sample_num, + execute_open, + select_arm_current_qpos, + select_qpos_traj, + ee_state_list_select, + ) + + # ---------------------------------------- Final ---------------------------------------- + actions = finalize_actions(select_qpos_traj, ee_state_list_select) + + log_info( + f"Total generated trajectory number for close gripper: {len(actions)}.", + color="green", + ) + + return actions + + +def open_gripper(robot_name: str, env=None, **kwargs): + + # ---------------------------------------- Prepare ---------------------------------------- + select_qpos_traj = [] + ee_state_list_select = [] + + ( + is_left, + select_arm, + select_arm_current_qpos, + select_arm_current_pose, + select_arm_current_gripper_state, + ) = get_arm_states(env, 
robot_name) + + # ---------------------------------------- Traj ---------------------------------------- + sample_num = kwargs.get("sample_num", 15) + execute_open = True # True → opening motion + + plan_gripper_trajectory( + env, + is_left, + sample_num, + execute_open, + select_arm_current_qpos, + select_qpos_traj, + ee_state_list_select, + ) + + # ---------------------------------------- Final ---------------------------------------- + actions = finalize_actions(select_qpos_traj, ee_state_list_select) + + log_info( + f"Total generated trajectory number for open gripper: {len(actions)}.", + color="green", + ) + + return actions + + +def drive( + left_arm_action=None, + right_arm_action=None, + env=None, + **kwargs, +): + + if left_arm_action is not None and right_arm_action is not None: + len_left = len(left_arm_action) + len_right = len(right_arm_action) + + if len_left < len_right: + diff = len_right - len_left + padding = np.repeat(left_arm_action[-1:], diff, axis=0) + left_arm_action = np.concatenate([left_arm_action, padding], axis=0) + elif len_right < len_left: + diff = len_left - len_right + padding = np.repeat(right_arm_action[-1:], diff, axis=0) + right_arm_action = np.concatenate([right_arm_action, padding], axis=0) + + left_arm_index = env.left_arm_joints + env.left_eef_joints + right_arm_index = env.right_arm_joints + env.right_eef_joints + actions = np.zeros((len(right_arm_action), len(env.init_qpos))) + actions[:, left_arm_index] = left_arm_action + actions[:, right_arm_index] = right_arm_action + + elif left_arm_action is None and right_arm_action is not None: + left_arm_index = env.left_arm_joints + env.left_eef_joints + right_arm_index = env.right_arm_joints + env.right_eef_joints + left_arm_action = finalize_actions( + env.left_arm_current_qpos, env.left_arm_current_gripper_state + ) + left_arm_action = np.repeat( + left_arm_action[None, :], len(right_arm_action), axis=0 + ) + + actions = np.zeros( + (len(right_arm_action), len(env.robot.get_qpos().squeeze(0))), + dtype=np.float32, + ) + actions[:, left_arm_index] = left_arm_action + actions[:, right_arm_index] = right_arm_action + + elif right_arm_action is None and left_arm_action is not None: + left_arm_index = env.left_arm_joints + env.left_eef_joints + right_arm_index = env.right_arm_joints + env.right_eef_joints + right_arm_action = finalize_actions( + env.right_arm_current_qpos, env.right_arm_current_gripper_state + ) + right_arm_action = np.repeat( + right_arm_action[None, :], len(left_arm_action), axis=0 + ) + + actions = np.zeros( + (len(left_arm_action), len(env.robot.get_qpos().squeeze(0))), + dtype=np.float32, + ) + actions[:, left_arm_index] = left_arm_action + actions[:, right_arm_index] = right_arm_action + + else: + log_error("At least one arm action should be provided.") + + actions = torch.from_numpy(actions).to(dtype=torch.float32).unsqueeze(1) + actions = list(actions.unbind(dim=0)) + for i in tqdm(range(len(actions))): + action = actions[i] + env.step(action) + return actions + + +def save_observations( + step_id: int = 0, + step_name: str = None, + env=None, + **kwargs, +): + # When using feedback script + log_dir = kwargs.get("log_dir") + if log_dir: + save_dir = log_dir / "camera_images" + + # Prepare subfolder: {id}_generate_num/episode{current_check_num} + gen_id = kwargs.get("id", "unknown_id") + episode_id = kwargs.get("current_check_num", 0) + + sub_dir = save_dir / f"{gen_id}_generate_num" / f"episode{episode_id}" + sub_dir.mkdir(parents=True, exist_ok=True) + + # Encode image to Base64 + 
base64_image = encode_image(env.get_obs_for_agent()["rgb"]) + + # Decode Base64 back to raw image bytes + import base64 + + img_bytes = base64.b64decode(base64_image) + + # Ensure step_name is not None + step_name = step_name if step_name is not None else "unnamed_step" + + # Save the decoded image + output_path = sub_dir / f"step{step_id}_{step_name}.png" + with open(output_path, "wb") as f: + f.write(img_bytes) + + # Print save info + log_info(f"[save_observations] Saved image to: {output_path}") + + # When only running the script (no feedback script) + else: + pass diff --git a/embodichain/toolkits/toolkits.py b/embodichain/toolkits/toolkits.py new file mode 100644 index 00000000..5d388fca --- /dev/null +++ b/embodichain/toolkits/toolkits.py @@ -0,0 +1,33 @@ +# ---------------------------------------------------------------------------- +# Copyright (c) 2021-2025 DexForce Technology Co., Ltd. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# ---------------------------------------------------------------------------- + +from abc import ABCMeta, abstractmethod +import os +from embodichain.utils.utility import load_json + + +class ToolkitsBase(metaclass=ABCMeta): + @classmethod + def from_config(cls, path: str): + assert ( + os.path.basename(path).split(".")[-1] == "json" + ), "only json file is supported." + config = load_json(path) + return config["ToolKits"][cls.__name__] + + @abstractmethod + def call(self, **kwargs): + pass diff --git a/pyproject.toml b/pyproject.toml index 50fef254..6f5d8bf7 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -29,6 +29,8 @@ dependencies = [ "dexsim_engine==0.3.9", "setuptools>=78.1.1", "gymnasium>=0.29.1", + "langchain", + "langchain-openai", "toppra==0.6.3", "pin", "pin-pink", @@ -49,6 +51,8 @@ dependencies = [ "black==24.3.0", "fvcore", "h5py", + "zmq==0.0.0", + "pycocotools", ] [project.optional-dependencies]
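
Illustrative usage (a minimal sketch, not introduced by this change, assuming only the helpers added in embodichain/toolkits/code_generation.py and embodichain/toolkits/interfaces.py; the model reply text and the names inside it are made up for the example and are never executed here):

from embodichain.toolkits.code_generation import get_executable_code_str
from embodichain.toolkits.interfaces import extract_drive_calls

# A made-up LLM reply that wraps generated code in a ```python fence.
llm_reply = (
    "Here is the plan:\n"
    "```python\n"
    'actions = grasp("right_arm", "bottle", env=env)\n'
    "drive(right_arm_action=actions, env=env)\n"
    'drive(left_arm_action=open_gripper("left_arm", env=env), env=env)\n'
    "```"
)

code_str = get_executable_code_str(llm_reply)  # strips the markdown fence
blocks = extract_drive_calls(code_str)         # keeps only top-level drive(...) statements
print(len(blocks))  # 2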