diff --git a/predicators/approaches/bilevel_planning_approach.py b/predicators/approaches/bilevel_planning_approach.py index cb8ca38347..4f67894bad 100644 --- a/predicators/approaches/bilevel_planning_approach.py +++ b/predicators/approaches/bilevel_planning_approach.py @@ -72,6 +72,11 @@ def _solve(self, task: Task, timeout: int) -> Callable[[State], Action]: for act in nsrt_plan: logging.debug(act) + for act in nsrt_plan: + print(act) + import ipdb + ipdb.set_trace() + # Run full bilevel planning. else: option_plan, nsrt_plan, metrics = self._run_sesame_plan( diff --git a/predicators/approaches/documentation/grammar_search_invention_approach.md b/predicators/approaches/documentation/grammar_search_invention_approach.md index 7d629f0d78..c527e747c0 100644 --- a/predicators/approaches/documentation/grammar_search_invention_approach.md +++ b/predicators/approaches/documentation/grammar_search_invention_approach.md @@ -62,7 +62,7 @@ apple_coring__vlm_demos__456__2 | options.txt ### Running predicate invention using these image demos -To use the Gemini VLM, you need to set the `GOOGLE_API_KEY` environment variable in your terminal. You can make/get an API key [here](https://aistudio.google.com/app/apikey). +To use the Gemini VLM, you need to set the `GOOGLE_API_KEY` environment variable in your terminal. You can make/get an API key [here](https://aistudio.google.com/app/apikey). To use an OpenAI VLM, the `OPENAI_API_KEY` variable needs to be set. 
Example command: `python predicators/main.py --env apple_coring --seed 456 --approach grammar_search_invention --excluded_predicates all --num_train_tasks 1 --num_test_tasks 0 --offline_data_method saved_vlm_img_demos_folder --vlm_trajs_folder_name apple_coring__vlm_demos__456__1` diff --git a/predicators/approaches/spot_wrapper_approach.py b/predicators/approaches/spot_wrapper_approach.py index 85af72d148..ba82a42f92 100644 --- a/predicators/approaches/spot_wrapper_approach.py +++ b/predicators/approaches/spot_wrapper_approach.py @@ -18,6 +18,7 @@ from predicators import utils from predicators.approaches import BaseApproach, BaseApproachWrapper from predicators.envs.spot_env import get_detection_id_for_object, get_robot +from predicators.settings import CFG from predicators.spot_utils.skills.spot_find_objects import find_objects from predicators.spot_utils.skills.spot_stow_arm import stow_arm from predicators.spot_utils.utils import get_allowed_map_regions @@ -66,7 +67,7 @@ def _policy(state: State) -> Action: state.get(obj, "lost") > 0.5: lost_objects.add(obj) # Need to find the objects. - if lost_objects: + if lost_objects and len(CFG.spot_vlm_teleop_demo_folderpath) == 0: logging.info(f"[Spot Wrapper] Lost objects: {lost_objects}") # Reset the base approach policy. 
base_approach_policy = None diff --git a/predicators/envs/spot_env.py b/predicators/envs/spot_env.py index 911ff34dd4..af7a944e14 100644 --- a/predicators/envs/spot_env.py +++ b/predicators/envs/spot_env.py @@ -6,7 +6,7 @@ import time from dataclasses import dataclass from pathlib import Path -from typing import Callable, ClassVar, Collection, Dict, Iterator, List, \ +from typing import Any, Callable, ClassVar, Collection, Dict, Iterator, List, \ Optional, Sequence, Set, Tuple import matplotlib @@ -42,7 +42,7 @@ from predicators.spot_utils.spot_localization import SpotLocalizer from predicators.spot_utils.utils import _base_object_type, _broom_type, \ _container_type, _dustpan_type, _immovable_object_type, \ - _movable_object_type, _robot_type, _wrappers_type, \ + _movable_object_type, _robot_type, _table_type, _wrappers_type, \ construct_state_given_pbrspot, get_allowed_map_regions, \ get_graph_nav_dir, get_robot_gripper_open_percentage, get_spot_home_pose, \ load_spot_metadata, object_to_top_down_geom, update_pbrspot_given_state, \ @@ -84,6 +84,9 @@ class _SpotObservation: # A placeholder until all predicates have classifiers nonpercept_atoms: Set[GroundAtom] nonpercept_predicates: Set[Predicate] + # Object detections per camera in self.images. + object_detections_per_camera: Dict[str, List[Tuple[ObjectDetectionID, + SegmentedBoundingBox]]] @dataclass(frozen=True) @@ -140,10 +143,8 @@ def allclose(self, other: State) -> bool: def copy(self) -> State: state_copy = {o: self._copy_state_value(self.data[o]) for o in self} - sim_state_copy = { - "predicates": self._simulator_state_predicates.copy(), - "atoms": self._simulator_state_atoms.copy() - } + if self.simulator_state is not None: + sim_state_copy = self.simulator_state.copy() return _PartialPerceptionState(state_copy, simulator_state=sim_state_copy) @@ -758,7 +759,18 @@ def _build_realworld_observation( break if response == "n": break - + # Construct the detections per camera. 
+ obj_detections_per_camera: Dict[str, + List[Tuple[ObjectDetectionID, + SegmentedBoundingBox]]] = { + k: [] + for k in rgbds.keys() + } + for object_id, d in all_artifacts['language'][ + 'object_id_to_img_detections'].items(): + for camera_name, seg_bb in d.items(): + obj_detections_per_camera[camera_name].append( + (object_id, seg_bb)) # Prepare the non-percepts. nonpercept_preds = self.predicates - self.percept_predicates assert all(a.predicate in nonpercept_preds for a in ground_atoms) @@ -766,7 +778,8 @@ def _build_realworld_observation( objects_in_hand_view, objects_in_any_view_except_back, self._spot_object, gripper_open_percentage, - robot_pos, ground_atoms, nonpercept_preds) + robot_pos, ground_atoms, nonpercept_preds, + obj_detections_per_camera) return obs @@ -857,7 +870,8 @@ def _actively_construct_env_task(self) -> EnvironmentTask: # an initial observation. assert self._robot is not None assert self._localizer is not None - objects_in_view = self._actively_construct_initial_object_views() + objects_in_view, artifacts = self._actively_construct_initial_object_views( + ) rgbd_images = capture_images(self._robot, self._localizer) gripper_open_percentage = get_robot_gripper_open_percentage( self._robot) @@ -866,9 +880,20 @@ def _actively_construct_env_task(self) -> EnvironmentTask: nonpercept_atoms = self._get_initial_nonpercept_atoms() nonpercept_preds = self.predicates - self.percept_predicates assert all(a.predicate in nonpercept_preds for a in nonpercept_atoms) + # Construct the detections per camera. 
+ obj_detections_per_camera: Dict[str, List[Tuple[ + ObjectDetectionID, + SegmentedBoundingBox]]] = {k: [] + for k in rgbd_images.keys()} + for object_id, d in artifacts['language'][ + 'object_id_to_img_detections'].items(): + for camera_name, seg_bb in d.items(): + obj_detections_per_camera[camera_name].append( + (object_id, seg_bb)) obs = _SpotObservation(rgbd_images, objects_in_view, set(), set(), self._spot_object, gripper_open_percentage, - robot_pos, nonpercept_atoms, nonpercept_preds) + robot_pos, nonpercept_atoms, nonpercept_preds, + obj_detections_per_camera) goal_description = self._generate_goal_description() task = EnvironmentTask(obs, goal_description) # Save the task for future use. @@ -985,41 +1010,40 @@ def _load_task_from_json(self, json_file: Path) -> EnvironmentTask: # Prepare the non-percepts. nonpercept_atoms = self._get_initial_nonpercept_atoms() nonpercept_preds = self.predicates - self.percept_predicates - init_obs = _SpotObservation( - images, - objects_in_view, - set(), - set(), - robot, - gripper_open_percentage, - robot_pos, - nonpercept_atoms, - nonpercept_preds, - ) + init_obs = _SpotObservation(images, objects_in_view, set(), set(), + robot, gripper_open_percentage, robot_pos, + nonpercept_atoms, nonpercept_preds, {}) # The goal can remain the same. 
goal = base_env_task.goal_description return EnvironmentTask(init_obs, goal) def _actively_construct_initial_object_views( - self) -> Dict[Object, math_helpers.SE3Pose]: + self) -> Tuple[Dict[Object, math_helpers.SE3Pose], Dict[str, Any]]: assert self._robot is not None assert self._localizer is not None stow_arm(self._robot) - go_home(self._robot, self._localizer) + # go_home(self._robot, self._localizer) self._localizer.localize() detection_ids = self._detection_id_to_obj.keys() - detections = self._run_init_search_for_objects(set(detection_ids)) + detections, artifacts = self._run_init_search_for_objects( + set(detection_ids)) stow_arm(self._robot) obj_to_se3_pose = { self._detection_id_to_obj[det_id]: val for (det_id, val) in detections.items() } self._last_known_object_poses.update(obj_to_se3_pose) - return obj_to_se3_pose + # Move the robot into a good place to construct the initial state + # by running VLM predicates. + prompt = "Finished initial search for objects. Take control of the robot and move it into a good initial location for constructing the initial state of the task. Press 'Enter' when done!" 
+ _ = input(prompt) + assert self._lease_client is not None + self._lease_client.take() + return obj_to_se3_pose, artifacts def _run_init_search_for_objects( self, detection_ids: Set[ObjectDetectionID] - ) -> Dict[ObjectDetectionID, math_helpers.SE3Pose]: + ) -> Tuple[Dict[ObjectDetectionID, math_helpers.SE3Pose], Dict[str, Any]]: """Have the hand look down from high up at first.""" assert self._robot is not None assert self._localizer is not None @@ -1041,7 +1065,8 @@ def _run_init_search_for_objects( no_detections_outfile = outdir / f"no_detections_{time_str}.png" visualize_all_artifacts(artifacts, detections_outfile, no_detections_outfile) - return detections + self._lease_client + return detections, artifacts @property @abc.abstractmethod @@ -1075,11 +1100,8 @@ def _generate_goal_description(self) -> GoalDescription: ## Types _ALL_TYPES = { - _robot_type, - _base_object_type, - _movable_object_type, - _immovable_object_type, - _container_type, + _robot_type, _base_object_type, _movable_object_type, + _immovable_object_type, _container_type, _table_type } @@ -1483,6 +1505,24 @@ def _get_vlm_query_str(pred_name: str, objects: Sequence[Object]) -> str: _VLMOn = utils.create_vlm_predicate("VLMOn", [_movable_object_type, _base_object_type], lambda o: _get_vlm_query_str("OnTopOf", o)) +_VLMOnTable = utils.create_vlm_predicate( + "VLMOnTable", [_movable_object_type, _table_type], + lambda o: _get_vlm_query_str("OnTopTable", o)) +_VLMOnFloor = utils.create_vlm_predicate( + "VLMOnFloor", [_movable_object_type], + lambda o: _get_vlm_query_str("OnFloor", o)) +_TableClear = utils.create_vlm_predicate( + "TableClear", [_table_type], + lambda o: _get_vlm_query_str("ClearOfObjects", o)) +_CanBeUsedForErasing = utils.create_vlm_predicate( + "CanBeUsedForErasing", [_base_object_type], + lambda o: _get_vlm_query_str("CanBeUsedForErasing", o)) +_TableWiped = utils.create_vlm_predicate( + "TableWiped", [_table_type], + lambda o: _get_vlm_query_str("WipedOfMarkerScribbles", o)) 
+_TableClean = utils.create_vlm_predicate( + "TableClean", [_table_type], + lambda o: _get_vlm_query_str("CleanOfObjectsAndMarkings", o)) _Upright = utils.create_vlm_predicate( "Upright", [_movable_object_type], lambda o: _get_vlm_query_str("Upright", o)) @@ -1491,7 +1531,7 @@ def _get_vlm_query_str(pred_name: str, objects: Sequence[Object]) -> str: lambda o: _get_vlm_query_str("Toasted", o)) _VLMIn = utils.create_vlm_predicate( "VLMIn", [_movable_object_type, _immovable_object_type], - lambda o: _get_vlm_query_str("In", o)) + lambda o: _get_vlm_query_str("Inside", o)) _Open = utils.create_vlm_predicate("Open", [_movable_object_type], lambda o: _get_vlm_query_str("Open", o)) _Stained = utils.create_vlm_predicate( @@ -1513,8 +1553,14 @@ def _get_vlm_query_str(pred_name: str, objects: Sequence[Object]) -> str: } _VLM_PREDICATES = { _VLMOn, + _VLMOnTable, + _VLMOnFloor, + _TableClear, + _TableWiped, + _TableClean, _Upright, _Toasted, + _CanBeUsedForErasing, _VLMIn, _Open, _Stained, @@ -1996,7 +2042,7 @@ def _dry_simulate_move_to_view_hand( robot_pos=robot_pose, nonpercept_atoms=nonpercept_atoms, nonpercept_predicates=last_obs.nonpercept_predicates, - ) + object_detections_per_camera={}) return next_obs @@ -2028,7 +2074,7 @@ def _dry_simulate_move_to_reach_obj( robot_pos=robot_pose, nonpercept_atoms=nonpercept_atoms, nonpercept_predicates=last_obs.nonpercept_predicates, - ) + object_detections_per_camera={}) return next_obs @@ -2066,7 +2112,7 @@ def _dry_simulate_pick_from_top( robot_pos=robot_pose, nonpercept_atoms=nonpercept_atoms, nonpercept_predicates=last_obs.nonpercept_predicates, - ) + object_detections_per_camera={}) return next_obs @@ -2138,7 +2184,7 @@ def _dry_simulate_place_on_top( robot_pos=robot_pose, nonpercept_atoms=nonpercept_atoms, nonpercept_predicates=last_obs.nonpercept_predicates, - ) + object_detections_per_camera={}) return next_obs @@ -2182,7 +2228,7 @@ def _dry_simulate_drop_inside( robot_pos=robot_pose, nonpercept_atoms=nonpercept_atoms, 
nonpercept_predicates=last_obs.nonpercept_predicates, - ) + object_detections_per_camera={}) return next_obs @@ -2229,7 +2275,7 @@ def _dry_simulate_drag(last_obs: _SpotObservation, held_obj: Object, robot_pos=robot_pose, nonpercept_atoms=nonpercept_atoms, nonpercept_predicates=last_obs.nonpercept_predicates, - ) + object_detections_per_camera={}) return next_obs @@ -2284,7 +2330,7 @@ def _dry_simulate_prepare_container_for_sweeping( robot_pos=robot_pose, nonpercept_atoms=nonpercept_atoms, nonpercept_predicates=last_obs.nonpercept_predicates, - ) + object_detections_per_camera={}) return next_obs @@ -2362,7 +2408,7 @@ def _dry_simulate_sweep_into_container( robot_pos=robot_pose, nonpercept_atoms=nonpercept_atoms, nonpercept_predicates=last_obs.nonpercept_predicates, - ) + object_detections_per_camera={}) return next_obs @@ -2388,7 +2434,7 @@ def _dry_simulate_drop_not_placeable_object( robot_pos=robot_pose, nonpercept_atoms=nonpercept_atoms, nonpercept_predicates=last_obs.nonpercept_predicates, - ) + object_detections_per_camera={}) return next_obs @@ -2410,7 +2456,7 @@ def _dry_simulate_noop(last_obs: _SpotObservation, robot_pos=robot_pose, nonpercept_atoms=nonpercept_atoms, nonpercept_predicates=last_obs.nonpercept_predicates, - ) + object_detections_per_camera={}) return next_obs @@ -2439,21 +2485,22 @@ def _dry_simulate_pick_and_dump_container( robot_pos=obs.robot_pos, nonpercept_atoms=nonpercept_atoms, nonpercept_predicates=last_obs.nonpercept_predicates, - ) + object_detections_per_camera={}) return next_obs ############################################################################### -# VLM Generic Test Env # +# VLM No Teleop Test Env # ############################################################################### class SpotMinimalVLMPredicateEnv(SpotRearrangementEnv): """An abstract env that makes it easy to test the VLM-based predicate invention and evaluation pipeline on the real Spot robot. 
Importantly note that every env that inherits from this doesn't - require a map. Rather, it just works directly without a map. TODO: - see if this assumption is actually tenable, or if we need to remove - it? + require a map. Rather, it just works directly without a map. This + means that we don't get to use all the nice navigation skills that + we have on the robot. To use those, we need to use a variant of the + SpotRearrangementEnv from above. """ def __init__(self, use_gui: bool = True) -> None: #pylint:disable=super-init-not-called @@ -2469,7 +2516,10 @@ def __init__(self, use_gui: bool = True) -> None: #pylint:disable=super-init-no self._last_action: Optional[Action] = None # Create constant objects. self._spot_object = Object("robot", _robot_type) - op_to_name = {o.name: o for o in self._create_operators()} + op_to_name = {o.name: o + for o in self._create_operators() + } | {o.name: o + for o in _create_operators()} self._strips_operators = { op_to_name[o] for o in self.op_names_to_keep() @@ -2707,7 +2757,7 @@ def goal_predicates(self) -> Set[Predicate]: @classmethod def get_name(cls) -> str: - return "spot_vlm_cup_table_env" + return "spot_vlm_simple_cup_table_env" @property def _detection_id_to_obj(self) -> Dict[ObjectDetectionID, Object]: @@ -2764,6 +2814,135 @@ def _generate_goal_description(self) -> GoalDescription: return "get the cup onto the table!" 
+class SimpleTableWipingEnv(SpotMinimalVLMPredicateEnv): + """An environment to test the task of actually clearing objects from a + table and then wiping the table.""" + + @property + def predicates(self) -> Set[Predicate]: + preds = set(p for p in _ALL_PREDICATES | _VLM_PREDICATES if p.name in [ + "Holding", "HandEmpty", "NotHolding", "Inside", "VLMOnTable", + "VLMIn", "CanBeUsedForErasing", "TableClean", "TableWiped", + "TableClear", "VLMOnFloor" + ]) + return preds + + @property + def goal_predicates(self) -> Set[Predicate]: + return self.predicates + + @classmethod + def get_name(cls) -> str: + return "spot_vlm_simple_table_wiping_env" + + @property + def _detection_id_to_obj(self) -> Dict[ObjectDetectionID, Object]: + detection_id_to_obj: Dict[ObjectDetectionID, Object] = {} + objects = { + Object("clear_plastic_trash_can", _immovable_object_type), + Object("fluffy_toy_duster", _movable_object_type), + Object("apple", _movable_object_type), + Object("childrens_play_table", _table_type), + } + for o in objects: + detection_id = LanguageObjectDetectionID(o.name) + detection_id_to_obj[detection_id] = o + return detection_id_to_obj + + def _create_operators(self) -> Iterator[STRIPSOperator]: + # Pick object to clear table. 
+ robot = Variable("?robot", _robot_type) + obj = Variable("?object", _movable_object_type) + surface = Variable("?table", _table_type) + parameters = [robot, obj, surface] + preconds: Set[LiftedAtom] = { + LiftedAtom(_HandEmpty, [robot]), + LiftedAtom(_NotHolding, [robot, obj]), + LiftedAtom(_VLMOnTable, [obj, surface]), + } + add_effs: Set[LiftedAtom] = { + LiftedAtom(_Holding, [robot, obj]), + LiftedAtom(_TableClear, [surface]) + } + del_effs: Set[LiftedAtom] = { + LiftedAtom(_HandEmpty, [robot]), + LiftedAtom(_NotHolding, [robot, obj]), + LiftedAtom(_VLMOnTable, [obj, surface]), + } + ignore_effs: Set[Predicate] = set() + yield STRIPSOperator("TeleopPickToClearTable", parameters, preconds, + add_effs, del_effs, ignore_effs) + + # Picking from the floor. + robot = Variable("?robot", _robot_type) + obj = Variable("?object", _movable_object_type) + surface = Variable("?table", _table_type) + parameters = [robot, obj] + preconds: Set[LiftedAtom] = { + LiftedAtom(_HandEmpty, [robot]), + LiftedAtom(_NotHolding, [robot, obj]), + } + add_effs: Set[LiftedAtom] = { + LiftedAtom(_Holding, [robot, obj]), + } + del_effs: Set[LiftedAtom] = { + LiftedAtom(_HandEmpty, [robot]), + LiftedAtom(_NotHolding, [robot, obj]), + } + ignore_effs: Set[Predicate] = set() + yield STRIPSOperator("TeleopPickFromFloor", parameters, preconds, + add_effs, del_effs, ignore_effs) + + # Place object inside + robot = Variable("?robot", _robot_type) + obj = Variable("?object", _movable_object_type) + surf = Variable("?surf", _immovable_object_type) + parameters = [robot, obj, surf] + preconds = {LiftedAtom(_Holding, [robot, obj])} + add_effs = { + LiftedAtom(_HandEmpty, [robot]), + LiftedAtom(_NotHolding, [robot, obj]), + LiftedAtom(_VLMIn, [obj, surf]) + } + del_effs = {LiftedAtom(_Holding, [robot, obj])} + ignore_effs = set() + yield STRIPSOperator("TeleopPlaceInside", parameters, preconds, + add_effs, del_effs, ignore_effs) + + # Wipe surface + robot = Variable("?robot", _robot_type) + obj = 
Variable("?object", _movable_object_type) + surf = Variable("?surf", _table_type) + parameters = [robot, obj, surf] + preconds = { + LiftedAtom(_Holding, [robot, obj]), + LiftedAtom(_CanBeUsedForErasing, [obj]), + LiftedAtom(_TableClear, [surf]), + } + add_effs = { + LiftedAtom(_TableWiped, [surf]), + } + del_effs = {} + ignore_effs = set() + yield STRIPSOperator("TeleopWipe", parameters, preconds, add_effs, + del_effs, ignore_effs) + + def op_names_to_keep(self) -> Set[str]: + """Return the names of the operators we want to keep.""" + return { + "TeleopPickToClearTable", + "TeleopPlaceInside", + "TeleopWipe", + "MoveToReachObject", + "MoveToHandViewObject", + "TeleopPickFromFloor", + "PlaceObjectOnTop", + } + + def _generate_goal_description(self) -> GoalDescription: + return "clean up the table!" + + class DustpanSweepingTestEnv(SpotMinimalVLMPredicateEnv): """An environment to test a demo task of sweeping some wrappers into a dustpan.""" @@ -2826,8 +3005,8 @@ def _create_operators(self) -> Iterator[STRIPSOperator]: } del_effs = {LiftedAtom(_Holding, [robot, dustpan])} ignore_effs = set() - yield STRIPSOperator("PlaceNextTo", parameters, preconds, add_effs, - del_effs, ignore_effs) + yield STRIPSOperator("TeleopPlaceNextTo", parameters, preconds, + add_effs, del_effs, ignore_effs) # Pick(robot, broom) robot = Variable("?robot", _robot_type) @@ -2860,8 +3039,8 @@ def _create_operators(self) -> Iterator[STRIPSOperator]: add_effs = {LiftedAtom(_Inside, [mess, dustpan])} del_effs = set() ignore_effs = set() - yield STRIPSOperator("Sweep", parameters, preconds, add_effs, del_effs, - ignore_effs) + yield STRIPSOperator("TeleopSweep", parameters, preconds, add_effs, + del_effs, ignore_effs) # Place(robot, broom) robot = Variable("?robot", _robot_type) @@ -2874,14 +3053,14 @@ def _create_operators(self) -> Iterator[STRIPSOperator]: } del_effs = {LiftedAtom(_Holding, [robot, broom])} ignore_effs = set() - yield STRIPSOperator("PlaceOnFloor", parameters, preconds, 
add_effs, - del_effs, ignore_effs) + yield STRIPSOperator("TeleopPlaceOnFloor", parameters, preconds, + add_effs, del_effs, ignore_effs) def op_names_to_keep(self) -> Set[str]: """Return the names of the operators we want to keep.""" return { - "TeleopPick1", "PlaceNextTo", "TeleopPick2", "Sweep", - "PlaceOnFloor" + "TeleopPick1", "TeleopPlaceNextTo", "TeleopPick2", "TeleopSweep", + "TeleopPlaceOnFloor" } def _generate_goal_description(self) -> GoalDescription: @@ -2997,7 +3176,7 @@ def _get_dry_task(self, train_or_test: str, robot_pos=robot_pose, nonpercept_atoms=self._get_initial_nonpercept_atoms(), nonpercept_predicates=(self.predicates - self.percept_predicates), - ) + object_detections_per_camera={}) # Finish the task. goal_description = self._generate_goal_description() @@ -3421,7 +3600,7 @@ def _get_dry_task(self, train_or_test: str, robot_pos=robot_pose, nonpercept_atoms=self._get_initial_nonpercept_atoms(), nonpercept_predicates=(self.predicates - self.percept_predicates), - ) + object_detections_per_camera={}) # Finish the task. 
goal_description = self._generate_goal_description() @@ -3580,7 +3759,7 @@ def _detection_id_to_obj(self) -> Dict[ObjectDetectionID, Object]: red_block = Object("red_block", _movable_object_type) red_block_detection = LanguageObjectDetectionID( - "red block/orange block/yellow block") + "green block/red block/orange block/yellow block") detection_id_to_obj[red_block_detection] = red_block for obj, pose in get_known_immovable_objects().items(): @@ -3595,3 +3774,291 @@ def _generate_goal_description(self) -> GoalDescription: def _get_dry_task(self, train_or_test: str, task_idx: int) -> EnvironmentTask: raise NotImplementedError("Dry task generation not implemented.") + + +############################################################################### +# VLM Test Env with Map # +############################################################################### + + +class VLMCupEnv(SpotRearrangementEnv): + """A version of the SimpleVLMCupEnv, but with actual skills that the robot + can execute instead of relying on teleop.""" + + def __init__(self, use_gui: bool = True) -> None: + super().__init__(use_gui) + + op_to_name = {o.name: o for o in _create_operators()} + op_names_to_keep = { + "MoveToReachObject", + "MoveToHandViewObject", + "PickObjectFromTop", + } + self._strips_operators = {op_to_name[o] for o in op_names_to_keep} + # We add in a place operator that uses VLMOn instead + # of the typical 'OnTop' predicate. 
+ # PlaceObjectOnTop + robot = Variable("?robot", _robot_type) + held = Variable("?held", _movable_object_type) + surface = Variable("?surface", _immovable_object_type) + parameters = [robot, held, surface] + preconds = { + LiftedAtom(_Holding, [robot, held]), + LiftedAtom(_Reachable, [robot, surface]), + LiftedAtom(_NEq, [held, surface]), + LiftedAtom(_IsPlaceable, [held]), + LiftedAtom(_HasFlatTopSurface, [surface]), + LiftedAtom(_FitsInXY, [held, surface]), + } + add_effs = { + LiftedAtom(_VLMOn, [held, surface]), + LiftedAtom(_HandEmpty, [robot]), + LiftedAtom(_NotHolding, [robot, held]), + } + del_effs = { + LiftedAtom(_Holding, [robot, held]), + } + ignore_effs: Set[Predicate] = set() + self._strips_operators.add( + STRIPSOperator("PlaceObjectOnTop", parameters, preconds, add_effs, + del_effs, ignore_effs)) + + @property + def predicates(self) -> Set[Predicate]: + return set(p for p in _ALL_PREDICATES | _VLM_PREDICATES if p.name in [ + "Holding", "HandEmpty", "NotHolding", "Inside", "VLMOn", + "Reachable", "InHandView", "InView" + ]) + + @property + def goal_predicates(self) -> Set[Predicate]: + return self.predicates + + @classmethod + def get_name(cls) -> str: + return "spot_vlm_cup_table_env" + + @property + def _detection_id_to_obj(self) -> Dict[ObjectDetectionID, Object]: + detection_id_to_obj: Dict[ObjectDetectionID, Object] = {} + objects = { + Object("yellow_toy_cup", _movable_object_type), + Object("small_cardboard_box_with_black_tape", + _immovable_object_type), + } + for o in objects: + detection_id = LanguageObjectDetectionID(o.name) + detection_id_to_obj[detection_id] = o + + for obj, pose in get_known_immovable_objects().items(): + stat_detection_id = KnownStaticObjectDetectionID(obj.name, pose) + detection_id_to_obj[stat_detection_id] = obj + + return detection_id_to_obj + + def _generate_goal_description(self) -> GoalDescription: + return "get the cup onto the table!" 
+ + def _get_dry_task(self, train_or_test: str, + task_idx: int) -> EnvironmentTask: + raise NotImplementedError("Dry task generation not implemented.") + + +############################################################################### +# Table Wiping Env with Map # +############################################################################### + + +class VLMTableWipingEnv(SpotRearrangementEnv): + """A version of the SimpleTableWipingEnv, but with an actual map and some + skills that the robot can execute instead of relying on teleop. + + NOTE: for now, we mostly have teleop actions, but we intend to + replace them with actual skills in the future. + """ + + def __init__(self, use_gui: bool = True) -> None: + super().__init__(use_gui) + + # NOTE: temporary just to make data collection easier. + # Comment out this block below when actually running! + # op_to_name = {o.name: o for o in _create_operators()} + # op_names_to_keep = { + # "MoveToReachObject", + # "MoveToHandViewObject", + # } + # self._strips_operators = {op_to_name[o] for o in op_names_to_keep} + + # NOTE: temporary just to make data collection easier. + # Add teleop versions of MoveToReach and MoveToHandView! 
+ # MoveToReachObject + self._strips_operators = set() + robot = Variable("?robot", _robot_type) + obj = Variable("?object", _base_object_type) + parameters = [robot, obj] + preconds = { + LiftedAtom(_NotBlocked, [obj]), + LiftedAtom(_NotHolding, [robot, obj]), + } + add_effs = {LiftedAtom(_Reachable, [robot, obj])} + del_effs: Set[LiftedAtom] = set() + ignore_effs = { + _Reachable, _InHandView, _InView, _RobotReadyForSweeping + } + self._strips_operators.add( + STRIPSOperator("TeleopMoveToReachObject", parameters, preconds, + add_effs, del_effs, ignore_effs)) + + # MoveToHandViewObject + robot = Variable("?robot", _robot_type) + obj = Variable("?object", _movable_object_type) + parameters = [robot, obj] + preconds = { + LiftedAtom(_NotBlocked, [obj]), + LiftedAtom(_HandEmpty, [robot]) + } + add_effs = {LiftedAtom(_InHandView, [robot, obj])} + del_effs = set() + ignore_effs = { + _Reachable, _InHandView, _InView, _RobotReadyForSweeping + } + self._strips_operators.add( + STRIPSOperator("TeleopMoveToHandViewObject", parameters, preconds, + add_effs, del_effs, ignore_effs)) + + # We now add in specific operators for the table wiping task that + # are tied to specific teleop actions. + # Pick object to clear table. 
+ robot = Variable("?robot", _robot_type) + obj = Variable("?object", _movable_object_type) + surface = Variable("?table", _table_type) + parameters = [robot, obj, surface] + preconds: Set[LiftedAtom] = { + LiftedAtom(_InHandView, [robot, obj]), + LiftedAtom(_HandEmpty, [robot]), + LiftedAtom(_NotHolding, [robot, obj]), + LiftedAtom(_VLMOnTable, [obj, surface]), + } + add_effs: Set[LiftedAtom] = { + LiftedAtom(_Holding, [robot, obj]), + LiftedAtom(_TableClear, [surface]) + } + del_effs: Set[LiftedAtom] = { + LiftedAtom(_HandEmpty, [robot]), + LiftedAtom(_NotHolding, [robot, obj]), + LiftedAtom(_VLMOnTable, [obj, surface]), + LiftedAtom(_InHandView, [robot, obj]), + } + ignore_effs: Set[Predicate] = set() + self._strips_operators.add( + STRIPSOperator("TeleopPickToClearTable", parameters, preconds, + add_effs, del_effs, ignore_effs)) + + # Picking from the floor. + robot = Variable("?robot", _robot_type) + obj = Variable("?object", _movable_object_type) + surface = Variable("?table", _table_type) + parameters = [robot, obj] + preconds: Set[LiftedAtom] = { + LiftedAtom(_HandEmpty, [robot]), + LiftedAtom(_NotHolding, [robot, obj]), + LiftedAtom(_InHandView, [robot, obj]), + } + add_effs: Set[LiftedAtom] = { + LiftedAtom(_Holding, [robot, obj]), + } + del_effs: Set[LiftedAtom] = { + LiftedAtom(_HandEmpty, [robot]), + LiftedAtom(_NotHolding, [robot, obj]), + LiftedAtom(_InHandView, [robot, obj]), + } + ignore_effs: Set[Predicate] = set() + self._strips_operators.add( + STRIPSOperator("TeleopPickFromFloor", parameters, preconds, + add_effs, del_effs, ignore_effs)) + + # Place object inside + robot = Variable("?robot", _robot_type) + obj = Variable("?object", _movable_object_type) + surf = Variable("?surf", _immovable_object_type) + parameters = [robot, obj, surf] + preconds = { + LiftedAtom(_Holding, [robot, obj]), + LiftedAtom(_Reachable, [robot, surf]), + } + add_effs = { + LiftedAtom(_HandEmpty, [robot]), + LiftedAtom(_NotHolding, [robot, obj]), + LiftedAtom(_VLMIn, 
[obj, surf]) + } + del_effs = {LiftedAtom(_Holding, [robot, obj])} + ignore_effs = set() + self._strips_operators.add( + STRIPSOperator("TeleopPlaceInside", parameters, preconds, add_effs, + del_effs, ignore_effs)) + + # Wipe surface + robot = Variable("?robot", _robot_type) + obj = Variable("?object", _movable_object_type) + surf = Variable("?surf", _table_type) + parameters = [robot, obj, surf] + preconds = { + LiftedAtom(_Reachable, [robot, surf]), + LiftedAtom(_Holding, [robot, obj]), + LiftedAtom(_CanBeUsedForErasing, [obj]), + LiftedAtom(_TableClear, [surf]), + } + add_effs = { + LiftedAtom(_TableWiped, [surf]), + } + del_effs = {} + ignore_effs = set() + self._strips_operators.add( + STRIPSOperator("TeleopWipe", parameters, preconds, add_effs, + del_effs, ignore_effs)) + + @property + def predicates(self) -> Set[Predicate]: + preds = set(p for p in _ALL_PREDICATES | _VLM_PREDICATES if p.name in [ + "Holding", "HandEmpty", "NotHolding", "Inside", "VLMOnTable", + "VLMIn", "CanBeUsedForErasing", "TableClean", "TableWiped", + "TableClear", "VLMOnFloor", "InHandView", "Reachable" + ]) + return preds + + @property + def goal_predicates(self) -> Set[Predicate]: + return self.predicates + + @classmethod + def get_name(cls) -> str: + return "spot_vlm_table_wiping_env" + + @property + def _detection_id_to_obj(self) -> Dict[ObjectDetectionID, Object]: + detection_id_to_obj: Dict[ObjectDetectionID, Object] = {} + objects = { + Object("clear_plastic_trash_can", _immovable_object_type), + Object("fluffy_toy_duster", _movable_object_type), + } + for o in objects: + detection_id = LanguageObjectDetectionID(o.name) + detection_id_to_obj[detection_id] = o + + detection_id_to_obj[LanguageObjectDetectionID( + "coffee_table/surfboard")] = Object("table", _table_type) + detection_id_to_obj[LanguageObjectDetectionID( + "apple/red_ball")] = Object("apple", _movable_object_type) + + for obj, pose in get_known_immovable_objects().items(): + stat_detection_id = 
KnownStaticObjectDetectionID(obj.name, pose) + detection_id_to_obj[stat_detection_id] = obj + + return detection_id_to_obj + + def _generate_goal_description(self) -> GoalDescription: + return "clean up the table!" + + def _get_dry_task(self, train_or_test: str, + task_idx: int) -> EnvironmentTask: + raise NotImplementedError("Dry task generation not implemented.") diff --git a/predicators/execution_monitoring/expected_atoms_monitor.py b/predicators/execution_monitoring/expected_atoms_monitor.py index 4c3bd1b564..4d9b1f2c36 100644 --- a/predicators/execution_monitoring/expected_atoms_monitor.py +++ b/predicators/execution_monitoring/expected_atoms_monitor.py @@ -40,11 +40,20 @@ def step(self, state: State) -> bool: for a in (next_expected_atoms - next_expected_vlm_atoms) if not a.holds(state) } + # Query the VLM for the expected atom values by leveraging + # utils.abstract(). This is a bit ugly, but it's better than querying + # for the specific atoms we want to check - especially in the Spot env, + # we already have evaluated all the necessary VLM atoms and put them in + # the state. 
vlm_unsat_atoms = set() if len(next_expected_vlm_atoms) > 0: - vlm_unsat_atoms = next_expected_atoms - ( - utils.query_vlm_for_atom_vals(next_expected_vlm_atoms, - state)) # pragma: no cover + next_expected_vlm_preds = { + atom.predicate + for atom in next_expected_vlm_atoms + } + vlm_atom_vals = utils.abstract(state, next_expected_vlm_preds) + vlm_atoms_that_hold_in_state = next_expected_atoms & vlm_atom_vals + vlm_unsat_atoms = next_expected_vlm_atoms - vlm_atoms_that_hold_in_state unsat_atoms = non_vlm_unsat_atoms | vlm_unsat_atoms if not unsat_atoms: return False diff --git a/predicators/ground_truth_models/spot_env/nsrts.py b/predicators/ground_truth_models/spot_env/nsrts.py index 55dfbe571d..1c76ac9665 100644 --- a/predicators/ground_truth_models/spot_env/nsrts.py +++ b/predicators/ground_truth_models/spot_env/nsrts.py @@ -289,7 +289,8 @@ def get_env_names(cls) -> Set[str]: "spot_cube_env", "spot_soda_floor_env", "spot_soda_table_env", "spot_soda_bucket_env", "spot_soda_chair_env", "spot_main_sweep_env", "spot_ball_and_cup_sticky_table_env", - "spot_brush_shelf_env", "lis_spot_block_floor_env" + "spot_brush_shelf_env", "lis_spot_block_floor_env", + "spot_vlm_simple_table_wiping_env", "spot_vlm_table_wiping_env" } @staticmethod @@ -321,10 +322,7 @@ def get_nsrts(env_name: str, types: Dict[str, Type], "PrepareContainerForSweeping": _prepare_sweeping_sampler, "DropNotPlaceableObject": utils.null_sampler, "MoveToReadySweep": utils.null_sampler, - "TeleopPick1": utils.null_sampler, - "TeleopPlace1": utils.null_sampler, "PlaceNextTo": utils.null_sampler, - "TeleopPick2": utils.null_sampler, "Sweep": utils.null_sampler, "PlaceOnFloor": utils.null_sampler } @@ -337,7 +335,10 @@ def get_nsrts(env_name: str, types: Dict[str, Type], # similarly in the future. 
for strips_op in env.strips_operators: - sampler = operator_name_to_sampler[strips_op.name] + if "teleop" in strips_op.name.lower(): + sampler = utils.null_sampler + else: + sampler = operator_name_to_sampler[strips_op.name] option = options[strips_op.name] nsrt = strips_op.make_nsrt( option=option, diff --git a/predicators/ground_truth_models/spot_env/options.py b/predicators/ground_truth_models/spot_env/options.py index e5562c7ad3..75a9412138 100644 --- a/predicators/ground_truth_models/spot_env/options.py +++ b/predicators/ground_truth_models/spot_env/options.py @@ -963,12 +963,6 @@ def _teleop(robot: Robot, lease_client: LeaseClient) -> None: "PrepareContainerForSweeping": Box(-np.inf, np.inf, (3, )), # dx, dy, dyaw "DropNotPlaceableObject": Box(0, 1, (0, )), # empty "MoveToReadySweep": Box(0, 1, (0, )), # empty - "TeleopPick1": Box(0, 1, (0, )), # empty - "PlaceNextTo": Box(0, 1, (0, )), # empty - "TeleopPick2": Box(0, 1, (0, )), # empty - "TeleopPlace1": Box(0, 1, (0, )), # empty - "Sweep": Box(0, 1, (0, )), # empty - "PlaceOnFloor": Box(0, 1, (0, )) # empty } # NOTE: the policies MUST be unique because they output actions with extra info @@ -992,13 +986,6 @@ def _teleop(robot: Robot, lease_client: LeaseClient) -> None: "PrepareContainerForSweeping": _prepare_container_for_sweeping_policy, "DropNotPlaceableObject": _drop_not_placeable_object_policy, "MoveToReadySweep": _move_to_ready_sweep_policy, - "TeleopPick1": _create_teleop_policy_with_name("TeleopPick1"), - "PlaceNextTo": _create_teleop_policy_with_name("PlaceNextTo"), - "TeleopPlace": _create_teleop_policy_with_name("TeleopPlace"), - "TeleopPick2": _create_teleop_policy_with_name("TeleopPick2"), - "TeleopPlace1": _create_teleop_policy_with_name("TeleopPlace1"), - "Sweep": _create_teleop_policy_with_name("Sweep"), - "PlaceOnFloor": _create_teleop_policy_with_name("PlaceOnFloor") } @@ -1021,8 +1008,12 @@ def __init__(self, operator_name: str, types: List[Type]) -> None: 0, 1, (0, )) 
_OPERATOR_NAME_TO_POLICY[ "PickObjectFromTop"] = _sim_safe_pick_object_from_top_policy - params_space = _OPERATOR_NAME_TO_PARAM_SPACE[operator_name] - policy = _OPERATOR_NAME_TO_POLICY[operator_name] + if "teleop" in operator_name.lower(): + policy = _create_teleop_policy_with_name(operator_name) + params_space = Box(0, 1, (0, )) # null + else: + params_space = _OPERATOR_NAME_TO_PARAM_SPACE[operator_name] + policy = _OPERATOR_NAME_TO_POLICY[operator_name] super().__init__(operator_name, policy, types, params_space) def __reduce__(self) -> Tuple: @@ -1035,17 +1026,12 @@ class SpotEnvsGroundTruthOptionFactory(GroundTruthOptionFactory): @classmethod def get_env_names(cls) -> Set[str]: return { - "spot_vlm_dustpan_test_env", - "spot_vlm_cup_table_env", - "spot_cube_env", - "spot_soda_floor_env", - "spot_soda_table_env", - "spot_soda_bucket_env", - "spot_soda_chair_env", - "spot_main_sweep_env", - "spot_ball_and_cup_sticky_table_env", - "spot_brush_shelf_env", - "lis_spot_block_floor_env", + "spot_vlm_dustpan_test_env", "spot_vlm_cup_table_env", + "spot_cube_env", "spot_soda_floor_env", "spot_soda_table_env", + "spot_soda_bucket_env", "spot_soda_chair_env", + "spot_main_sweep_env", "spot_ball_and_cup_sticky_table_env", + "spot_brush_shelf_env", "lis_spot_block_floor_env", + "spot_vlm_simple_table_wiping_env", "spot_vlm_table_wiping_env" } @classmethod diff --git a/predicators/perception/spot_perceiver.py b/predicators/perception/spot_perceiver.py index 1a5f9114ca..2d80cc80de 100644 --- a/predicators/perception/spot_perceiver.py +++ b/predicators/perception/spot_perceiver.py @@ -3,7 +3,7 @@ import logging import time from pathlib import Path -from typing import Dict, List, Optional, Set +from typing import Dict, List, Optional, Set, Tuple import imageio.v2 as iio import numpy as np @@ -15,19 +15,103 @@ from predicators import utils from predicators.envs import BaseEnv, get_or_create_env from predicators.envs.spot_env import HANDEMPTY_GRIPPER_THRESHOLD, \ - SpotCubeEnv, 
SpotRearrangementEnv, _drafting_table_type, \ - _PartialPerceptionState, _SpotObservation, in_general_view_classifier + LanguageObjectDetectionID, ObjectDetectionID, RGBDImageWithContext, \ + SegmentedBoundingBox, SpotCubeEnv, SpotRearrangementEnv, \ + _drafting_table_type, _PartialPerceptionState, _SpotObservation, \ + in_general_view_classifier from predicators.perception.base_perceiver import BasePerceiver from predicators.settings import CFG from predicators.spot_utils.utils import _container_type, _dustpan_type, \ - _immovable_object_type, _movable_object_type, _robot_type, \ + _immovable_object_type, _movable_object_type, _robot_type, _table_type, \ _wrappers_type, get_allowed_map_regions, load_spot_metadata, \ object_to_top_down_geom from predicators.structs import Action, DefaultState, EnvironmentTask, \ GoalDescription, GroundAtom, Object, Observation, Predicate, \ SpotActionExtraInfo, State, Task, Video, VLMPredicate, _Option +# Helper functions. +CAMERA_NAME_TO_ANNOTATION = { + 'hand_color_image': "Hand Camera Image", + 'back_fisheye_image': "Back Camera Image", + 'frontleft_fisheye_image': "Front Left Camera Image", + 'frontright_fisheye_image': "Front Right Camera Image", + 'left_fisheye_image': "Left Camera Image", + 'right_fisheye_image': "Right Camera Image" +} + +def annotate_imgs_with_detections( + img_objects: Dict[str, RGBDImageWithContext], + object_detections_per_camera: Dict[str, List[Tuple[ObjectDetectionID, + SegmentedBoundingBox]]] +) -> List[PIL.Image.Image]: + """Annotate images via editing the pixesl directly to include object + detection bounding boxes and camera names.""" + img_names = [v.camera_name for _, v in img_objects.items()] + imgs = [v.rotated_rgb for _, v in img_objects.items()] + pil_imgs = [PIL.Image.fromarray(img) for img in imgs] # type: ignore + # Annotate images with detected objects (names + bounding box) + # and camera name. 
+ for i, camera_name in enumerate(img_names): + draw = ImageDraw.Draw(pil_imgs[i]) + # Annotate with camera name. + font = utils.get_scaled_default_font(draw, 4) + _ = utils.add_text_to_draw_img(draw, (0, 0), + CAMERA_NAME_TO_ANNOTATION[camera_name], + font) + # TODO: just commenting out for now to see if this helps labelling. + # # Annotate with object detections. + # detections = object_detections_per_camera[camera_name] + # for obj_id, seg_bb in detections: + # if isinstance(obj_id, LanguageObjectDetectionID): + # x0, y0, x1, y1 = seg_bb.bounding_box + # x0, x1 = sorted([x0, x1]) + # y0, y1 = sorted([y0, y1]) + # draw.rectangle([(x0, y0), (x1, y1)], outline='green', width=2) + # text = f"{obj_id.language_id}" + # font = utils.get_scaled_default_font(draw, 3) + # text_mask = font.getmask(text) # type: ignore + # text_width, text_height = text_mask.size + # text_bbox = [(x0, y0 - 1.5 * text_height), + # (x0 + text_width + 1, y0)] + # draw.rectangle(text_bbox, fill='green') + # draw.text((x0 + 1, y0 - 1.5 * text_height), + # text, + # fill='white', + # font=font) + annotated_imgs = list(pil_imgs) + return annotated_imgs + + +def save_annotated_imgs_for_vlm_demo(annotated_imgs: List[PIL.Image.Image], + save_dir: Path) -> None: + """Save annotated images useful images as part of creating demonstrations + for learning from teleop'ed Spot trajectories.""" + # If `save_dir` doesn't exist, create it. + save_dir.mkdir(parents=True, exist_ok=True) + # Collect all the names of folders within `save_dir`. + subfolders = [f for f in save_dir.iterdir() if f.is_dir()] + # Find the highest int that's a subfolder of `save_dir`. + prev_timestep = -1 + for folder in subfolders: + try: + folder_int = int(folder.name) + prev_timestep = max(prev_timestep, folder_int) + except ValueError: + # Skip folders that are not integers. + continue + # Add 1 to `prev_timestep` to get `curr_timestep`. 
+ curr_timestep = prev_timestep + 1 + curr_timestep_dir = save_dir / str(curr_timestep) + # Create the new folder for `curr_timestep`. + curr_timestep_dir.mkdir(parents=True, exist_ok=True) + # Save all the images as jpg files under the new folder. + for i, img in enumerate(annotated_imgs): + img_path = curr_timestep_dir / f"image_{i}.jpg" + img.save(img_path, format="JPEG") + + +# Main perceiver classes. class SpotPerceiver(BasePerceiver): """A perceiver specific to spot envs.""" @@ -57,6 +141,13 @@ def __init__(self) -> None: # Load static, hard-coded features of objects, like their shapes. meta = load_spot_metadata() self._static_object_features = meta.get("static-object-features", {}) + # Histories and other artefacts (for VLM labelling). + self._curr_state: Optional[State] = DefaultState + self._curr_state.simulator_state = {} + self._curr_annotated_imgs: List[PIL.Image.Image] = [] + self._state_history: List[State] = [] + self._executed_skill_history: List[Optional[_Option]] = [] + self._vlm_label_history: List[str] = [] @classmethod def get_name(cls) -> str: @@ -85,6 +176,15 @@ def reset(self, env_task: EnvironmentTask) -> Task: self._prev_action = None # already processed at the end of the cycle init_state = self._create_state() goal = self._create_goal(init_state, env_task.goal_description) + + # Reset run-specific things. + self._curr_state = DefaultState + self._curr_state.simulator_state = {} + self._state_history = [] + self._executed_skill_history = [] + self._vlm_label_history = [] + self._prev_action = None + return Task(init_state, goal) def update_perceiver_with_action(self, action: Action) -> None: @@ -96,6 +196,11 @@ def update_perceiver_with_action(self, action: Action) -> None: def step(self, observation: Observation) -> State: self._update_state_from_observation(observation) + # If we're trying to record VLM demos, then save the images. 
+ if len(CFG.spot_vlm_teleop_demo_folderpath) > 0: + save_annotated_imgs_for_vlm_demo( + self._curr_annotated_imgs, + Path(CFG.spot_vlm_teleop_demo_folderpath)) # Update the curr held item when applicable. assert self._curr_env is not None if self._prev_action is not None: @@ -166,7 +271,6 @@ def step(self, observation: Observation) -> State: logging.info("[Perceiver] An object was lost: " f"{prev_held_object} was lost!") self._lost_objects.add(prev_held_object) - return self._create_state() def _update_state_from_observation(self, observation: Observation) -> None: @@ -202,10 +306,13 @@ def _update_state_from_observation(self, observation: Observation) -> None: self._robot_pos = observation.robot_pos for obj in observation.objects_in_view: self._lost_objects.discard(obj) + self._curr_annotated_imgs = annotate_imgs_with_detections( + observation.images, observation.object_detections_per_camera) def _create_state(self) -> State: if self._waiting_for_observation: return DefaultState + assert self._curr_state is not None # Build the continuous part of the state. assert self._robot is not None state_dict = { @@ -284,10 +391,68 @@ def _create_state(self) -> State: # logging.info("Simulator state:") # logging.info(simulator_state) + # Add the images and histories into the simulator_state. + simulator_state["images"] = self._curr_annotated_imgs + # At the first timestep, these histories will be empty due to + # self.reset(). But at every timestep that isn't the first one, + # they will be non-empty. + simulator_state["state_history"] = list(self._state_history) + # We do this here so the call to `utils.abstract()` a few lines later + # has the skill that was just run. 
+ executed_skill = None + + if self._prev_action is not None: + assert self._prev_action.extra_info is not None + if self._prev_action.extra_info.action_name == "done": + # Just return the default state + return DefaultState + if self._prev_action.has_option(): + executed_skill = self._prev_action.get_option() + self._executed_skill_history.append( + executed_skill) # None in first timestep. + simulator_state["skill_history"] = list(self._executed_skill_history) + simulator_state["vlm_label_history"] = list(self._vlm_label_history) + + # Add to histories. + # A bit of extra work is required to build the VLM label history. + # We want to keep `utils.abstract()` as straightforward as possible, + # so we'll "rebuild" the VLM labels from the abstract state + # returned by `utils.abstract()`. And since we call this function, + # we might as well store the abstract state as a part of the simulator + # state so that we don't need to recompute it later in the approach or + # in planning. + assert self._curr_env is not None + preds = self._curr_env.predicates + state_copy = percept_state.copy() + state_copy.simulator_state = simulator_state + abstract_state = utils.abstract(state_copy, preds) + simulator_state["abstract_state"] = abstract_state + # Compute all the VLM atoms. `utils.abstract()` only returns the ones + # that are True. The remaining ones are the ones that are False. 
+ vlm_preds = set(pred for pred in preds + if isinstance(pred, VLMPredicate)) + vlm_atoms = set() + for pred in vlm_preds: + for choice in utils.get_object_combinations( + list(state_copy), pred.types): + vlm_atoms.add(GroundAtom(pred, choice)) + vlm_atoms_list = sorted(vlm_atoms) + reconstructed_all_vlm_responses = [] + for atom in vlm_atoms_list: + if atom in abstract_state: + truth_value = 'True' + else: + truth_value = 'False' + atom_label = f"* {atom.get_vlm_query_str()}: {truth_value}" + reconstructed_all_vlm_responses.append(atom_label) + str_vlm_response = '\n'.join(reconstructed_all_vlm_responses) + self._vlm_label_history.append(str_vlm_response) + # Now finish the state. state = _PartialPerceptionState(percept_state.data, simulator_state=simulator_state) - + self._curr_state = state + self._state_history.append(self._curr_state.copy()) return state def _create_goal(self, state: State, @@ -503,6 +668,35 @@ def _create_goal(self, state: State, GroundAtom(ContainerReadyForSweeping, [bucket, black_table]), GroundAtom(IsSweeper, [brush]) } + if goal_description == "get the cup onto the table!": + robot = Object("robot", _robot_type) + cup = Object("yellow_toy_cup", _movable_object_type) + table = Object("small_cardboard_box_with_black_tape", + _immovable_object_type) + HandEmpty = pred_name_to_pred["HandEmpty"] + VLMOn = pred_name_to_pred["VLMOn"] + goal = { + GroundAtom(HandEmpty, [robot]), + GroundAtom(VLMOn, [cup, table]) + } + return goal + if goal_description == "clean up the table!": + VLMIn = pred_name_to_pred["VLMIn"] + TableClean = pred_name_to_pred["TableClean"] + TableClear = pred_name_to_pred["TableClear"] + TableWiped = pred_name_to_pred["TableWiped"] + CanBeUsedForErasing = pred_name_to_pred["CanBeUsedForErasing"] + trash_can = Object("clear_plastic_trash_can", + _immovable_object_type) + apple = Object("apple", _movable_object_type) + table = Object("table", _table_type) + eraser = Object("fluffy_toy_duster", _movable_object_type) + robot = 
Object("robot", _robot_type) + goal = { + GroundAtom(VLMIn, [apple, trash_can]), + GroundAtom(TableWiped, [table]), + } + return goal raise NotImplementedError("Unrecognized goal description") def render_mental_images(self, observation: Observation, @@ -590,15 +784,6 @@ class SpotMinimalPerceiver(BasePerceiver): anything about. """ - camera_name_to_annotation = { - 'hand_color_image': "Hand Camera Image", - 'back_fisheye_image': "Back Camera Image", - 'frontleft_fisheye_image': "Front Left Camera Image", - 'frontright_fisheye_image': "Front Right Camera Image", - 'left_fisheye_image': "Left Camera Image", - 'right_fisheye_image': "Right Camera Image" - } - def render_mental_images(self, observation: Observation, env_task: EnvironmentTask) -> Video: raise NotImplementedError() @@ -636,9 +821,9 @@ def _create_goal(self, state: State, Inside = pred_name_to_pred["Inside"] Holding = pred_name_to_pred["Holding"] HandEmpty = pred_name_to_pred["HandEmpty"] - VLMOn = pred_name_to_pred["VLMOn"] if goal_description == "get the cup onto the table!": + VLMOn = pred_name_to_pred["VLMOn"] robot = Object("robot", _robot_type) cup = Object("yellow_toy_cup", _movable_object_type) table = Object("cardboard_table", _immovable_object_type) @@ -656,6 +841,23 @@ def _create_goal(self, state: State, GroundAtom(Holding, [robot, dustpan]) } return goal + if goal_description == "clean up the table!": + VLMIn = pred_name_to_pred["VLMIn"] + TableClean = pred_name_to_pred["TableClean"] + TableClear = pred_name_to_pred["TableClear"] + TableWiped = pred_name_to_pred["TableWiped"] + CanBeUsedForErasing = pred_name_to_pred["CanBeUsedForErasing"] + trash_can = Object("clear_plastic_trash_can", + _immovable_object_type) + apple = Object("apple", _movable_object_type) + table = Object("childrens_play_table", _table_type) + eraser = Object("neon_green_fluffy_eraser", _movable_object_type) + robot = Object("robot", _robot_type) + goal = { + GroundAtom(VLMIn, [apple, trash_can]), + GroundAtom(TableWiped, 
[table]), + } + return goal raise NotImplementedError("Unrecognized goal description") @@ -689,47 +891,20 @@ def step(self, observation: Observation) -> State: self._waiting_for_observation = False self._robot = observation.robot - img_objects = observation.rgbd_images # RGBDImage objects - img_names = [v.camera_name for _, v in img_objects.items()] - imgs = [v.rotated_rgb for _, v in img_objects.items()] - pil_imgs = [PIL.Image.fromarray(img) for img in imgs] # type: ignore - # Annotate images with detected objects (names + bounding box) - # and camera name. - object_detections_per_camera = observation.object_detections_per_camera - for i, camera_name in enumerate(img_names): - draw = ImageDraw.Draw(pil_imgs[i]) - # Annotate with camera name. - font = utils.get_scaled_default_font(draw, 4) - _ = utils.add_text_to_draw_img( - draw, (0, 0), self.camera_name_to_annotation[camera_name], - font) - # Annotate with object detections. - detections = object_detections_per_camera[camera_name] - for obj_id, seg_bb in detections: - x0, y0, x1, y1 = seg_bb.bounding_box - x0, x1 = sorted([x0, x1]) - y0, y1 = sorted([y0, y1]) - draw.rectangle([(x0, y0), (x1, y1)], outline='green', width=2) - text = f"{obj_id.language_id}" - font = utils.get_scaled_default_font(draw, 3) - text_mask = font.getmask(text) # type: ignore - text_width, text_height = text_mask.size - text_bbox = [(x0, y0 - 1.5 * text_height), - (x0 + text_width + 1, y0)] - draw.rectangle(text_bbox, fill='green') - draw.text((x0 + 1, y0 - 1.5 * text_height), - text, - fill='white', - font=font) - annotated_imgs = list(pil_imgs) + annotated_imgs = annotate_imgs_with_detections( + observation.rgbd_images, observation.object_detections_per_camera) self._gripper_open_percentage = observation.gripper_open_percentage + # If we're trying to record VLM demos, then save the images. 
+        if len(CFG.spot_vlm_teleop_demo_folderpath) > 0:
+            save_annotated_imgs_for_vlm_demo(
+                annotated_imgs, Path(CFG.spot_vlm_teleop_demo_folderpath))
         self._curr_state = self._create_state()
         if observation.executed_skill is not None:
             if "Pick" in observation.executed_skill.extra_info.action_name:
                 for obj in observation.executed_skill.extra_info.\
                     operator_objects:
-                    if not obj.is_instance(_robot_type):
+                    if obj.is_instance(_movable_object_type):
                         # Turn the held feature on
                         self._curr_state.set(obj, "held", 1.0)
             if "Place" in observation.executed_skill.extra_info.action_name:
@@ -842,7 +1017,7 @@ def _create_state(self) -> State:
                     "in_view": 0,
                     "is_sweeper": 0,
                 })
-            elif obj.type.name == "immovable":
+            elif obj.type.name in ["immovable", "table"]:
                 state_dict[obj].update({"flat_top_surface": 1})
             else:
                 raise ValueError(
diff --git a/predicators/planning.py b/predicators/planning.py
index c578c1dfd9..36e20bb3cd 100644
--- a/predicators/planning.py
+++ b/predicators/planning.py
@@ -1197,7 +1197,6 @@ def run_task_plan_once(
     init_atoms = utils.abstract(task.init, preds)
     goal = task.goal
     objects = set(task.init)
-    start_time = time.perf_counter()
 
     if CFG.sesame_task_planner == "astar":
diff --git a/predicators/settings.py b/predicators/settings.py
index 68fe316ac0..4310b6c28c 100644
--- a/predicators/settings.py
+++ b/predicators/settings.py
@@ -186,6 +186,7 @@ class GlobalSettings:
     spot_run_dry = False
     spot_use_perfect_samplers = False  # for debugging
     spot_sweep_env_goal_description = "get the objects into the bucket"
+    spot_vlm_teleop_demo_folderpath = ""
 
     # pddl blocks env parameters
     pddl_blocks_procedural_train_min_num_blocks = 3
@@ -435,7 +436,7 @@ class 
GlobalSettings: override_json_with_input = False # Only works with SpotEnv for now # parameters for vision language models - # gemini-1.5-pro-latest, gemini-1.5-pro-flash gpt-4-turbo, gpt-4o + # gemini-1.5-pro, gemini-1.5-flash, gpt-4-turbo, gpt-4o # NOTE: we need to create a dummy vlm so that tests on CI pass. vlm_model_name = "dummy" vlm_temperature = 0.0 diff --git a/predicators/spot_utils/README.md b/predicators/spot_utils/README.md index 45e743651e..6524673f7a 100644 --- a/predicators/spot_utils/README.md +++ b/predicators/spot_utils/README.md @@ -25,7 +25,7 @@ python predicators/main.py --env --approach "spot_wrapper[oracle]" -- python predicators/main.py --spot_robot_ip 192.168.80.3 --spot_graph_nav_map b45-621 --env lis_spot_block_floor_env --approach spot_wrapper[oracle] --bilevel_plan_without_sim True --seed 0 # an example to run LIS spot without a map -python predicators/main.py --env spot_vlm_cup_table_env --approach "spot_wrapper[oracle]" --seed 0 --num_train_tasks 0 --num_test_tasks 1 --spot_robot_ip 192.168.80.3 --perceiver spot_minimal_perceiver --bilevel_plan_without_sim True --vlm_test_time_atom_label_prompt_type img_option_diffs_label_history --vlm_model_name gpt-4o --execution_monitor expected_atoms +python predicators/main.py --env spot_vlm_simple_cup_table_env --approach "spot_wrapper[oracle]" --seed 0 --num_train_tasks 0 --num_test_tasks 1 --spot_robot_ip 192.168.80.3 --perceiver spot_minimal_perceiver --bilevel_plan_without_sim True --vlm_test_time_atom_label_prompt_type img_option_diffs_label_history --vlm_model_name gpt-4o --execution_monitor expected_atoms ``` ### Implement Your Task diff --git a/predicators/spot_utils/graph_nav_maps/b45-621/metadata.yaml b/predicators/spot_utils/graph_nav_maps/b45-621/metadata.yaml index cb37e2d951..404dcef608 100644 --- a/predicators/spot_utils/graph_nav_maps/b45-621/metadata.yaml +++ b/predicators/spot_utils/graph_nav_maps/b45-621/metadata.yaml @@ -46,4 +46,45 @@ static-object-features: length: 0.1 
width: 0.1 placeable: 1 - is_sweeper: 0 \ No newline at end of file + is_sweeper: 0 + yellow_toy_cup: + shape: 2 + height: 0.1 + length: 0.1 + width: 0.1 + placeable: 1 + is_sweeper: 0 + small_cardboard_box_with_black_tape: + shape: 1 # cuboid + height: 0.55 + length: 0.28 + width: 0.6 + flat_top_surface: 1 + apple: + shape: 2 + height: 0.1 + length: 0.1 + width: 0.1 + placeable: 1 + is_sweeper: 0 + clear_plastic_trash_can: + shape: 2 + height: 0.1 + length: 0.2 + width: 0.2 + placeable: 1 + is_sweeper: 0 + flat_top_surface: 0 + fluffy_toy_duster: + shape: 1 + height: 0.1 + length: 0.1 + width: 0.1 + placeable: 1 + is_sweeper: 0 + table: + shape: 1 + height: 0.1 + length: 0.3 + width: 0.5 + flat_top_surface: 1 \ No newline at end of file diff --git a/predicators/spot_utils/perception/object_detection.py b/predicators/spot_utils/perception/object_detection.py index 388cf61845..c2e158a9e7 100644 --- a/predicators/spot_utils/perception/object_detection.py +++ b/predicators/spot_utils/perception/object_detection.py @@ -244,6 +244,7 @@ def _get_detection_score(img_detections: Dict[str, SegmentedBoundingBox], f"{obj_id} because it's out of bounds. " + \ f"(pose = {pose_xy})") continue + # Pose extraction succeeded. detections[obj_id] = pose break diff --git a/predicators/spot_utils/skills/spot_find_objects.py b/predicators/spot_utils/skills/spot_find_objects.py index ce8db3826d..43c42eb7d1 100644 --- a/predicators/spot_utils/skills/spot_find_objects.py +++ b/predicators/spot_utils/skills/spot_find_objects.py @@ -97,12 +97,12 @@ def _find_objects_with_choreographed_moves( return all_detections, all_artifacts # Fail. Analyze the RGBDs if you want (by uncommenting here). 
     # import imageio.v2 as iio
     # for i, rgbds in enumerate(all_rgbds):
     #     for camera, rgbd in rgbds.items():
     #         path = f"init_search_for_objects_angle{i}_{camera}.png"
     #         iio.imsave(path, rgbd.rgb)
     #         print(f"Wrote out to {path}.")
 
     remaining_object_ids = set(object_ids) - set(all_detections)
     raise RuntimeError(f"Could not find objects: {remaining_object_ids}")
diff --git a/predicators/spot_utils/spot_record_option_demo.py b/predicators/spot_utils/spot_record_option_demo.py
new file mode 100644
index 0000000000..6eed642a40
--- /dev/null
+++ b/predicators/spot_utils/spot_record_option_demo.py
@@ -0,0 +1,49 @@
+"""Script to make it easy to record demonstrations from the Spot."""
+
+from predicators.perception.spot_perceiver import save_annotated_imgs_for_vlm_demo, annotate_imgs_with_detections
+from predicators import utils
+from predicators.settings import CFG
+from predicators.spot_utils.utils import verify_estop
+from predicators.spot_utils.perception.spot_cameras import capture_images_without_context
+from pathlib import Path
+
+def main():
+    # Run this file alone to test manually.
+    # Make sure to pass in --spot_robot_ip.
+
+    # pylint: disable=ungrouped-imports
+    import numpy as np
+    from bosdyn.client import create_standard_sdk
+    from bosdyn.client.lease import LeaseClient
+    from bosdyn.client.util import authenticate
+
+    # Put inside a function to avoid variable scoping issues.
+    args = utils.parse_args(env_required=False,
+                            seed_required=False,
+                            approach_required=False)
+    utils.update_config(args)
+
+    # Get constants.
+    hostname = CFG.spot_robot_ip
+
+    # Instantiate a robot.
+    sdk = create_standard_sdk('MoveHandSkillTestClient')
+    robot = sdk.create_robot(hostname)
+    authenticate(robot)
+    verify_estop(robot)
+    lease_client = robot.ensure_client(LeaseClient.default_service_name)
+    lease_client.take()
+
+    assert len(CFG.spot_vlm_teleop_demo_folderpath) > 0, "Please set the spot_vlm_teleop_demo_folderpath!"
+
+    # Pull all the images from the spot cameras and annotate them
+    # with the camera names.
+    # NOTE: we currently don't run any object detection, though we could.
+    rgbd_images = capture_images_without_context(robot)
+    annotated_imgs = annotate_imgs_with_detections(rgbd_images, {})
+    # Save the images properly.
+    save_annotated_imgs_for_vlm_demo(annotated_imgs, Path(CFG.spot_vlm_teleop_demo_folderpath))
+
+
+if __name__ == '__main__':
+    main()
diff --git a/predicators/spot_utils/utils.py b/predicators/spot_utils/utils.py
index d9e046a013..b3dc3109a6 100644
--- a/predicators/spot_utils/utils.py
+++ b/predicators/spot_utils/utils.py
@@ -96,6 +96,9 @@ class _Spot3DShape(Enum):
 _wrappers_type = Type("wrappers", list(_movable_object_type.feature_names),
                       parent=_movable_object_type)
+_table_type = Type("table",
+                   list(_immovable_object_type.feature_names),
+                   parent=_immovable_object_type)
 
 
 def get_collision_geoms_for_nav(state: State) -> List[_Geom2D]:
diff --git a/predicators/utils.py b/predicators/utils.py
index 2e7c59c383..92d08f1471 100644
--- a/predicators/utils.py
+++ b/predicators/utils.py
@@ -2660,13 +2660,14 @@ def query_vlm_for_atom_vals(
     logging.info(f"VLM output: \n{vlm_output_str}")
     # Parse out stuff.
     if len(label_history) > 0:  # pragma: no cover
-        truth_values = re.findall(r'\* (.*): (True|False)', vlm_output_str)
+        truth_values = re.findall(r'\* (.*): (True|False|Unknown)',
+                                  vlm_output_str)
         for i, (atom_query,
                 pred_label) in enumerate(zip(atom_queries_list, truth_values)):
             pred, label = pred_label
             assert pred in atom_query
             label = label.lower()
-            if label == "true":
+            if "true" in label:
                 true_atoms.add(vlm_atoms[i])
     else:
         all_vlm_responses = vlm_output_str.strip().split("\n")
@@ -2683,7 +2684,7 @@
             # value = curr_vlm_output_line[len(atom_query + ":"):
             #                              period_idx].lower().strip()
             value = curr_vlm_output_line.split(': ')[-1].strip('.').lower()
-            if value == "true":
+            if "true" in value:
                 true_atoms.add(vlm_atoms[i])
     return true_atoms
@@ -2699,7 +2700,11 @@
     try:
         if state.simulator_state is not None and "abstract_state" in \
                 state.simulator_state:  # pragma: no cover
-            return state.simulator_state["abstract_state"]
+            return {
+                atom
+                for atom in state.simulator_state["abstract_state"]
+                if atom.predicate in preds
+            }
     except (AttributeError, TypeError):
         pass
     # Start by pulling out all VLM predicates.