From 454e72ac7d4dd12e41a22545f93c693423c346d6 Mon Sep 17 00:00:00 2001 From: Samir mlika <105347215+mlikasam-askui@users.noreply.github.com> Date: Thu, 14 Aug 2025 11:48:34 +0200 Subject: [PATCH 1/3] Feat: Add locate all function --- README.md | 4 +- src/askui/agent.py | 2 +- src/askui/agent_base.py | 60 ++++++++++++++++++---- src/askui/android_agent.py | 2 +- src/askui/models/anthropic/messages_api.py | 16 +++--- src/askui/models/askui/inference_api.py | 21 ++++---- src/askui/models/askui/model_router.py | 4 +- src/askui/models/huggingface/spaces_api.py | 8 +-- src/askui/models/model_router.py | 2 +- src/askui/models/models.py | 6 +-- src/askui/models/shared/facade.py | 2 +- src/askui/models/ui_tars_ep/ui_tars_api.py | 4 +- tests/integration/agent/test_retry.py | 4 +- tests/integration/test_custom_models.py | 4 +- 14 files changed, 93 insertions(+), 46 deletions(-) diff --git a/README.md b/README.md index 862ab362..74636394 100644 --- a/README.md +++ b/README.md @@ -367,12 +367,12 @@ class MyGetAndLocateModel(GetModel, LocateModel): locator: str | Locator, image: ImageSource, model_choice: ModelComposition | str, - ) -> Point: + ) -> list[Point]: # Implement custom locate logic, e.g.: # - Use a different object detection model # - Implement custom element finding # - Call external vision services - return (100, 100) # Example coordinates + return [(100, 100)] # Example coordinates # Create model registry diff --git a/src/askui/agent.py b/src/askui/agent.py index 98fb82af..a0fa569a 100644 --- a/src/askui/agent.py +++ b/src/askui/agent.py @@ -182,7 +182,7 @@ def _click( def _mouse_move( self, locator: str | Locator, model: ModelComposition | str | None = None ) -> None: - point = self._locate(locator=locator, model=model) + point = self._locate(locator=locator, model=model)[0] self.tools.os.mouse_move(point[0], point[1]) @telemetry.record_call(exclude={"locator"}) diff --git a/src/askui/agent_base.py b/src/askui/agent_base.py index a98a4679..41ab429c 100644 --- a/src/askui/agent_base.py +++ b/src/askui/agent_base.py @@ -326,8 +326,8 @@ def _locate( locator: str | Locator, screenshot: Optional[Img] = None, model: ModelComposition | str | None = None, - ) -> Point: - def locate_with_screenshot() -> Point: + ) -> list[Point]: + def locate_with_screenshot() -> list[Point]: _screenshot = ImageSource( self._agent_os.screenshot() if screenshot is None else screenshot ) @@ -337,10 +337,10 @@ def locate_with_screenshot() -> Point: model_choice=model or self._model_choice["locate"], ) - point = self._retry.attempt(locate_with_screenshot) - self._reporter.add_message("ModelRouter", f"locate: ({point[0]}, {point[1]})") - logger.debug("ModelRouter locate: (%d, %d)", point[0], point[1]) - return point + points = self._retry.attempt(locate_with_screenshot) + self._reporter.add_message("ModelRouter", f"locate {len(points)} elements") + logger.debug("ModelRouter locate: %d elements", len(points)) + return points @telemetry.record_call(exclude={"locator", "screenshot"}) @validate_call(config=ConfigDict(arbitrary_types_allowed=True)) @@ -351,7 +351,7 @@ def locate( model: ModelComposition | str | None = None, ) -> Point: """ - Locates the UI element identified by the provided locator. + Locates the first matching UI element identified by the provided locator. Args: locator (str | Locator): The identifier or description of the element to @@ -374,8 +374,50 @@ def locate( print(f"Element found at coordinates: {point}") ``` """ - self._reporter.add_message("User", f"locate {locator}") - logger.debug("VisionAgent received instruction to locate %s", locator) + self._reporter.add_message("User", f"locate first matching element {locator}") + logger.debug( + "VisionAgent received instruction to locate first matching element %s", + locator, + ) + return self._locate(locator, screenshot, model)[0] + + @telemetry.record_call(exclude={"locator", "screenshot"}) + @validate_call(config=ConfigDict(arbitrary_types_allowed=True)) + def locate_all( + self, + locator: str | Locator, + screenshot: Optional[Img] = None, + model: ModelComposition | str | None = None, + ) -> list[Point]: + """ + Locates all matching UI elements identified by the provided locator. + + Args: + locator (str | Locator): The identifier or description of the element to + locate. + screenshot (Img | None, optional): The screenshot to use for locating the + element. Can be a path to an image file, a PIL Image object or a data + URL. If `None`, takes a screenshot of the currently selected display. + model (ModelComposition | str | None, optional): The composition or name + of the model(s) to be used for locating the element using the `locator`. + + Returns: + list[Point]: The coordinates of the elements as a list of tuples (x, y). + + Example: + ```python + from askui import VisionAgent + + with VisionAgent() as agent: + points = agent.locate_all("Submit button") + print(f"Found {len(points)} elements at coordinates: {points}") + ``` + """ + self._reporter.add_message("User", f"locate all matching UI elements {locator}") + logger.debug( + "VisionAgent received instruction to locate all matching UI elements %s", + locator, + ) return self._locate(locator, screenshot, model) @telemetry.record_call() diff --git a/src/askui/android_agent.py b/src/askui/android_agent.py index 6fcb6d35..6834f88b 100644 --- a/src/askui/android_agent.py +++ b/src/askui/android_agent.py @@ -198,7 +198,7 @@ def tap( msg += f" on {target}" self._reporter.add_message("User", msg) logger.debug("VisionAgent received instruction to click on %s", target) - point = self._locate(locator=target, model=model) + point = self._locate(locator=target, model=model)[0] self.os.tap(point[0], point[1]) @telemetry.record_call(exclude={"text"}) diff --git a/src/askui/models/anthropic/messages_api.py b/src/askui/models/anthropic/messages_api.py index bfb8704a..c5f40b99 100644 --- a/src/askui/models/anthropic/messages_api.py +++ b/src/askui/models/anthropic/messages_api.py @@ -200,7 +200,7 @@ def locate( locator: str | Locator, image: ImageSource, model_choice: ModelComposition | str, - ) -> Point: + ) -> list[Point]: if not isinstance(model_choice, str): error_msg = "Model composition is not supported for Claude" raise NotImplementedError(error_msg) @@ -221,12 +221,14 @@ def locate( ), model_choice=model_choice, ) - return scale_coordinates( - extract_click_coordinates(content), - image.root.size, - self._settings.resolution, - inverse=True, - ) + return [ + scale_coordinates( + extract_click_coordinates(content), + image.root.size, + self._settings.resolution, + inverse=True, + ) + ] except ( _UnexpectedResponseError, ValueError, diff --git a/src/askui/models/askui/inference_api.py b/src/askui/models/askui/inference_api.py index d40c5150..e706172c 100644 --- a/src/askui/models/askui/inference_api.py +++ b/src/askui/models/askui/inference_api.py @@ -160,7 +160,7 @@ def locate( locator: str | Locator, image: ImageSource, model_choice: ModelComposition | str, - ) -> Point: + ) -> list[Point]: serialized_locator = ( self._locator_serializer.serialize(locator=locator) if isinstance(locator, Locator) @@ -169,7 +169,7 @@ def locate( logger.debug(f"serialized_locator:\n{json_lib.dumps(serialized_locator)}") json: dict[str, Any] = { "image": image.to_data_url(), - "instruction": f"Click on {serialized_locator['instruction']}", + "instruction": f"get element {serialized_locator['instruction']}", } if "customElements" in serialized_locator: json["customElements"] = serialized_locator["customElements"] @@ -180,17 +180,20 @@ def locate( ) response = self._post(path="/inference", json=json) content = response.json() - assert content["type"] == "COMMANDS", ( + assert content["type"] == "DETECTED_ELEMENTS", ( f"Received unknown content type {content['type']}" ) - actions = [ - el for el in content["data"]["actions"] if el["inputEvent"] == "MOUSE_MOVE" - ] - if len(actions) == 0: + detected_elements = content["data"]["detected_elements"] + if len(detected_elements) == 0: raise ElementNotFoundError(locator, serialized_locator) - position = actions[0]["position"] - return int(position["x"]), int(position["y"]) + return [ + ( + int((element["bndbox"]["xmax"] + element["bndbox"]["xmin"]) / 2), + int((element["bndbox"]["ymax"] + element["bndbox"]["ymin"]) / 2), + ) + for element in detected_elements + ] @override def get( diff --git a/src/askui/models/askui/model_router.py b/src/askui/models/askui/model_router.py index d2bf857f..6998003a 100644 --- a/src/askui/models/askui/model_router.py +++ b/src/askui/models/askui/model_router.py @@ -18,7 +18,7 @@ def __init__(self, inference_api: AskUiInferenceApi): def _locate_with_askui_ocr( self, screenshot: ImageSource, locator: str | Text - ) -> Point: + ) -> list[Point]: locator = Text(locator) if isinstance(locator, str) else locator return self._inference_api.locate( locator, screenshot, model_choice=ModelName.ASKUI__OCR @@ -30,7 +30,7 @@ def locate( locator: str | Locator, image: ImageSource, model_choice: ModelComposition | str, - ) -> Point: + ) -> list[Point]: if ( isinstance(model_choice, ModelComposition) or model_choice == ModelName.ASKUI diff --git a/src/askui/models/huggingface/spaces_api.py b/src/askui/models/huggingface/spaces_api.py index a12d37bd..81ac7775 100644 --- a/src/askui/models/huggingface/spaces_api.py +++ b/src/askui/models/huggingface/spaces_api.py @@ -65,7 +65,7 @@ def locate( locator: str | Locator, image: ImageSource, model_choice: ModelComposition | str, - ) -> Point: + ) -> list[Point]: """Predict element location using Hugging Face Spaces.""" if not isinstance(model_choice, str): error_msg = "Model composition is not supported for Hugging Face Spaces" @@ -76,9 +76,9 @@ def locate( if isinstance(locator, Locator) else locator ) - return self._spaces[model_choice]( - image.root, serialized_locator, model_choice - ) + return [ + self._spaces[model_choice](image.root, serialized_locator, model_choice) + ] except (ValueError, json.JSONDecodeError, httpx.HTTPError) as e: error_msg = f"Hugging Face Spaces Exception: {e}" raise AutomationError(error_msg) from e diff --git a/src/askui/models/model_router.py b/src/askui/models/model_router.py index 457ec10d..9ca6877a 100644 --- a/src/askui/models/model_router.py +++ b/src/askui/models/model_router.py @@ -212,7 +212,7 @@ def locate( screenshot: ImageSource, locator: str | Locator, model_choice: ModelComposition | str, - ) -> Point: + ) -> list[Point]: _model_choice = ( ModelName.ASKUI if isinstance(model_choice, ModelComposition) diff --git a/src/askui/models/models.py b/src/askui/models/models.py index 5bd65ddc..a89eb6d3 100644 --- a/src/askui/models/models.py +++ b/src/askui/models/models.py @@ -303,9 +303,9 @@ def locate( locator: str | Locator, image: ImageSource, model_choice: ModelComposition | str, - ) -> Point: + ) -> list[Point]: # Implement custom locate logic - return (100, 100) + return [(100, 100)] with VisionAgent(models={"my-locate": MyLocateModel()}) as agent: agent.click("button", model="my-locate") @@ -318,7 +318,7 @@ def locate( locator: str | Locator, image: ImageSource, model_choice: ModelComposition | str, - ) -> Point: + ) -> list[Point]: """Find the coordinates of a UI element in an image. Args: diff --git a/src/askui/models/shared/facade.py b/src/askui/models/shared/facade.py index 9789f3b1..cb8b4ec2 100644 --- a/src/askui/models/shared/facade.py +++ b/src/askui/models/shared/facade.py @@ -56,5 +56,5 @@ def locate( locator: str | Locator, image: ImageSource, model_choice: ModelComposition | str, - ) -> Point: + ) -> list[Point]: return self._locate_model.locate(locator, image, model_choice) diff --git a/src/askui/models/ui_tars_ep/ui_tars_api.py b/src/askui/models/ui_tars_ep/ui_tars_api.py index 4c2c84ee..78e38714 100644 --- a/src/askui/models/ui_tars_ep/ui_tars_api.py +++ b/src/askui/models/ui_tars_ep/ui_tars_api.py @@ -146,7 +146,7 @@ def locate( locator: str | Locator, image: ImageSource, model_choice: ModelComposition | str, - ) -> Point: + ) -> list[Point]: if not isinstance(model_choice, str): error_msg = "Model composition is not supported for UI-TARS" raise NotImplementedError(error_msg) @@ -169,7 +169,7 @@ def locate( width, height = image.root.size new_height, new_width = smart_resize(height, width) x, y = (int(x / new_width * width), int(y / new_height * height)) - return x, y + return [(x, y)] raise ElementNotFoundError(locator, locator_serialized) @override diff --git a/tests/integration/agent/test_retry.py b/tests/integration/agent/test_retry.py index 76f1dd67..44aaffb6 100644 --- a/tests/integration/agent/test_retry.py +++ b/tests/integration/agent/test_retry.py @@ -27,11 +27,11 @@ def locate( locator: Union[str, Locator], image: ImageSource, # noqa: ARG002 model_choice: Union[ModelComposition, str], # noqa: ARG002 - ) -> Tuple[int, int]: + ) -> list[Tuple[int, int]]: self.calls += 1 if self.calls <= self.fail_times: raise ElementNotFoundError(locator, locator) - return self.succeed_point + return [self.succeed_point] @pytest.fixture diff --git a/tests/integration/test_custom_models.py b/tests/integration/test_custom_models.py index c1991b49..7347f551 100644 --- a/tests/integration/test_custom_models.py +++ b/tests/integration/test_custom_models.py @@ -93,11 +93,11 @@ def locate( locator: str | Locator, image: ImageSource, model_choice: ModelComposition | str, - ) -> Point: + ) -> list[Point]: self.locators.append(locator) self.images.append(image) self.model_choices.append(model_choice) - return self._point + return [self._point] class SimpleResponseSchema(ResponseSchemaBase): From 748b9870145f27f1ed1b74f0abe37285714f131d Mon Sep 17 00:00:00 2001 From: Samir Mlika Date: Fri, 15 Aug 2025 10:37:23 +0200 Subject: [PATCH 2/3] implement review remarks --- README.md | 2 +- src/askui/__init__.py | 2 ++ src/askui/agent_base.py | 13 +++++++++---- src/askui/models/__init__.py | 2 ++ src/askui/models/anthropic/messages_api.py | 4 ++-- src/askui/models/askui/inference_api.py | 4 ++-- src/askui/models/askui/model_router.py | 6 +++--- src/askui/models/huggingface/spaces_api.py | 4 ++-- src/askui/models/model_router.py | 4 ++-- src/askui/models/models.py | 13 +++++++++---- src/askui/models/shared/facade.py | 10 ++++++++-- src/askui/models/ui_tars_ep/ui_tars_api.py | 10 ++++++++-- tests/integration/test_custom_models.py | 3 ++- 13 files changed, 52 insertions(+), 25 deletions(-) diff --git a/README.md b/README.md index 74636394..e1bd7b4d 100644 --- a/README.md +++ b/README.md @@ -367,7 +367,7 @@ class MyGetAndLocateModel(GetModel, LocateModel): locator: str | Locator, image: ImageSource, model_choice: ModelComposition | str, - ) -> list[Point]: + ) -> PointList: # Implement custom locate logic, e.g.: # - Use a different object detection model # - Implement custom element finding diff --git a/src/askui/__init__.py b/src/askui/__init__.py index 393c72f5..d9eb4be5 100644 --- a/src/askui/__init__.py +++ b/src/askui/__init__.py @@ -25,6 +25,7 @@ OnMessageCb, OnMessageCbParam, Point, + PointList, TextBlockParam, TextCitationParam, ToolResultBlockParam, @@ -82,6 +83,7 @@ "OnMessageCbParam", "PcKey", "Point", + "PointList", "ResponseSchema", "ResponseSchemaBase", "Retry", diff --git a/src/askui/agent_base.py b/src/askui/agent_base.py index 41ab429c..af8f3379 100644 --- a/src/askui/agent_base.py +++ b/src/askui/agent_base.py @@ -26,6 +26,7 @@ ModelName, ModelRegistry, Point, + PointList, TotalModelChoice, ) from .models.types.response_schemas import ResponseSchema @@ -321,13 +322,14 @@ class LinkedListNode(ResponseSchemaBase): self._reporter.add_message("Agent", message_content) return response + @validate_call(config=ConfigDict(arbitrary_types_allowed=True)) def _locate( self, locator: str | Locator, screenshot: Optional[Img] = None, model: ModelComposition | str | None = None, - ) -> list[Point]: - def locate_with_screenshot() -> list[Point]: + ) -> PointList: + def locate_with_screenshot() -> PointList: _screenshot = ImageSource( self._agent_os.screenshot() if screenshot is None else screenshot ) @@ -388,10 +390,13 @@ def locate_all( locator: str | Locator, screenshot: Optional[Img] = None, model: ModelComposition | str | None = None, - ) -> list[Point]: + ) -> PointList: """ Locates all matching UI elements identified by the provided locator. + Note: Some LocateModels can only locate a single element. In this case, the + returned list will have a length of 1. + Args: locator (str | Locator): The identifier or description of the element to locate. @@ -402,7 +407,7 @@ def locate_all( of the model(s) to be used for locating the element using the `locator`. Returns: - list[Point]: The coordinates of the elements as a list of tuples (x, y). + PointList: The coordinates of the elements as a list of tuples (x, y). Example: ```python diff --git a/src/askui/models/__init__.py b/src/askui/models/__init__.py index 3de6de65..f496d769 100644 --- a/src/askui/models/__init__.py +++ b/src/askui/models/__init__.py @@ -9,6 +9,7 @@ ModelName, ModelRegistry, Point, + PointList, ) from .openrouter.model import OpenRouterModel from .openrouter.settings import ChatCompletionsCreateSettings, OpenRouterSettings @@ -53,6 +54,7 @@ "OpenRouterModel", "OpenRouterSettings", "Point", + "PointList", "TextBlockParam", "TextCitationParam", "ToolResultBlockParam", diff --git a/src/askui/models/anthropic/messages_api.py b/src/askui/models/anthropic/messages_api.py index c5f40b99..fcb93dad 100644 --- a/src/askui/models/anthropic/messages_api.py +++ b/src/askui/models/anthropic/messages_api.py @@ -27,7 +27,7 @@ LocateModel, ModelComposition, ModelName, - Point, + PointList, ) from askui.models.shared.agent_message_param import ( Base64ImageSourceParam, @@ -200,7 +200,7 @@ def locate( locator: str | Locator, image: ImageSource, model_choice: ModelComposition | str, - ) -> list[Point]: + ) -> PointList: if not isinstance(model_choice, str): error_msg = "Model composition is not supported for Claude" raise NotImplementedError(error_msg) diff --git a/src/askui/models/askui/inference_api.py b/src/askui/models/askui/inference_api.py index e706172c..a967172e 100644 --- a/src/askui/models/askui/inference_api.py +++ b/src/askui/models/askui/inference_api.py @@ -20,7 +20,7 @@ from askui.locators.serializers import AskUiLocatorSerializer, AskUiSerializedLocator from askui.logger import logger from askui.models.exceptions import ElementNotFoundError -from askui.models.models import GetModel, LocateModel, ModelComposition, Point +from askui.models.models import GetModel, LocateModel, ModelComposition, PointList from askui.models.shared.agent_message_param import MessageParam from askui.models.shared.messages_api import MessagesApi from askui.models.shared.settings import MessageSettings @@ -160,7 +160,7 @@ def locate( locator: str | Locator, image: ImageSource, model_choice: ModelComposition | str, - ) -> list[Point]: + ) -> PointList: serialized_locator = ( self._locator_serializer.serialize(locator=locator) if isinstance(locator, Locator) diff --git a/src/askui/models/askui/model_router.py b/src/askui/models/askui/model_router.py index 6998003a..0dc7076c 100644 --- a/src/askui/models/askui/model_router.py +++ b/src/askui/models/askui/model_router.py @@ -8,7 +8,7 @@ ElementNotFoundError, ModelNotFoundError, ) -from askui.models.models import LocateModel, ModelComposition, ModelName, Point +from askui.models.models import LocateModel, ModelComposition, ModelName, PointList from askui.utils.image_utils import ImageSource @@ -18,7 +18,7 @@ def __init__(self, inference_api: AskUiInferenceApi): def _locate_with_askui_ocr( self, screenshot: ImageSource, locator: str | Text - ) -> list[Point]: + ) -> PointList: locator = Text(locator) if isinstance(locator, str) else locator return self._inference_api.locate( locator, screenshot, model_choice=ModelName.ASKUI__OCR @@ -30,7 +30,7 @@ def locate( locator: str | Locator, image: ImageSource, model_choice: ModelComposition | str, - ) -> list[Point]: + ) -> PointList: if ( isinstance(model_choice, ModelComposition) or model_choice == ModelName.ASKUI diff --git a/src/askui/models/huggingface/spaces_api.py b/src/askui/models/huggingface/spaces_api.py index 81ac7775..eedef8c1 100644 --- a/src/askui/models/huggingface/spaces_api.py +++ b/src/askui/models/huggingface/spaces_api.py @@ -10,7 +10,7 @@ from askui.exceptions import AutomationError from askui.locators.locators import Locator from askui.locators.serializers import VlmLocatorSerializer -from askui.models.models import LocateModel, ModelComposition, ModelName, Point +from askui.models.models import LocateModel, ModelComposition, ModelName, PointList from askui.utils.image_utils import ImageSource @@ -65,7 +65,7 @@ def locate( locator: str | Locator, image: ImageSource, model_choice: ModelComposition | str, - ) -> list[Point]: + ) -> PointList: """Predict element location using Hugging Face Spaces.""" if not isinstance(model_choice, str): error_msg = "Model composition is not supported for Hugging Face Spaces" diff --git a/src/askui/models/model_router.py b/src/askui/models/model_router.py index 9ca6877a..b91be94d 100644 --- a/src/askui/models/model_router.py +++ b/src/askui/models/model_router.py @@ -21,7 +21,7 @@ ModelComposition, ModelName, ModelRegistry, - Point, + PointList, ) from askui.models.shared.agent import Agent from askui.models.shared.agent_message_param import MessageParam @@ -212,7 +212,7 @@ def locate( screenshot: ImageSource, locator: str | Locator, model_choice: ModelComposition | str, - ) -> list[Point]: + ) -> PointList: _model_choice = ( ModelName.ASKUI if isinstance(model_choice, ModelComposition) diff --git a/src/askui/models/models.py b/src/askui/models/models.py index a89eb6d3..22f7256b 100644 --- a/src/askui/models/models.py +++ b/src/askui/models/models.py @@ -147,6 +147,11 @@ def __getitem__(self, index: int) -> ModelDefinition: A tuple of two integers representing the coordinates of a point on the screen. """ +PointList = Annotated[list[Point], Field(min_length=1)] +""" +A list of points representing the coordinates of elements on the screen. +""" + class ActModel(abc.ABC): """Abstract base class for models that can execute autonomous actions. @@ -294,7 +299,7 @@ class LocateModel(abc.ABC): Example: ```python - from askui import LocateModel, VisionAgent, Locator, ImageSource, Point + from askui import LocateModel, VisionAgent, Locator, ImageSource, PointList from askui.models import ModelComposition class MyLocateModel(LocateModel): @@ -303,7 +308,7 @@ def locate( locator: str | Locator, image: ImageSource, model_choice: ModelComposition | str, - ) -> list[Point]: + ) -> PointList: # Implement custom locate logic return [(100, 100)] @@ -318,7 +323,7 @@ def locate( locator: str | Locator, image: ImageSource, model_choice: ModelComposition | str, - ) -> list[Point]: + ) -> PointList: """Find the coordinates of a UI element in an image. Args: @@ -329,7 +334,7 @@ def locate( `ModelComposition` for models that support composition Returns: - A tuple of (x, y) coordinates where the element was found + A list of (x, y) coordinates where the element was found, minimum length 1 """ raise NotImplementedError diff --git a/src/askui/models/shared/facade.py b/src/askui/models/shared/facade.py index cb8b4ec2..655f0c43 100644 --- a/src/askui/models/shared/facade.py +++ b/src/askui/models/shared/facade.py @@ -3,7 +3,13 @@ from typing_extensions import override from askui.locators.locators import Locator -from askui.models.models import ActModel, GetModel, LocateModel, ModelComposition, Point +from askui.models.models import ( + ActModel, + GetModel, + LocateModel, + ModelComposition, + PointList, +) from askui.models.shared.agent_message_param import MessageParam from askui.models.shared.agent_on_message_cb import OnMessageCb from askui.models.shared.settings import ActSettings @@ -56,5 +62,5 @@ def locate( locator: str | Locator, image: ImageSource, model_choice: ModelComposition | str, - ) -> list[Point]: + ) -> PointList: return self._locate_model.locate(locator, image, model_choice) diff --git a/src/askui/models/ui_tars_ep/ui_tars_api.py b/src/askui/models/ui_tars_ep/ui_tars_api.py index 78e38714..122a82cb 100644 --- a/src/askui/models/ui_tars_ep/ui_tars_api.py +++ b/src/askui/models/ui_tars_ep/ui_tars_api.py @@ -11,7 +11,13 @@ from askui.locators.locators import Locator from askui.locators.serializers import VlmLocatorSerializer from askui.models.exceptions import ElementNotFoundError, QueryNoResponseError -from askui.models.models import ActModel, GetModel, LocateModel, ModelComposition, Point +from askui.models.models import ( + ActModel, + GetModel, + LocateModel, + ModelComposition, + PointList, +) from askui.models.shared.agent_message_param import MessageParam from askui.models.shared.agent_on_message_cb import OnMessageCb from askui.models.shared.settings import ActSettings @@ -146,7 +152,7 @@ def locate( locator: str | Locator, image: ImageSource, model_choice: ModelComposition | str, - ) -> list[Point]: + ) -> PointList: if not isinstance(model_choice, str): error_msg = "Model composition is not supported for UI-TARS" raise NotImplementedError(error_msg) diff --git a/tests/integration/test_custom_models.py b/tests/integration/test_custom_models.py index 7347f551..8f49d09b 100644 --- a/tests/integration/test_custom_models.py +++ b/tests/integration/test_custom_models.py @@ -11,6 +11,7 @@ LocateModel, ModelRegistry, Point, + PointList, ResponseSchema, ResponseSchemaBase, VisionAgent, @@ -93,7 +94,7 @@ def locate( locator: str | Locator, image: ImageSource, model_choice: ModelComposition | str, - ) -> list[Point]: + ) -> PointList: self.locators.append(locator) self.images.append(image) self.model_choices.append(model_choice) From 1d1c2ee59ca85574aa82182ef75b88894d9ac920 Mon Sep 17 00:00:00 2001 From: Samir mlika <105347215+mlikasam-askui@users.noreply.github.com> Date: Fri, 15 Aug 2025 15:33:11 +0200 Subject: [PATCH 3/3] fix type --- src/askui/agent_base.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/askui/agent_base.py b/src/askui/agent_base.py index 656c502f..a270b6f7 100644 --- a/src/askui/agent_base.py +++ b/src/askui/agent_base.py @@ -360,7 +360,7 @@ def _locate( screenshot: Optional[Img] = None, model: ModelComposition | str | None = None, ) -> PointList: - def locate_with_screenshot() -> Point: + def locate_with_screenshot() -> PointList: _screenshot = load_image_source( self._agent_os.screenshot() if screenshot is None else screenshot )