def _get_mouse_position_scaled(self) -> Coordinate:
    """Return the mouse position expressed in the tool's scaled coordinates.

    The position reported by the agent OS is given in real-screen pixels.
    It is mapped into the `self._width` x `self._height` image (scaled and
    padded) via `scale_coordinates_with_padding`, and both components are
    truncated to integers with `int()`.

    Returns:
        Coordinate: The mouse position within the scaled and padded image.

    Raises:
        ValueError: Propagated from `scale_coordinates_with_padding` when
            the mapped position falls outside the padded image area.
    """
    # Query the pointer first, then the resolution, keeping the original
    # order of calls into the agent OS.
    pointer: Coordinate = self._agent_os.get_mouse_position()
    screen_width, screen_height = self._get_real_screen_resolution()
    scaled_x, scaled_y = scale_coordinates_with_padding(
        x=pointer.x,
        y=pointer.y,
        original_width=screen_width,
        original_height=screen_height,
        max_width=self._width,
        max_height=self._height,
    )
    return Coordinate(x=int(scaled_x), y=int(scaled_y))
def scale_coordinates_with_padding(
    x: float,
    y: float,
    original_width: int,
    original_height: int,
    max_width: int,
    max_height: int,
) -> Tuple[float, float]:
    """Convert coordinates from an original image to a scaled and padded image.

    This function takes coordinates from the original image and calculates
    their corresponding position in an image that has been scaled
    (preserving the aspect ratio) and centered with padding to fit within
    `max_width` and `max_height`.

    Args:
        x (float): The x-coordinate in the original image.
        y (float): The y-coordinate in the original image.
        original_width (int): The width of the original image.
        original_height (int): The height of the original image.
        max_width (int): The maximum width of the output scaled and padded image.
        max_height (int): The maximum height of the output scaled and padded image.

    Returns:
        Tuple[float, float]: A tuple of (scaled_x, scaled_y) coordinates
            in the padded image.

    Raises:
        ValueError: If any of the given dimensions is not positive, or if
            the resulting coordinates are outside the padded image area.
    """
    # Reject degenerate sizes up front: a zero dimension would otherwise
    # surface as an unexplained ZeroDivisionError in the ratios below, and
    # negative dimensions would yield meaningless results.
    if min(original_width, original_height, max_width, max_height) <= 0:
        error_msg = "Image dimensions must be positive"
        raise ValueError(error_msg)

    aspect_ratio = original_width / original_height
    if (max_width / max_height) > aspect_ratio:
        # Target is relatively wider than the source: height is the limiting
        # side, so the scaled image is padded on the left and right.
        scale_factor = max_height / original_height
        scaled_width = int(original_width * scale_factor)
        scaled_height = max_height
    else:
        # Width is the limiting side: the scaled image is padded on the top
        # and bottom.
        scale_factor = max_width / original_width
        scaled_width = max_width
        scaled_height = int(original_height * scale_factor)

    # The scaled image is centered, so half of the spare space sits in front
    # of it on each axis.
    pad_left = (max_width - scaled_width) // 2
    pad_top = (max_height - scaled_height) // 2

    scaled_x = x * scale_factor + pad_left
    scaled_y = y * scale_factor + pad_top

    if scaled_x < 0 or scaled_y < 0 or scaled_x > max_width or scaled_y > max_height:
        error_msg = "Coordinates are outside the padded image area"
        raise ValueError(error_msg)

    return scaled_x, scaled_y