From a2223701c29080caca2b48fb800a10372707bb81 Mon Sep 17 00:00:00 2001 From: danyalxahid-askui Date: Thu, 24 Jul 2025 21:19:53 +0200 Subject: [PATCH 1/2] refactor: enhance ComputerToolBase functionality and add coordinate scaling - Added `_get_mouse_position_scaled` method to `ComputerToolBase` for retrieving and scaling mouse position. - Updated `action` method to return scaled mouse position. - Introduced `scale_coordinates_with_padding` function in `image_utils.py` for scaling coordinates with padding. - Cleaned up import statements for better organization and readability. --- src/askui/tools/computer.py | 23 +++++++++----- src/askui/utils/image_utils.py | 58 +++++++++++++++++++++++++++++++++- 2 files changed, 72 insertions(+), 9 deletions(-) diff --git a/src/askui/tools/computer.py b/src/askui/tools/computer.py index 8289b591..70ed4260 100644 --- a/src/askui/tools/computer.py +++ b/src/askui/tools/computer.py @@ -4,16 +4,16 @@ from dataclasses import dataclass from typing import Annotated, Literal, TypedDict, cast, get_args -from anthropic.types.beta import ( - BetaToolComputerUse20241022Param, - BetaToolComputerUse20250124Param, -) +from anthropic.types.beta import (BetaToolComputerUse20241022Param, + BetaToolComputerUse20250124Param) from PIL import Image from pydantic import Field, validate_call from typing_extensions import Self, override -from askui.tools.agent_os import AgentOs, ModifierKey, PcKey -from askui.utils.image_utils import scale_coordinates_back, scale_image_with_padding +from askui.tools.agent_os import AgentOs, Coordinate, ModifierKey, PcKey +from askui.utils.image_utils import (scale_coordinates_back, + scale_coordinates_with_padding, + scale_image_with_padding) from ..models.shared.tools import InputSchema, Tool @@ -223,10 +223,10 @@ def __call__( # noqa: C901 text: str | None = None, coordinate: tuple[Annotated[int, Field(ge=0)], Annotated[int, Field(ge=0)]] | None = None, - ) -> Image.Image | None: + ) -> Image.Image | None | Coordinate: match action: case "cursor_position": - raise ActionNotImplementedError(action, self.name) + return self._get_mouse_position_scaled() case "double_click": return self._agent_os.click("left", 2) case "key": @@ -326,6 +326,13 @@ def _screenshot(self) -> Image.Image: return scale_image_with_padding(screenshot, self._width, self._height) + def _get_mouse_position_scaled(self) -> Coordinate: + mouse_position: Coordinate = self._agent_os.get_mouse_position() + real_screen_width, real_screen_height = self._get_real_screen_resolution() + x, y = scale_coordinates_with_padding(mouse_position.x, mouse_position.y, real_screen_width, real_screen_height, self._width, self._height) + return Coordinate(x=int(x), y=int(y)) + + class Computer20241022Tool(ComputerToolBase): type: Literal["computer_20241022"] = "computer_20241022" diff --git a/src/askui/utils/image_utils.py b/src/askui/utils/image_utils.py index c2647f76..6caa7881 100644 --- a/src/askui/utils/image_utils.py +++ b/src/askui/utils/image_utils.py @@ -7,8 +7,9 @@ from pathlib import Path from typing import Any, Literal, Tuple, Union -from PIL import Image, ImageDraw, ImageOps, UnidentifiedImageError +from PIL import Image from PIL import Image as PILImage +from PIL import ImageDraw, ImageOps, UnidentifiedImageError from pydantic import ConfigDict, RootModel, field_validator # Regex to capture any kind of valid base64 data url (with optional media type and ;base64) @@ -190,6 +191,61 @@ def scale_image_with_padding( ) +def scale_coordinates_with_padding( + x: float, + y: float, + original_width: int, + original_height: int, + max_width: int, + max_height: int, +) -> Tuple[float, float]: + """Convert coordinates from an original image to a scaled and padded image. + + This function takes coordinates from the original image and calculates + their corresponding position in an image that has been scaled and + padded to fit within `max_width` and `max_height`. + + Args: + x (float): The x-coordinate in the original image. + y (float): The y-coordinate in the original image. + original_width (int): The width of the original image. + original_height (int): The height of the original image. + max_width (int): The maximum width of the output scaled and padded image. + max_height (int): The maximum height of the output scaled and padded image. + + Returns: + Tuple[float, float]: A tuple of (scaled_x, scaled_y) coordinates + in the padded image. + """ + aspect_ratio = original_width / original_height + if (max_width / max_height) > aspect_ratio: + scale_factor = max_height / original_height + scaled_width = int(original_width * scale_factor) + scaled_height = max_height + else: + scale_factor = max_width / original_width + scaled_width = max_width + scaled_height = int(original_height * scale_factor) + + pad_left = (max_width - scaled_width) // 2 + pad_top = (max_height - scaled_height) // 2 + + scaled_x = x * scale_factor + pad_left + scaled_y = y * scale_factor + pad_top + + if ( + scaled_x < 0 + or scaled_y < 0 + or scaled_x > max_width + or scaled_y > max_height + ): + error_msg = "Coordinates are outside the padded image area" + raise ValueError(error_msg) + + + return scaled_x, scaled_y + + def scale_coordinates_back( x: float, y: float, From cd4d4e82647b6def867296f9f14ddb67bc18a0ed Mon Sep 17 00:00:00 2001 From: danyalxahid-askui Date: Thu, 24 Jul 2025 21:33:13 +0200 Subject: [PATCH 2/2] refactor: update action method return type to include Coordinate - Modified the return type of the `action` method in `Computer20250124Tool` to include `Coordinate` in addition to `Image.Image | None`, enhancing its functionality to return more comprehensive results. --- src/askui/tools/computer.py | 26 ++++++++++++++++++-------- src/askui/utils/image_utils.py | 11 ++--------- 2 files changed, 20 insertions(+), 17 deletions(-) diff --git a/src/askui/tools/computer.py b/src/askui/tools/computer.py index 70ed4260..08723ad6 100644 --- a/src/askui/tools/computer.py +++ b/src/askui/tools/computer.py @@ -4,16 +4,20 @@ from dataclasses import dataclass from typing import Annotated, Literal, TypedDict, cast, get_args -from anthropic.types.beta import (BetaToolComputerUse20241022Param, - BetaToolComputerUse20250124Param) +from anthropic.types.beta import ( + BetaToolComputerUse20241022Param, + BetaToolComputerUse20250124Param, +) from PIL import Image from pydantic import Field, validate_call from typing_extensions import Self, override from askui.tools.agent_os import AgentOs, Coordinate, ModifierKey, PcKey -from askui.utils.image_utils import (scale_coordinates_back, - scale_coordinates_with_padding, - scale_image_with_padding) +from askui.utils.image_utils import ( + scale_coordinates_back, + scale_coordinates_with_padding, + scale_image_with_padding, +) from ..models.shared.tools import InputSchema, Tool @@ -325,11 +329,17 @@ def _screenshot(self) -> Image.Image: self._real_screen_height = screenshot.height return scale_image_with_padding(screenshot, self._width, self._height) - def _get_mouse_position_scaled(self) -> Coordinate: mouse_position: Coordinate = self._agent_os.get_mouse_position() real_screen_width, real_screen_height = self._get_real_screen_resolution() - x, y = scale_coordinates_with_padding(mouse_position.x, mouse_position.y, real_screen_width, real_screen_height, self._width, self._height) + x, y = scale_coordinates_with_padding( + mouse_position.x, + mouse_position.y, + real_screen_width, + real_screen_height, + self._width, + self._height, + ) return Coordinate(x=int(x), y=int(y)) @@ -433,7 +443,7 @@ def __call__( # noqa: C901 scroll_amount: Annotated[int, Field(ge=0)] | None = None, duration: Annotated[float, Field(ge=0.0, le=100.0)] | None = None, key: str | None = None, # maybe not all keys supported - ) -> Image.Image | None: + ) -> Image.Image | None | Coordinate: match action: case "hold_key": self._hold_key(keystroke=text, duration=duration) # type: ignore[arg-type] diff --git a/src/askui/utils/image_utils.py b/src/askui/utils/image_utils.py index 6caa7881..1cd40a89 100644 --- a/src/askui/utils/image_utils.py +++ b/src/askui/utils/image_utils.py @@ -7,9 +7,8 @@ from pathlib import Path from typing import Any, Literal, Tuple, Union -from PIL import Image +from PIL import Image, ImageDraw, ImageOps, UnidentifiedImageError from PIL import Image as PILImage -from PIL import ImageDraw, ImageOps, UnidentifiedImageError from pydantic import ConfigDict, RootModel, field_validator # Regex to capture any kind of valid base64 data url (with optional media type and ;base64) @@ -233,16 +232,10 @@ def scale_coordinates_with_padding( scaled_x = x * scale_factor + pad_left scaled_y = y * scale_factor + pad_top - if ( - scaled_x < 0 - or scaled_y < 0 - or scaled_x > max_width - or scaled_y > max_height - ): + if scaled_x < 0 or scaled_y < 0 or scaled_x > max_width or scaled_y > max_height: error_msg = "Coordinates are outside the padded image area" raise ValueError(error_msg) - return scaled_x, scaled_y