From 6a4464f3b9e6a5ae93722368d0a8d45b984bde06 Mon Sep 17 00:00:00 2001 From: danyalxahid-askui Date: Fri, 25 Jul 2025 15:22:52 +0200 Subject: [PATCH 1/4] refactor: enhance ComputerToolBase with mouse position scaling and add coordinate scaling utility functions --- src/askui/tools/computer.py | 30 ++++++++++---- src/askui/utils/image_utils.py | 75 +++++++++++++++++++++++++++++----- 2 files changed, 87 insertions(+), 18 deletions(-) diff --git a/src/askui/tools/computer.py b/src/askui/tools/computer.py index 8289b591..ab048e59 100644 --- a/src/askui/tools/computer.py +++ b/src/askui/tools/computer.py @@ -4,16 +4,16 @@ from dataclasses import dataclass from typing import Annotated, Literal, TypedDict, cast, get_args -from anthropic.types.beta import ( - BetaToolComputerUse20241022Param, - BetaToolComputerUse20250124Param, -) +from anthropic.types.beta import (BetaToolComputerUse20241022Param, + BetaToolComputerUse20250124Param) from PIL import Image from pydantic import Field, validate_call from typing_extensions import Self, override -from askui.tools.agent_os import AgentOs, ModifierKey, PcKey -from askui.utils.image_utils import scale_coordinates_back, scale_image_with_padding +from askui.tools.agent_os import AgentOs, Coordinate, ModifierKey, PcKey +from askui.utils.image_utils import (scale_coordinates_back, + scale_coordinates_with_padding, + scale_image_with_padding) from ..models.shared.tools import InputSchema, Tool @@ -223,10 +223,10 @@ def __call__( # noqa: C901 text: str | None = None, coordinate: tuple[Annotated[int, Field(ge=0)], Annotated[int, Field(ge=0)]] | None = None, - ) -> Image.Image | None: + ) -> Image.Image | None | str: match action: case "cursor_position": - raise ActionNotImplementedError(action, self.name) + return self._get_mouse_position_scaled() case "double_click": return self._agent_os.click("left", 2) case "key": @@ -325,6 +325,20 @@ def _screenshot(self) -> Image.Image: self._real_screen_height = screenshot.height return scale_image_with_padding(screenshot, self._width, self._height) + def _get_mouse_position_scaled(self) -> str: + mouse_position: Coordinate = self._agent_os.get_mouse_position() + real_screen_width, real_screen_height = self._get_real_screen_resolution() + x, y = scale_coordinates_with_padding( + mouse_position.x, + mouse_position.y, + real_screen_width, + real_screen_height, + self._width, + self._height, + ) + + return f"X={x},Y={y}" + class Computer20241022Tool(ComputerToolBase): type: Literal["computer_20241022"] = "computer_20241022" diff --git a/src/askui/utils/image_utils.py b/src/askui/utils/image_utils.py index c2647f76..67319911 100644 --- a/src/askui/utils/image_utils.py +++ b/src/askui/utils/image_utils.py @@ -7,8 +7,9 @@ from pathlib import Path from typing import Any, Literal, Tuple, Union -from PIL import Image, ImageDraw, ImageOps, UnidentifiedImageError +from PIL import Image from PIL import Image as PILImage +from PIL import ImageDraw, ImageOps, UnidentifiedImageError from pydantic import ConfigDict, RootModel, field_validator # Regex to capture any kind of valid base64 data url (with optional media type and ;base64) @@ -189,6 +190,65 @@ def scale_image_with_padding( fill=(0, 0, 0), # Black padding ) +def scale_coordinates_with_padding( + x: float, + y: float, + original_width: int, + original_height: int, + max_width: int, + max_height: int, +) -> Tuple[float, float]: + """Convert coordinates from an original image to a scaled and padded image. + This function takes coordinates from the original image and calculates + their corresponding position in an image that has been scaled and + padded to fit within `max_width` and `max_height`. + Args: + x (float): The x-coordinate in the original image. + y (float): The y-coordinate in the original image. + original_width (int): The width of the original image. + original_height (int): The height of the original image. + max_width (int): The maximum width of the output scaled and padded image. + max_height (int): The maximum height of the output scaled and padded image. + Returns: + Tuple[float, float]: A tuple of (scaled_x, scaled_y) coordinates + in the padded image. + """ + scale_factor, scaled_width, scaled_height = _calculate_aspect_fit_scaling( + original_width, original_height, max_width, max_height + ) + + pad_left = (max_width - scaled_width) // 2 + pad_top = (max_height - scaled_height) // 2 + + scaled_x = x * scale_factor + pad_left + scaled_y = y * scale_factor + pad_top + + if scaled_x < 0 or scaled_y < 0 or scaled_x > max_width or scaled_y > max_height: + error_msg = "Coordinates are outside the padded image area" + raise ValueError(error_msg) + return scaled_x, scaled_y + + +def _calculate_aspect_fit_scaling( + original_width: int, + original_height: int, + max_width: int, + max_height: int, +) -> Tuple[float, float, float]: + """Calculate the scale factors for an image to fit within specified dimensions while maintaining aspect ratio and adding padding.""" + + aspect_ratio = original_width / original_height + if (max_width / max_height) > aspect_ratio: + scale_factor = max_height / original_height + scaled_width = int(original_width * scale_factor) + scaled_height = max_height + else: + scale_factor = max_width / original_width + scaled_width = max_width + scaled_height = int(original_height * scale_factor) + + return scale_factor, scaled_width, scaled_height + def scale_coordinates_back( x: float, @@ -214,15 +274,10 @@ def scale_coordinates_back( Raises: ValueError: If the coordinates are outside the padded image area. """ - aspect_ratio = original_width / original_height - if (max_width / max_height) > aspect_ratio: - scale_factor = max_height / original_height - scaled_width = int(original_width * scale_factor) - scaled_height = max_height - else: - scale_factor = max_width / original_width - scaled_width = max_width - scaled_height = int(original_height * scale_factor) + scale_factor, scaled_width, scaled_height = _calculate_aspect_fit_scaling( + original_width, original_height, max_width, max_height + ) + pad_left = (max_width - scaled_width) // 2 pad_top = (max_height - scaled_height) // 2 adjusted_x = x - pad_left From 81f724475e783f8be5e5942983d5c665167c61c5 Mon Sep 17 00:00:00 2001 From: danyalxahid-askui Date: Fri, 25 Jul 2025 15:31:32 +0200 Subject: [PATCH 2/4] style: update docstrings for scale_coordinates_with_padding and _calculate_aspect_fit_scaling functions --- src/askui/utils/image_utils.py | 30 +++++++++++++++--------------- 1 file changed, 15 insertions(+), 15 deletions(-) diff --git a/src/askui/utils/image_utils.py b/src/askui/utils/image_utils.py index 67319911..d54c8b5a 100644 --- a/src/askui/utils/image_utils.py +++ b/src/askui/utils/image_utils.py @@ -198,20 +198,9 @@ def scale_coordinates_with_padding( max_width: int, max_height: int, ) -> Tuple[float, float]: - """Convert coordinates from an original image to a scaled and padded image. - This function takes coordinates from the original image and calculates - their corresponding position in an image that has been scaled and - padded to fit within `max_width` and `max_height`. - Args: - x (float): The x-coordinate in the original image. - y (float): The y-coordinate in the original image. - original_width (int): The width of the original image. - original_height (int): The height of the original image. - max_width (int): The maximum width of the output scaled and padded image. - max_height (int): The maximum height of the output scaled and padded image. - Returns: - Tuple[float, float]: A tuple of (scaled_x, scaled_y) coordinates - in the padded image. + """ + Scale coordinates from an original coordinate system to a scaled and padded coordinate system. + """ scale_factor, scaled_width, scaled_height = _calculate_aspect_fit_scaling( original_width, original_height, max_width, max_height @@ -235,7 +224,18 @@ def _calculate_aspect_fit_scaling( max_width: int, max_height: int, ) -> Tuple[float, float, float]: - """Calculate the scale factors for an image to fit within specified dimensions while maintaining aspect ratio and adding padding.""" + """Calculate the scale factors for an image to fit within specified dimensions while maintaining aspect ratio. + + Args: + original_width (int): The width of the original coordinate system. + original_height (int): The height of the original coordinate system. + max_width (int): The maximum width of the output scaled coordinate system. + max_height (int): The maximum height of the output scaled coordinate system. + + Returns: + Tuple[float, float, float]: A tuple of (scale_factor, scaled_width, scaled_height). + + """ aspect_ratio = original_width / original_height if (max_width / max_height) > aspect_ratio: From 42734a1acbc62ef89377bc54333ccd0383be3976 Mon Sep 17 00:00:00 2001 From: danyalxahid-askui Date: Fri, 25 Jul 2025 15:39:02 +0200 Subject: [PATCH 3/4] fix: update return type of method to include str in Computer20250124Tool --- src/askui/tools/computer.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/askui/tools/computer.py b/src/askui/tools/computer.py index ab048e59..286ddf12 100644 --- a/src/askui/tools/computer.py +++ b/src/askui/tools/computer.py @@ -440,7 +440,7 @@ def __call__( # noqa: C901 scroll_amount: Annotated[int, Field(ge=0)] | None = None, duration: Annotated[float, Field(ge=0.0, le=100.0)] | None = None, key: str | None = None, # maybe not all keys supported - ) -> Image.Image | None: + ) -> Image.Image | None | str: match action: case "hold_key": self._hold_key(keystroke=text, duration=duration) # type: ignore[arg-type] From 2fba2dbda785423570df67828a6fec89fa7786f4 Mon Sep 17 00:00:00 2001 From: danyalxahid-askui Date: Fri, 25 Jul 2025 15:44:17 +0200 Subject: [PATCH 4/4] style: reorganize imports and update formatting in computer.py and image_utils.py --- src/askui/tools/computer.py | 14 +++++++++----- src/askui/utils/image_utils.py | 12 ++++++------ 2 files changed, 15 insertions(+), 11 deletions(-) diff --git a/src/askui/tools/computer.py b/src/askui/tools/computer.py index 286ddf12..a97bb3cb 100644 --- a/src/askui/tools/computer.py +++ b/src/askui/tools/computer.py @@ -4,16 +4,20 @@ from dataclasses import dataclass from typing import Annotated, Literal, TypedDict, cast, get_args -from anthropic.types.beta import (BetaToolComputerUse20241022Param, - BetaToolComputerUse20250124Param) +from anthropic.types.beta import ( + BetaToolComputerUse20241022Param, + BetaToolComputerUse20250124Param, +) from PIL import Image from pydantic import Field, validate_call from typing_extensions import Self, override from askui.tools.agent_os import AgentOs, Coordinate, ModifierKey, PcKey -from askui.utils.image_utils import (scale_coordinates_back, - scale_coordinates_with_padding, - scale_image_with_padding) +from askui.utils.image_utils import ( + scale_coordinates_back, + scale_coordinates_with_padding, + scale_image_with_padding, +) from ..models.shared.tools import InputSchema, Tool diff --git a/src/askui/utils/image_utils.py b/src/askui/utils/image_utils.py index d54c8b5a..f3a60a5b 100644 --- a/src/askui/utils/image_utils.py +++ b/src/askui/utils/image_utils.py @@ -7,9 +7,8 @@ from pathlib import Path from typing import Any, Literal, Tuple, Union -from PIL import Image +from PIL import Image, ImageDraw, ImageOps, UnidentifiedImageError from PIL import Image as PILImage -from PIL import ImageDraw, ImageOps, UnidentifiedImageError from pydantic import ConfigDict, RootModel, field_validator # Regex to capture any kind of valid base64 data url (with optional media type and ;base64) @@ -190,6 +189,7 @@ def scale_image_with_padding( fill=(0, 0, 0), # Black padding ) + def scale_coordinates_with_padding( x: float, y: float, @@ -200,7 +200,7 @@ def scale_coordinates_with_padding( ) -> Tuple[float, float]: """ Scale coordinates from an original coordinate system to a scaled and padded coordinate system. - + """ scale_factor, scaled_width, scaled_height = _calculate_aspect_fit_scaling( original_width, original_height, max_width, max_height @@ -225,16 +225,16 @@ def _calculate_aspect_fit_scaling( max_height: int, ) -> Tuple[float, float, float]: """Calculate the scale factors for an image to fit within specified dimensions while maintaining aspect ratio. - + Args: original_width (int): The width of the original coordinate system. original_height (int): The height of the original coordinate system. max_width (int): The maximum width of the output scaled coordinate system. max_height (int): The maximum height of the output scaled coordinate system. - + Returns: Tuple[float, float, float]: A tuple of (scale_factor, scaled_width, scaled_height). - + """ aspect_ratio = original_width / original_height