From 13ddd9e867275c9260a035af7ef49c4de6ea21e6 Mon Sep 17 00:00:00 2001 From: danyalxahid-askui Date: Fri, 25 Jul 2025 14:38:47 +0200 Subject: [PATCH 01/13] feat: add display management tools and enhance VisionAgent --- src/askui/agent.py | 6 ++++ src/askui/tools/active_display_tool.py | 20 ++++++++++++ src/askui/tools/agent_os.py | 38 +++++++++++++++++++++-- src/askui/tools/askui/askui_controller.py | 35 ++++++++++++++++----- src/askui/tools/list_display_tool.py | 21 +++++++++++++ src/askui/tools/set_display_tool.py | 21 +++++++++++++ 6 files changed, 131 insertions(+), 10 deletions(-) create mode 100644 src/askui/tools/active_display_tool.py create mode 100644 src/askui/tools/list_display_tool.py create mode 100644 src/askui/tools/set_display_tool.py diff --git a/src/askui/agent.py b/src/askui/agent.py index 30dbce32..3c27a8c9 100644 --- a/src/askui/agent.py +++ b/src/askui/agent.py @@ -17,8 +17,11 @@ MessageSettings, ) from askui.models.shared.tools import Tool +from askui.tools.active_display_tool import ActiveDisplayTool from askui.tools.computer import Computer20241022Tool, Computer20250124Tool from askui.tools.exception_tool import ExceptionTool +from askui.tools.list_display_tool import ListDisplayTool +from askui.tools.set_display_tool import SetDisplayTool from .logger import logger from .models import ModelComposition @@ -115,6 +118,9 @@ def __init__( models=models, tools=[ ExceptionTool(), + SetDisplayTool(agent_os=self.tools.os), + ListDisplayTool(agent_os=self.tools.os), + ActiveDisplayTool(agent_os=self.tools.os), ] + (act_tools or []), agent_os=self.tools.os, diff --git a/src/askui/tools/active_display_tool.py b/src/askui/tools/active_display_tool.py new file mode 100644 index 00000000..ede6f992 --- /dev/null +++ b/src/askui/tools/active_display_tool.py @@ -0,0 +1,20 @@ +from askui.models.shared.tools import Tool +from askui.tools.agent_os import AgentOs + + +class ActiveDisplayTool(Tool): + """ + Tool to get the active display id. 
+ """ + + def __init__(self, agent_os: AgentOs) -> None: + super().__init__( + name="active_display", + description=""" + This tool is useful for getting the active display id. + """, + ) + self._agent_os: AgentOs = agent_os + + def __call__(self) -> str: + return str(self._agent_os.get_active_display()) diff --git a/src/askui/tools/agent_os.py b/src/askui/tools/agent_os.py index 00b8902a..5f3c169f 100644 --- a/src/askui/tools/agent_os.py +++ b/src/askui/tools/agent_os.py @@ -2,12 +2,12 @@ from typing import TYPE_CHECKING, Literal from PIL import Image -from pydantic import BaseModel +from pydantic import BaseModel, Field if TYPE_CHECKING: - from askui.tools.askui.askui_ui_controller_grpc.generated.AgentOS_Send_Request_2501 import ( # noqa: E501 + from askui.tools.askui.askui_ui_controller_grpc.generated.AgentOS_Send_Request_2501 import ( RenderObjectStyle, - ) + ) # noqa: E501 ModifierKey = Literal[ @@ -159,6 +159,26 @@ class Coordinate(BaseModel): y: int +class SizeInPixels(BaseModel): + """Represents the size of a display in pixels.""" + + width: int + height: int + + +class DisplayInformation(BaseModel): + """Contains information about a single display.""" + + display_id: int = Field(validation_alias="displayID") + size_in_pixels: SizeInPixels = Field(validation_alias="sizeInPixels", exclude=True) + + +class GetDisplayInformationResponse(BaseModel): + """Response model for display information requests.""" + + displays: list[DisplayInformation] + + InputEvent = ClickEvent @@ -323,6 +343,18 @@ def keyboard_tap( """ raise NotImplementedError + def get_display_information(self) -> GetDisplayInformationResponse: + """ + Get information about all available displays and virtual screen. + """ + raise NotImplementedError + + def get_active_display(self) -> int: + """ + Get the active display. + """ + raise NotImplementedError + def set_display(self, display: int = 1) -> None: """ Sets the active display for screen interactions. 
diff --git a/src/askui/tools/askui/askui_controller.py b/src/askui/tools/askui/askui_controller.py index a08937ea..2b1d1ae6 100644 --- a/src/askui/tools/askui/askui_controller.py +++ b/src/askui/tools/askui/askui_controller.py @@ -7,13 +7,20 @@ from typing import Literal, Type import grpc +from google.protobuf.json_format import MessageToDict from PIL import Image from typing_extensions import Self, override from askui.container import telemetry from askui.logger import logger from askui.reporting import Reporter -from askui.tools.agent_os import AgentOs, Coordinate, ModifierKey, PcKey +from askui.tools.agent_os import ( + AgentOs, + Coordinate, + GetDisplayInformationResponse, + ModifierKey, + PcKey, +) from askui.tools.askui.askui_controller_settings import AskUiControllerSettings from askui.tools.askui.askui_ui_controller_grpc.generated import ( Controller_V1_pb2 as controller_v1_pbs, @@ -21,11 +28,11 @@ from askui.tools.askui.askui_ui_controller_grpc.generated import ( Controller_V1_pb2_grpc as controller_v1, ) -from askui.tools.askui.askui_ui_controller_grpc.generated.AgentOS_Send_Request_2501 import ( # noqa: E501 - RenderObjectStyle, +from askui.tools.askui.askui_ui_controller_grpc.generated.AgentOS_Send_Request_2501 import ( + RenderObjectStyle, # noqa: E501 ) -from askui.tools.askui.askui_ui_controller_grpc.generated.AgentOS_Send_Response_2501 import ( # noqa: E501 - AskuiAgentosSendResponseSchema, +from askui.tools.askui.askui_ui_controller_grpc.generated.AgentOS_Send_Response_2501 import ( + AskuiAgentosSendResponseSchema, # noqa: E501 ) from askui.tools.askui.command_helpers import ( create_clear_render_objects_command, @@ -626,9 +633,18 @@ def run_command(self, command: str, timeout_ms: int = 30000) -> None: ) @telemetry.record_call() + @override + def get_active_display(self) -> int: + """ + Get the active display. 
+ """ + return self._display + + @telemetry.record_call() + @override def get_display_information( self, - ) -> controller_v1_pbs.Response_GetDisplayInformation: + ) -> GetDisplayInformationResponse: """ Get information about all available displays and virtual screen. @@ -647,7 +663,12 @@ def get_display_information( self._stub.GetDisplayInformation(controller_v1_pbs.Request_Void()) ) - return response + response_dict = MessageToDict( + response, + preserving_proto_field_name=True, + ) + + return GetDisplayInformationResponse.model_validate(response_dict) @telemetry.record_call() def get_process_list( diff --git a/src/askui/tools/list_display_tool.py b/src/askui/tools/list_display_tool.py new file mode 100644 index 00000000..5fa20f48 --- /dev/null +++ b/src/askui/tools/list_display_tool.py @@ -0,0 +1,21 @@ +from askui.models.shared.tools import Tool +from askui.tools.agent_os import AgentOs, GetDisplayInformationResponse + + +class ListDisplayTool(Tool): + """ + Tool to list all the available displays. + """ + + def __init__(self, agent_os: AgentOs) -> None: + super().__init__( + name="list_display", + description=""" + This tool is useful for listing all the available displays. + This is useful when the agent is not able to find the information on the current display. + """, + ) + self._agent_os: AgentOs = agent_os + + def __call__(self) -> GetDisplayInformationResponse: + return self._agent_os.get_display_information() diff --git a/src/askui/tools/set_display_tool.py b/src/askui/tools/set_display_tool.py new file mode 100644 index 00000000..385ea068 --- /dev/null +++ b/src/askui/tools/set_display_tool.py @@ -0,0 +1,21 @@ +from askui.models.shared.tools import Tool +from askui.tools.agent_os import AgentOs + + +class SetDisplayTool(Tool): + """ + Tool to set the display. + """ + + def __init__(self, agent_os: AgentOs) -> None: + super().__init__( + name="set_display", + description=""" + This tool is useful for setting the default display screen. 
+ This is useful when the agent is not able to find the information on the current display. + """, + ) + self._agent_os: AgentOs = agent_os + + def __call__(self, display_id: int) -> None: + self._agent_os.set_display(display_id) From 4995892946473009e0e51cb235792f3b240af7f6 Mon Sep 17 00:00:00 2001 From: danyalxahid-askui Date: Fri, 25 Jul 2025 15:22:52 +0200 Subject: [PATCH 02/13] refactor: enhance ComputerToolBase with mouse position scaling and add coordinate scaling utility functions --- src/askui/tools/computer.py | 30 ++++++++++---- src/askui/utils/image_utils.py | 75 +++++++++++++++++++++++++++++----- 2 files changed, 87 insertions(+), 18 deletions(-) diff --git a/src/askui/tools/computer.py b/src/askui/tools/computer.py index 8289b591..ab048e59 100644 --- a/src/askui/tools/computer.py +++ b/src/askui/tools/computer.py @@ -4,16 +4,16 @@ from dataclasses import dataclass from typing import Annotated, Literal, TypedDict, cast, get_args -from anthropic.types.beta import ( - BetaToolComputerUse20241022Param, - BetaToolComputerUse20250124Param, -) +from anthropic.types.beta import (BetaToolComputerUse20241022Param, + BetaToolComputerUse20250124Param) from PIL import Image from pydantic import Field, validate_call from typing_extensions import Self, override -from askui.tools.agent_os import AgentOs, ModifierKey, PcKey -from askui.utils.image_utils import scale_coordinates_back, scale_image_with_padding +from askui.tools.agent_os import AgentOs, Coordinate, ModifierKey, PcKey +from askui.utils.image_utils import (scale_coordinates_back, + scale_coordinates_with_padding, + scale_image_with_padding) from ..models.shared.tools import InputSchema, Tool @@ -223,10 +223,10 @@ def __call__( # noqa: C901 text: str | None = None, coordinate: tuple[Annotated[int, Field(ge=0)], Annotated[int, Field(ge=0)]] | None = None, - ) -> Image.Image | None: + ) -> Image.Image | None | str: match action: case "cursor_position": - raise ActionNotImplementedError(action, self.name) + 
return self._get_mouse_position_scaled() case "double_click": return self._agent_os.click("left", 2) case "key": @@ -325,6 +325,20 @@ def _screenshot(self) -> Image.Image: self._real_screen_height = screenshot.height return scale_image_with_padding(screenshot, self._width, self._height) + def _get_mouse_position_scaled(self) -> str: + mouse_position: Coordinate = self._agent_os.get_mouse_position() + real_screen_width, real_screen_height = self._get_real_screen_resolution() + x, y = scale_coordinates_with_padding( + mouse_position.x, + mouse_position.y, + real_screen_width, + real_screen_height, + self._width, + self._height, + ) + + return f"X={x},Y={y}" + class Computer20241022Tool(ComputerToolBase): type: Literal["computer_20241022"] = "computer_20241022" diff --git a/src/askui/utils/image_utils.py b/src/askui/utils/image_utils.py index c2647f76..67319911 100644 --- a/src/askui/utils/image_utils.py +++ b/src/askui/utils/image_utils.py @@ -7,8 +7,9 @@ from pathlib import Path from typing import Any, Literal, Tuple, Union -from PIL import Image, ImageDraw, ImageOps, UnidentifiedImageError +from PIL import Image from PIL import Image as PILImage +from PIL import ImageDraw, ImageOps, UnidentifiedImageError from pydantic import ConfigDict, RootModel, field_validator # Regex to capture any kind of valid base64 data url (with optional media type and ;base64) @@ -189,6 +190,65 @@ def scale_image_with_padding( fill=(0, 0, 0), # Black padding ) +def scale_coordinates_with_padding( + x: float, + y: float, + original_width: int, + original_height: int, + max_width: int, + max_height: int, +) -> Tuple[float, float]: + """Convert coordinates from an original image to a scaled and padded image. + This function takes coordinates from the original image and calculates + their corresponding position in an image that has been scaled and + padded to fit within `max_width` and `max_height`. + Args: + x (float): The x-coordinate in the original image. 
+ y (float): The y-coordinate in the original image. + original_width (int): The width of the original image. + original_height (int): The height of the original image. + max_width (int): The maximum width of the output scaled and padded image. + max_height (int): The maximum height of the output scaled and padded image. + Returns: + Tuple[float, float]: A tuple of (scaled_x, scaled_y) coordinates + in the padded image. + """ + scale_factor, scaled_width, scaled_height = _calculate_aspect_fit_scaling( + original_width, original_height, max_width, max_height + ) + + pad_left = (max_width - scaled_width) // 2 + pad_top = (max_height - scaled_height) // 2 + + scaled_x = x * scale_factor + pad_left + scaled_y = y * scale_factor + pad_top + + if scaled_x < 0 or scaled_y < 0 or scaled_x > max_width or scaled_y > max_height: + error_msg = "Coordinates are outside the padded image area" + raise ValueError(error_msg) + return scaled_x, scaled_y + + +def _calculate_aspect_fit_scaling( + original_width: int, + original_height: int, + max_width: int, + max_height: int, +) -> Tuple[float, float, float]: + """Calculate the scale factors for an image to fit within specified dimensions while maintaining aspect ratio and adding padding.""" + + aspect_ratio = original_width / original_height + if (max_width / max_height) > aspect_ratio: + scale_factor = max_height / original_height + scaled_width = int(original_width * scale_factor) + scaled_height = max_height + else: + scale_factor = max_width / original_width + scaled_width = max_width + scaled_height = int(original_height * scale_factor) + + return scale_factor, scaled_width, scaled_height + def scale_coordinates_back( x: float, @@ -214,15 +274,10 @@ def scale_coordinates_back( Raises: ValueError: If the coordinates are outside the padded image area. 
""" - aspect_ratio = original_width / original_height - if (max_width / max_height) > aspect_ratio: - scale_factor = max_height / original_height - scaled_width = int(original_width * scale_factor) - scaled_height = max_height - else: - scale_factor = max_width / original_width - scaled_width = max_width - scaled_height = int(original_height * scale_factor) + scale_factor, scaled_width, scaled_height = _calculate_aspect_fit_scaling( + original_width, original_height, max_width, max_height + ) + pad_left = (max_width - scaled_width) // 2 pad_top = (max_height - scaled_height) // 2 adjusted_x = x - pad_left From 241057d9e6e0064e198f1a42205f0e1de32ee963 Mon Sep 17 00:00:00 2001 From: danyalxahid-askui Date: Fri, 25 Jul 2025 15:31:32 +0200 Subject: [PATCH 03/13] style: update docstrings for scale_coordinates_with_padding and _calculate_aspect_fit_scaling functions --- src/askui/utils/image_utils.py | 30 +++++++++++++++--------------- 1 file changed, 15 insertions(+), 15 deletions(-) diff --git a/src/askui/utils/image_utils.py b/src/askui/utils/image_utils.py index 67319911..d54c8b5a 100644 --- a/src/askui/utils/image_utils.py +++ b/src/askui/utils/image_utils.py @@ -198,20 +198,9 @@ def scale_coordinates_with_padding( max_width: int, max_height: int, ) -> Tuple[float, float]: - """Convert coordinates from an original image to a scaled and padded image. - This function takes coordinates from the original image and calculates - their corresponding position in an image that has been scaled and - padded to fit within `max_width` and `max_height`. - Args: - x (float): The x-coordinate in the original image. - y (float): The y-coordinate in the original image. - original_width (int): The width of the original image. - original_height (int): The height of the original image. - max_width (int): The maximum width of the output scaled and padded image. - max_height (int): The maximum height of the output scaled and padded image. 
- Returns: - Tuple[float, float]: A tuple of (scaled_x, scaled_y) coordinates - in the padded image. + """ + Scale coordinates from an original coordinate system to a scaled and padded coordinate system. + """ scale_factor, scaled_width, scaled_height = _calculate_aspect_fit_scaling( original_width, original_height, max_width, max_height @@ -235,7 +224,18 @@ def _calculate_aspect_fit_scaling( max_width: int, max_height: int, ) -> Tuple[float, float, float]: - """Calculate the scale factors for an image to fit within specified dimensions while maintaining aspect ratio and adding padding.""" + """Calculate the scale factors for an image to fit within specified dimensions while maintaining aspect ratio. + + Args: + original_width (int): The width of the original coordinate system. + original_height (int): The height of the original coordinate system. + max_width (int): The maximum width of the output scaled coordinate system. + max_height (int): The maximum height of the output scaled coordinate system. + + Returns: + Tuple[float, float, float]: A tuple of (scale_factor, scaled_width, scaled_height). + + """ aspect_ratio = original_width / original_height if (max_width / max_height) > aspect_ratio: From 572f948f9c50299990bfc6caa141adb83594213b Mon Sep 17 00:00:00 2001 From: danyalxahid-askui Date: Fri, 25 Jul 2025 15:32:37 +0200 Subject: [PATCH 04/13] style: format description text in ListDisplayTool and SetDisplayTool --- src/askui/tools/list_display_tool.py | 3 ++- src/askui/tools/set_display_tool.py | 3 ++- 2 files changed, 4 insertions(+), 2 deletions(-) diff --git a/src/askui/tools/list_display_tool.py b/src/askui/tools/list_display_tool.py index 5fa20f48..f84a7992 100644 --- a/src/askui/tools/list_display_tool.py +++ b/src/askui/tools/list_display_tool.py @@ -12,7 +12,8 @@ def __init__(self, agent_os: AgentOs) -> None: name="list_display", description=""" This tool is useful for listing all the available displays. 
- This is useful when the agent is not able to find the information on the current display. + This is useful when the agent is not able to find the information on the + current display. """, ) self._agent_os: AgentOs = agent_os diff --git a/src/askui/tools/set_display_tool.py b/src/askui/tools/set_display_tool.py index 385ea068..53fd4b39 100644 --- a/src/askui/tools/set_display_tool.py +++ b/src/askui/tools/set_display_tool.py @@ -12,7 +12,8 @@ def __init__(self, agent_os: AgentOs) -> None: name="set_display", description=""" This tool is useful for setting the default display screen. - This is useful when the agent is not able to find the information on the current display. + This is useful when the agent is not able to find the information on the + current display. """, ) self._agent_os: AgentOs = agent_os From a4330bdbd7eeef68fca0b9fb948d41a6c5a0190c Mon Sep 17 00:00:00 2001 From: danyalxahid-askui Date: Fri, 25 Jul 2025 15:34:16 +0200 Subject: [PATCH 05/13] style: reorganize imports and update formatting in agent_os.py and askui_controller.py --- src/askui/tools/agent_os.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/askui/tools/agent_os.py b/src/askui/tools/agent_os.py index 5f3c169f..6ae7118b 100644 --- a/src/askui/tools/agent_os.py +++ b/src/askui/tools/agent_os.py @@ -5,9 +5,9 @@ from pydantic import BaseModel, Field if TYPE_CHECKING: - from askui.tools.askui.askui_ui_controller_grpc.generated.AgentOS_Send_Request_2501 import ( + from askui.tools.askui.askui_ui_controller_grpc.generated.AgentOS_Send_Request_2501 import ( # noqa: E501 RenderObjectStyle, - ) # noqa: E501 + ) ModifierKey = Literal[ From eca8b22c8037c8feac6ec6bb0ffb1e9fbed94bc7 Mon Sep 17 00:00:00 2001 From: danyalxahid-askui Date: Fri, 25 Jul 2025 15:39:02 +0200 Subject: [PATCH 06/13] fix: update return type of method to include str in Computer20250124Tool --- src/askui/tools/computer.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git 
a/src/askui/tools/computer.py b/src/askui/tools/computer.py index ab048e59..286ddf12 100644 --- a/src/askui/tools/computer.py +++ b/src/askui/tools/computer.py @@ -440,7 +440,7 @@ def __call__( # noqa: C901 scroll_amount: Annotated[int, Field(ge=0)] | None = None, duration: Annotated[float, Field(ge=0.0, le=100.0)] | None = None, key: str | None = None, # maybe not all keys supported - ) -> Image.Image | None: + ) -> Image.Image | None | str: match action: case "hold_key": self._hold_key(keystroke=text, duration=duration) # type: ignore[arg-type] From 960b4beb04e40949e2db99651188681a06bd7e7e Mon Sep 17 00:00:00 2001 From: danyalxahid-askui Date: Fri, 25 Jul 2025 15:44:17 +0200 Subject: [PATCH 07/13] style: reorganize imports and update formatting in computer.py and image_utils.py --- src/askui/tools/computer.py | 14 +++++++++----- src/askui/utils/image_utils.py | 12 ++++++------ 2 files changed, 15 insertions(+), 11 deletions(-) diff --git a/src/askui/tools/computer.py b/src/askui/tools/computer.py index 286ddf12..a97bb3cb 100644 --- a/src/askui/tools/computer.py +++ b/src/askui/tools/computer.py @@ -4,16 +4,20 @@ from dataclasses import dataclass from typing import Annotated, Literal, TypedDict, cast, get_args -from anthropic.types.beta import (BetaToolComputerUse20241022Param, - BetaToolComputerUse20250124Param) +from anthropic.types.beta import ( + BetaToolComputerUse20241022Param, + BetaToolComputerUse20250124Param, +) from PIL import Image from pydantic import Field, validate_call from typing_extensions import Self, override from askui.tools.agent_os import AgentOs, Coordinate, ModifierKey, PcKey -from askui.utils.image_utils import (scale_coordinates_back, - scale_coordinates_with_padding, - scale_image_with_padding) +from askui.utils.image_utils import ( + scale_coordinates_back, + scale_coordinates_with_padding, + scale_image_with_padding, +) from ..models.shared.tools import InputSchema, Tool diff --git a/src/askui/utils/image_utils.py 
b/src/askui/utils/image_utils.py index d54c8b5a..f3a60a5b 100644 --- a/src/askui/utils/image_utils.py +++ b/src/askui/utils/image_utils.py @@ -7,9 +7,8 @@ from pathlib import Path from typing import Any, Literal, Tuple, Union -from PIL import Image +from PIL import Image, ImageDraw, ImageOps, UnidentifiedImageError from PIL import Image as PILImage -from PIL import ImageDraw, ImageOps, UnidentifiedImageError from pydantic import ConfigDict, RootModel, field_validator # Regex to capture any kind of valid base64 data url (with optional media type and ;base64) @@ -190,6 +189,7 @@ def scale_image_with_padding( fill=(0, 0, 0), # Black padding ) + def scale_coordinates_with_padding( x: float, y: float, @@ -200,7 +200,7 @@ def scale_coordinates_with_padding( ) -> Tuple[float, float]: """ Scale coordinates from an original coordinate system to a scaled and padded coordinate system. - + """ scale_factor, scaled_width, scaled_height = _calculate_aspect_fit_scaling( original_width, original_height, max_width, max_height @@ -225,16 +225,16 @@ def _calculate_aspect_fit_scaling( max_height: int, ) -> Tuple[float, float, float]: """Calculate the scale factors for an image to fit within specified dimensions while maintaining aspect ratio. - + Args: original_width (int): The width of the original coordinate system. original_height (int): The height of the original coordinate system. max_width (int): The maximum width of the output scaled coordinate system. max_height (int): The maximum height of the output scaled coordinate system. - + Returns: Tuple[float, float, float]: A tuple of (scale_factor, scaled_width, scaled_height). 
- + """ aspect_ratio = original_width / original_height From c7a4b35f7ffdf239f58b4e43959546bf41309d98 Mon Sep 17 00:00:00 2001 From: danyalxahid-askui Date: Fri, 25 Jul 2025 17:30:15 +0200 Subject: [PATCH 08/13] feat: add input schema for display_id in SetDisplayTool --- src/askui/tools/set_display_tool.py | 12 ++++++++++++ 1 file changed, 12 insertions(+) diff --git a/src/askui/tools/set_display_tool.py b/src/askui/tools/set_display_tool.py index 53fd4b39..19763772 100644 --- a/src/askui/tools/set_display_tool.py +++ b/src/askui/tools/set_display_tool.py @@ -15,6 +15,18 @@ def __init__(self, agent_os: AgentOs) -> None: This is useful when the agent is not able to find the information on the current display. """, + input_schema={ + "type": "object", + "properties": { + "display_id": { + "type": "integer", + "description": ( + "The display id to set. Must be a valid display id." + ), + }, + }, + "required": ["display_id"], + }, ) self._agent_os: AgentOs = agent_os From d8864b09e9d023372844705964407cc8afa523c9 Mon Sep 17 00:00:00 2001 From: Adrian Stritzinger Date: Tue, 29 Jul 2025 12:01:59 +0200 Subject: [PATCH 09/13] refactor: improve usability of display tools by ai/human agent - clearer, more concise naming --- src/askui/agent.py | 17 +++++----- src/askui/tools/agent_os.py | 31 +++++++++---------- src/askui/tools/askui/askui_controller.py | 16 +++++----- src/askui/tools/list_display_tool.py | 22 ------------- ..._display_tool.py => list_displays_tool.py} | 14 ++++----- .../tools/retrieve_active_display_tool.py | 16 ++++++++++ ...lay_tool.py => set_active_display_tool.py} | 16 +++------- .../e2e/tools/askui/test_askui_controller.py | 2 +- 8 files changed, 58 insertions(+), 76 deletions(-) delete mode 100644 src/askui/tools/list_display_tool.py rename src/askui/tools/{active_display_tool.py => list_displays_tool.py} (54%) create mode 100644 src/askui/tools/retrieve_active_display_tool.py rename src/askui/tools/{set_display_tool.py => set_active_display_tool.py} 
(58%) diff --git a/src/askui/agent.py b/src/askui/agent.py index 3c27a8c9..98fb82af 100644 --- a/src/askui/agent.py +++ b/src/askui/agent.py @@ -17,11 +17,11 @@ MessageSettings, ) from askui.models.shared.tools import Tool -from askui.tools.active_display_tool import ActiveDisplayTool from askui.tools.computer import Computer20241022Tool, Computer20250124Tool from askui.tools.exception_tool import ExceptionTool -from askui.tools.list_display_tool import ListDisplayTool -from askui.tools.set_display_tool import SetDisplayTool +from askui.tools.list_displays_tool import ListDisplaysTool +from askui.tools.retrieve_active_display_tool import RetrieveActiveDisplayTool +from askui.tools.set_active_display_tool import SetActiveDisplayTool from .logger import logger from .models import ModelComposition @@ -33,9 +33,10 @@ _SYSTEM_PROMPT = f""" * You are utilising a {sys.platform} machine using {platform.machine()} architecture with internet access. +* When you cannot find something (application window, ui element etc.) on the currently selected/active display/screen, check the other available displays by listing them and checking which one is currently active and then going through the other displays one by one until you find it or you have checked all of them. * When asked to perform web tasks try to open the browser (firefox, chrome, safari, ...) if not already open. Often you can find the browser icons in the toolbars of the operating systems. -* When viewing a page it can be helpful to zoom out so that you can see everything on the page. Either that, or make sure you scroll down to see everything before deciding something isn't available. -* When using your function calls, they take a while to run and send back to you. Where possible/feasible, try to chain multiple of these calls all into one function calls request. +* When viewing a page it can be helpful to zoom out/in so that you can see everything on the page.
Either that, or make sure you scroll down/up to see everything before deciding something isn't available. +* When using your function calls, they take a while to run and send back to you. Where possible/feasible, try to chain multiple of these calls all into one function calls request. * The current date and time is {datetime.now(timezone.utc).strftime("%A, %B %d, %Y %H:%M:%S %z")}. @@ -118,9 +119,9 @@ def __init__( models=models, tools=[ ExceptionTool(), - SetDisplayTool(agent_os=self.tools.os), - ListDisplayTool(agent_os=self.tools.os), - ActiveDisplayTool(agent_os=self.tools.os), + SetActiveDisplayTool(agent_os=self.tools.os), + RetrieveActiveDisplayTool(agent_os=self.tools.os), + ListDisplaysTool(agent_os=self.tools.os), ] + (act_tools or []), agent_os=self.tools.os, diff --git a/src/askui/tools/agent_os.py b/src/askui/tools/agent_os.py index 6ae7118b..86236885 100644 --- a/src/askui/tools/agent_os.py +++ b/src/askui/tools/agent_os.py @@ -5,8 +5,8 @@ from pydantic import BaseModel, Field if TYPE_CHECKING: - from askui.tools.askui.askui_ui_controller_grpc.generated.AgentOS_Send_Request_2501 import ( # noqa: E501 - RenderObjectStyle, + from askui.tools.askui.askui_ui_controller_grpc.generated.AgentOS_Send_Request_2501 import ( + RenderObjectStyle, # noqa: E501 ) @@ -159,24 +159,20 @@ class Coordinate(BaseModel): y: int -class SizeInPixels(BaseModel): +class DisplaySize(BaseModel): """Represents the size of a display in pixels.""" width: int height: int -class DisplayInformation(BaseModel): - """Contains information about a single display.""" +class Display(BaseModel): + id: int = Field(validation_alias="displayID") + size: DisplaySize = Field(validation_alias="sizeInPixels") - display_id: int = Field(validation_alias="displayID") - size_in_pixels: SizeInPixels = Field(validation_alias="sizeInPixels", exclude=True) - -class GetDisplayInformationResponse(BaseModel): - """Response model for display information requests.""" - - displays: list[DisplayInformation] 
+class DisplaysListResponse(BaseModel): + data: list[Display] = Field(validation_alias="displays") InputEvent = ClickEvent @@ -343,15 +339,18 @@ def keyboard_tap( """ raise NotImplementedError - def get_display_information(self) -> GetDisplayInformationResponse: + def list_displays(self) -> DisplaysListResponse: """ - Get information about all available displays and virtual screen. + List all the available displays. """ raise NotImplementedError - def get_active_display(self) -> int: + def retrieve_active_display(self) -> int: """ - Get the active display. + Retrieve the id of the currently active display/screen. + + Returns: + int: The id of the currently active display/screen. """ raise NotImplementedError diff --git a/src/askui/tools/askui/askui_controller.py b/src/askui/tools/askui/askui_controller.py index 2b1d1ae6..c4a91868 100644 --- a/src/askui/tools/askui/askui_controller.py +++ b/src/askui/tools/askui/askui_controller.py @@ -17,7 +17,7 @@ from askui.tools.agent_os import ( AgentOs, Coordinate, - GetDisplayInformationResponse, + DisplaysListResponse, ModifierKey, PcKey, ) @@ -634,7 +634,7 @@ def run_command(self, command: str, timeout_ms: int = 30000) -> None: @telemetry.record_call() @override - def get_active_display(self) -> int: + def retrieve_active_display(self) -> int: """ Get the active display. """ @@ -642,16 +642,14 @@ def get_active_display(self) -> int: @telemetry.record_call() @override - def get_display_information( + def list_displays( self, - ) -> GetDisplayInformationResponse: + ) -> DisplaysListResponse: """ - Get information about all available displays and virtual screen. + List all available displays including virtual screens. 
Returns: - controller_v1_pbs.Response_GetDisplayInformation: - - displays: List of DisplayInformation objects - - virtualScreenRectangle: Overall virtual screen bounds + DisplaysListResponse """ assert isinstance(self._stub, controller_v1.ControllerAPIStub), ( "Stub is not initialized" @@ -668,7 +666,7 @@ def get_display_information( preserving_proto_field_name=True, ) - return GetDisplayInformationResponse.model_validate(response_dict) + return DisplaysListResponse.model_validate(response_dict) @telemetry.record_call() def get_process_list( diff --git a/src/askui/tools/list_display_tool.py b/src/askui/tools/list_display_tool.py deleted file mode 100644 index f84a7992..00000000 --- a/src/askui/tools/list_display_tool.py +++ /dev/null @@ -1,22 +0,0 @@ -from askui.models.shared.tools import Tool -from askui.tools.agent_os import AgentOs, GetDisplayInformationResponse - - -class ListDisplayTool(Tool): - """ - Tool to list all the available displays. - """ - - def __init__(self, agent_os: AgentOs) -> None: - super().__init__( - name="list_display", - description=""" - This tool is useful for listing all the available displays. - This is useful when the agent is not able to find the information on the - current display. - """, - ) - self._agent_os: AgentOs = agent_os - - def __call__(self) -> GetDisplayInformationResponse: - return self._agent_os.get_display_information() diff --git a/src/askui/tools/active_display_tool.py b/src/askui/tools/list_displays_tool.py similarity index 54% rename from src/askui/tools/active_display_tool.py rename to src/askui/tools/list_displays_tool.py index ede6f992..92cf9088 100644 --- a/src/askui/tools/active_display_tool.py +++ b/src/askui/tools/list_displays_tool.py @@ -2,19 +2,17 @@ from askui.tools.agent_os import AgentOs -class ActiveDisplayTool(Tool): - """ - Tool to get the active display id. 
- """ - +class ListDisplaysTool(Tool): def __init__(self, agent_os: AgentOs) -> None: super().__init__( - name="active_display", + name="list_displays", description=""" - This tool is useful for getting the active display id. + List all the available displays. """, ) self._agent_os: AgentOs = agent_os def __call__(self) -> str: - return str(self._agent_os.get_active_display()) + return self._agent_os.list_displays().model_dump_json( + exclude={"data": {"__all__": {"size"}}}, + ) diff --git a/src/askui/tools/retrieve_active_display_tool.py b/src/askui/tools/retrieve_active_display_tool.py new file mode 100644 index 00000000..2c4c65fa --- /dev/null +++ b/src/askui/tools/retrieve_active_display_tool.py @@ -0,0 +1,16 @@ +from askui.models.shared.tools import Tool +from askui.tools.agent_os import AgentOs + + +class RetrieveActiveDisplayTool(Tool): + def __init__(self, agent_os: AgentOs) -> None: + super().__init__( + name="retrieve_active_display", + description=""" + Retrieve the id (integer) of the currently active display/screen. + """, + ) + self._agent_os: AgentOs = agent_os + + def __call__(self) -> str: + return str(self._agent_os.retrieve_active_display()) diff --git a/src/askui/tools/set_display_tool.py b/src/askui/tools/set_active_display_tool.py similarity index 58% rename from src/askui/tools/set_display_tool.py rename to src/askui/tools/set_active_display_tool.py index 19763772..5fc6e57c 100644 --- a/src/askui/tools/set_display_tool.py +++ b/src/askui/tools/set_active_display_tool.py @@ -2,27 +2,19 @@ from askui.tools.agent_os import AgentOs -class SetDisplayTool(Tool): - """ - Tool to set the display. - """ - +class SetActiveDisplayTool(Tool): def __init__(self, agent_os: AgentOs) -> None: super().__init__( - name="set_display", + name="set_active_display", description=""" - This tool is useful for setting the default display screen. - This is useful when the agent is not able to find the information on the - current display. 
+ Set the display screen from which screenshots are taken and on which + actions are performed. """, input_schema={ "type": "object", "properties": { "display_id": { "type": "integer", - "description": ( - "The display id to set. Must be a valid display id." - ), }, }, "required": ["display_id"], diff --git a/tests/e2e/tools/askui/test_askui_controller.py b/tests/e2e/tools/askui/test_askui_controller.py index d9c3a097..2c479338 100644 --- a/tests/e2e/tools/askui/test_askui_controller.py +++ b/tests/e2e/tools/askui/test_askui_controller.py @@ -122,7 +122,7 @@ def test_screenshot_basic(controller_client: AskUiControllerClient) -> None: def test_get_display_information(controller_client: AskUiControllerClient) -> None: """Test retrieving display information""" with controller_client: - display_info = controller_client.get_display_information() + display_info = controller_client.list_displays() assert display_info is not None From 0c39e2258bae0a268349b4b9549c5548d0f58f56 Mon Sep 17 00:00:00 2001 From: Adrian Stritzinger Date: Tue, 29 Jul 2025 12:09:10 +0200 Subject: [PATCH 10/13] refactor: improve usability of display tools by ai/human agent - return whole display object instead of just id for `retrieve_active_display` --- src/askui/tools/agent_os.py | 6 +++--- src/askui/tools/askui/askui_controller.py | 18 ++++++++++++++---- .../tools/retrieve_active_display_tool.py | 6 ++++-- 3 files changed, 21 insertions(+), 9 deletions(-) diff --git a/src/askui/tools/agent_os.py b/src/askui/tools/agent_os.py index 86236885..0559ed70 100644 --- a/src/askui/tools/agent_os.py +++ b/src/askui/tools/agent_os.py @@ -345,12 +345,12 @@ def list_displays(self) -> DisplaysListResponse: """ raise NotImplementedError - def retrieve_active_display(self) -> int: + def retrieve_active_display(self) -> Display: """ - Retrieve the id of the currently active display/screen. + Retrieve the currently active display/screen. Returns: - int: The id of the currently active display/screen. 
+ Display: The currently active display/screen. """ raise NotImplementedError diff --git a/src/askui/tools/askui/askui_controller.py b/src/askui/tools/askui/askui_controller.py index c4a91868..81351f6c 100644 --- a/src/askui/tools/askui/askui_controller.py +++ b/src/askui/tools/askui/askui_controller.py @@ -17,6 +17,7 @@ from askui.tools.agent_os import ( AgentOs, Coordinate, + Display, DisplaysListResponse, ModifierKey, PcKey, @@ -634,11 +635,20 @@ def run_command(self, command: str, timeout_ms: int = 30000) -> None: @telemetry.record_call() @override - def retrieve_active_display(self) -> int: + def retrieve_active_display(self) -> Display: """ - Get the active display. + Retrieve the currently active display/screen. + + Returns: + Display: The currently active display/screen. """ - return self._display + self._reporter.add_message("AgentOS", "retrieve_active_display()") + displays_list_response = self.list_displays() + for display in displays_list_response.data: + if display.id == self._display: + return display + error_msg = f"Display {self._display} not found" + raise ValueError(error_msg) @telemetry.record_call() @override @@ -655,7 +665,7 @@ def list_displays( "Stub is not initialized" ) - self._reporter.add_message("AgentOS", "get_display_information()") + self._reporter.add_message("AgentOS", "list_displays()") response: controller_v1_pbs.Response_GetDisplayInformation = ( self._stub.GetDisplayInformation(controller_v1_pbs.Request_Void()) diff --git a/src/askui/tools/retrieve_active_display_tool.py b/src/askui/tools/retrieve_active_display_tool.py index 2c4c65fa..f42882f5 100644 --- a/src/askui/tools/retrieve_active_display_tool.py +++ b/src/askui/tools/retrieve_active_display_tool.py @@ -7,10 +7,12 @@ def __init__(self, agent_os: AgentOs) -> None: super().__init__( name="retrieve_active_display", description=""" - Retrieve the id (integer) of the currently active display/screen. + Retrieve the currently active display/screen. 
""", ) self._agent_os: AgentOs = agent_os def __call__(self) -> str: - return str(self._agent_os.retrieve_active_display()) + return str( + self._agent_os.retrieve_active_display().model_dump_json(exclude={"size"}) + ) From 69f8afdfdb69991a44dc4ab422a4720df9e77a76 Mon Sep 17 00:00:00 2001 From: Adrian Stritzinger Date: Tue, 29 Jul 2025 12:25:13 +0200 Subject: [PATCH 11/13] fix(tools/computer): invalid real coordinate cache - when switching the display, the cache needs to be invalidated - removed the cache by retrieving display size live - moved from inferring the display size from the screenshot to retrieving size which is cheaper and faster --- src/askui/tools/agent_os.py | 11 ++++++++--- src/askui/tools/computer.py | 11 ++--------- src/askui/tools/playwright/agent_os.py | 24 +++++++++++++++++++++++- src/askui/tools/pynput_agent_os.py | 24 +++++++++++++++++++++++- 4 files changed, 56 insertions(+), 14 deletions(-) diff --git a/src/askui/tools/agent_os.py b/src/askui/tools/agent_os.py index 0559ed70..346a6ed7 100644 --- a/src/askui/tools/agent_os.py +++ b/src/askui/tools/agent_os.py @@ -2,11 +2,11 @@ from typing import TYPE_CHECKING, Literal from PIL import Image -from pydantic import BaseModel, Field +from pydantic import BaseModel, ConfigDict, Field if TYPE_CHECKING: - from askui.tools.askui.askui_ui_controller_grpc.generated.AgentOS_Send_Request_2501 import ( - RenderObjectStyle, # noqa: E501 + from askui.tools.askui.askui_ui_controller_grpc.generated.AgentOS_Send_Request_2501 import ( # noqa: E501 + RenderObjectStyle, ) @@ -167,6 +167,10 @@ class DisplaySize(BaseModel): class Display(BaseModel): + model_config = ConfigDict( + validate_by_name=True, + ) + id: int = Field(validation_alias="displayID") size: DisplaySize = Field(validation_alias="sizeInPixels") @@ -345,6 +349,7 @@ def list_displays(self) -> DisplaysListResponse: """ raise NotImplementedError + @abstractmethod def retrieve_active_display(self) -> Display: """ Retrieve the currently active 
display/screen. diff --git a/src/askui/tools/computer.py b/src/askui/tools/computer.py index a97bb3cb..1ebb1401 100644 --- a/src/askui/tools/computer.py +++ b/src/askui/tools/computer.py @@ -206,8 +206,6 @@ def __init__( self._agent_os = agent_os self._width = 1280 self._height = 800 - self._real_screen_width: int | None = None - self._real_screen_height: int | None = None @property def params_base( @@ -279,11 +277,8 @@ def _keyboard_released(self, keystroke: str) -> None: ) def _get_real_screen_resolution(self) -> tuple[int, int]: - if self._real_screen_width is None or self._real_screen_height is None: - screenshot = self._agent_os.screenshot() - self._real_screen_width = screenshot.width - self._real_screen_height = screenshot.height - return self._real_screen_width, self._real_screen_height + size = self._agent_os.retrieve_active_display().size + return size.width, size.height def _scale_coordinates_back( self, @@ -325,8 +320,6 @@ def _screenshot(self) -> Image.Image: Take a screenshot of the current screen, scale it and return it """ screenshot = self._agent_os.screenshot() - self._real_screen_width = screenshot.width - self._real_screen_height = screenshot.height return scale_image_with_padding(screenshot, self._width, self._height) def _get_mouse_position_scaled(self) -> str: diff --git a/src/askui/tools/playwright/agent_os.py b/src/askui/tools/playwright/agent_os.py index 285f6be4..aa226ca7 100644 --- a/src/askui/tools/playwright/agent_os.py +++ b/src/askui/tools/playwright/agent_os.py @@ -16,7 +16,7 @@ ) from typing_extensions import override -from ..agent_os import AgentOs, InputEvent, ModifierKey, PcKey +from ..agent_os import AgentOs, Display, DisplaySize, InputEvent, ModifierKey, PcKey class PlaywrightAgentOs(AgentOs): @@ -356,6 +356,28 @@ def keyboard_tap( for modifier in modifier_keys: self._page.keyboard.up(self._convert_key(modifier)) + @override + def retrieve_active_display(self) -> Display: + """ + Retrieve the currently active display/screen. 
+ """ + if not self._page: + error_msg = "No active page. Call connect() first." + raise RuntimeError(error_msg) + + viewport_size = self._page.viewport_size + if viewport_size is None: + error_msg = "No viewport size." + raise RuntimeError(error_msg) + + return Display( + id=1, + size=DisplaySize( + width=viewport_size["width"], + height=viewport_size["height"], + ), + ) + def _convert_key(self, key: PcKey | ModifierKey) -> str: """ Convert our key format to Playwright's key format. diff --git a/src/askui/tools/pynput_agent_os.py b/src/askui/tools/pynput_agent_os.py index 1cd022e2..8a57a83d 100644 --- a/src/askui/tools/pynput_agent_os.py +++ b/src/askui/tools/pynput_agent_os.py @@ -18,7 +18,14 @@ from askui.logger import logger from askui.reporting import CompositeReporter, Reporter -from askui.tools.agent_os import AgentOs, InputEvent, ModifierKey, PcKey +from askui.tools.agent_os import ( + AgentOs, + Display, + DisplaySize, + InputEvent, + ModifierKey, + PcKey, +) from askui.utils.image_utils import draw_point_on_image if platform.system() == "Windows": @@ -415,3 +422,18 @@ def stop_listening(self) -> None: self._mouse_listener = None while not self._input_event_queue.empty(): self._input_event_queue.get() + + @override + def retrieve_active_display(self) -> Display: + """ + Retrieve the currently active display/screen. 
+ """ + monitor = self._sct.monitors[self._display] + + return Display( + id=self._display, + size=DisplaySize( + width=monitor["width"], + height=monitor["height"], + ), + ) From 062e84285e49b19f0d5a30badd86e4f7562119cb Mon Sep 17 00:00:00 2001 From: Adrian Stritzinger Date: Tue, 29 Jul 2025 17:44:16 +0200 Subject: [PATCH 12/13] refactor(tools/computer): consistent, more readable coordinate scaling --- src/askui/locators/serializers.py | 4 +- src/askui/models/anthropic/messages_api.py | 23 +- src/askui/models/shared/tools.py | 2 + src/askui/tools/android/agent_os_facade.py | 20 +- src/askui/tools/askui/askui_controller.py | 8 +- src/askui/tools/computer.py | 77 +++-- src/askui/utils/image_utils.py | 360 +++++++++++++-------- tests/conftest.py | 6 +- tests/unit/utils/test_image_utils.py | 183 ++++++++++- 9 files changed, 473 insertions(+), 210 deletions(-) diff --git a/src/askui/locators/serializers.py b/src/askui/locators/serializers.py index 32dd3987..35d52688 100644 --- a/src/askui/locators/serializers.py +++ b/src/askui/locators/serializers.py @@ -13,9 +13,7 @@ Prompt, Text, ) -from .locators import ( - AiElement as AiElementLocator, -) +from .locators import AiElement as AiElementLocator from .relatable import ( BoundingRelation, LogicalRelation, diff --git a/src/askui/models/anthropic/messages_api.py b/src/askui/models/anthropic/messages_api.py index 5418984b..bfb8704a 100644 --- a/src/askui/models/anthropic/messages_api.py +++ b/src/askui/models/anthropic/messages_api.py @@ -45,8 +45,8 @@ from askui.utils.image_utils import ( ImageSource, image_to_base64, - scale_coordinates_back, - scale_image_with_padding, + scale_coordinates, + scale_image_to_fit, ) from .utils import extract_click_coordinates @@ -156,10 +156,9 @@ def _inference( system: str, model_choice: str, ) -> str: - scaled_image = scale_image_with_padding( + scaled_image = scale_image_to_fit( image.root, - self._settings.resolution[0], - self._settings.resolution[1], + self._settings.resolution, ) 
message = self.create_message( messages=[ @@ -222,16 +221,12 @@ def locate( ), model_choice=model_choice, ) - scaled_x, scaled_y = extract_click_coordinates(content) - x, y = scale_coordinates_back( - scaled_x, - scaled_y, - image.root.width, - image.root.height, - screen_width, - screen_height, + return scale_coordinates( + extract_click_coordinates(content), + image.root.size, + self._settings.resolution, + inverse=True, ) - return int(x), int(y) except ( _UnexpectedResponseError, ValueError, diff --git a/src/askui/models/shared/tools.py b/src/askui/models/shared/tools.py index 63d3e4eb..355a9aae 100644 --- a/src/askui/models/shared/tools.py +++ b/src/askui/models/shared/tools.py @@ -7,6 +7,7 @@ from pydantic import BaseModel, Field from typing_extensions import Self +from askui.logger import logger from askui.models.shared.agent_message_param import ( Base64ImageSourceParam, ContentBlockParam, @@ -155,6 +156,7 @@ def _run_tool( except AgentException: raise except Exception as e: # noqa: BLE001 + logger.error(f"Tool {tool_use_block_param.name} failed: {e}", exc_info=True) return ToolResultBlockParam( content=f"Tool {tool_use_block_param.name} failed: {e}", is_error=True, diff --git a/src/askui/tools/android/agent_os_facade.py b/src/askui/tools/android/agent_os_facade.py index 85ec1712..3ab29037 100644 --- a/src/askui/tools/android/agent_os_facade.py +++ b/src/askui/tools/android/agent_os_facade.py @@ -4,7 +4,7 @@ from askui.reporting import Reporter from askui.tools.android.agent_os import ANDROID_KEY, AndroidAgentOs, AndroidDisplay -from askui.utils.image_utils import scale_coordinates_back, scale_image_with_padding +from askui.utils.image_utils import scale_coordinates, scale_image_to_fit class AndroidAgentOsFacade(AndroidAgentOs): @@ -32,10 +32,9 @@ def disconnect(self) -> None: def screenshot(self) -> Image.Image: screenshot = self._agent_os.screenshot() self._real_screen_resolution = screenshot.size - scaled_image = scale_image_with_padding( + scaled_image = 
scale_image_to_fit( screenshot, - self._target_resolution[0], - self._target_resolution[1], + self._target_resolution, ) self._reporter.add_message("AndroidAgentOS", "Screenshot taken", screenshot) @@ -45,15 +44,12 @@ def _scale_coordinates_back(self, x: int, y: int) -> Tuple[int, int]: if self._real_screen_resolution is None: self._real_screen_resolution = self._agent_os.screenshot().size - scaled_x, scaled_y = scale_coordinates_back( - x, - y, - self._real_screen_resolution[0], - self._real_screen_resolution[1], - self._target_resolution[0], - self._target_resolution[1], + return scale_coordinates( + (x, y), + self._real_screen_resolution, + self._target_resolution, + inverse=True, ) - return int(scaled_x), int(scaled_y) def tap(self, x: int, y: int) -> None: scaled_x, scaled_y = self._scale_coordinates_back(x, y) diff --git a/src/askui/tools/askui/askui_controller.py b/src/askui/tools/askui/askui_controller.py index 81351f6c..4c872872 100644 --- a/src/askui/tools/askui/askui_controller.py +++ b/src/askui/tools/askui/askui_controller.py @@ -29,11 +29,11 @@ from askui.tools.askui.askui_ui_controller_grpc.generated import ( Controller_V1_pb2_grpc as controller_v1, ) -from askui.tools.askui.askui_ui_controller_grpc.generated.AgentOS_Send_Request_2501 import ( - RenderObjectStyle, # noqa: E501 +from askui.tools.askui.askui_ui_controller_grpc.generated.AgentOS_Send_Request_2501 import ( # noqa: E501 + RenderObjectStyle, ) -from askui.tools.askui.askui_ui_controller_grpc.generated.AgentOS_Send_Response_2501 import ( - AskuiAgentosSendResponseSchema, # noqa: E501 +from askui.tools.askui.askui_ui_controller_grpc.generated.AgentOS_Send_Response_2501 import ( # noqa: E501 + AskuiAgentosSendResponseSchema, ) from askui.tools.askui.command_helpers import ( create_clear_render_objects_command, diff --git a/src/askui/tools/computer.py b/src/askui/tools/computer.py index 1ebb1401..7100ddb5 100644 --- a/src/askui/tools/computer.py +++ b/src/askui/tools/computer.py @@ -13,11 
+13,7 @@ from typing_extensions import Self, override from askui.tools.agent_os import AgentOs, Coordinate, ModifierKey, PcKey -from askui.utils.image_utils import ( - scale_coordinates_back, - scale_coordinates_with_padding, - scale_image_with_padding, -) +from askui.utils.image_utils import scale_coordinates, scale_image_to_fit from ..models.shared.tools import InputSchema, Tool @@ -192,11 +188,33 @@ class BetaToolComputerUseParamBase(TypedDict): display_height_px: int +@dataclass +class Resolution: + width: int + height: int + + +# https://github.com/anthropics/anthropic-quickstarts/blob/main/computer-use-demo/README.md +RESOLUTIONS_RECOMMENDED_BY_ANTHROPIC: dict[str, Resolution] = { + "XGA": Resolution(width=1024, height=768), # 4:3 + "WXGA": Resolution(width=1280, height=800), # 16:10 +} + + +def _get_closest_recommended_resolution(resolution: Resolution) -> Resolution: + return min( + RESOLUTIONS_RECOMMENDED_BY_ANTHROPIC.values(), + key=lambda r: abs(r.width - resolution.width) + + abs(r.height - resolution.height), + ) + + class ComputerToolBase(Tool, ABC): def __init__( self, agent_os: AgentOs, input_schema: InputSchema, + resolution: Resolution | None = None, ) -> None: super().__init__( name="computer", @@ -204,8 +222,21 @@ def __init__( input_schema=input_schema, ) self._agent_os = agent_os - self._width = 1280 - self._height = 800 + real_resolution = self._get_real_screen_resolution() + self._resolution = resolution or _get_closest_recommended_resolution( + Resolution( + width=real_resolution[0], + height=real_resolution[1], + ) + ) + + @property + def _width(self) -> int: + return self._resolution.width + + @property + def _height(self) -> int: + return self._resolution.height @property def params_base( @@ -228,7 +259,7 @@ def __call__( # noqa: C901 ) -> Image.Image | None | str: match action: case "cursor_position": - return self._get_mouse_position_scaled() + return self._retrieve_cursor_position() case "double_click": return 
self._agent_os.click("left", 2) case "key": @@ -284,17 +315,12 @@ def _scale_coordinates_back( self, coordinate: tuple[Annotated[int, Field(ge=0)], Annotated[int, Field(ge=0)]], ) -> tuple[int, int]: - real_screen_width, real_screen_height = self._get_real_screen_resolution() - x, y = scale_coordinates_back( - coordinate[0], - coordinate[1], - real_screen_width, - real_screen_height, - self._width, - self._height, + return scale_coordinates( + coordinate, + self._get_real_screen_resolution(), + (self._width, self._height), + inverse=True, ) - x, y = int(x), int(y) - return x, y @validate_call def _mouse_move( @@ -320,18 +346,15 @@ def _screenshot(self) -> Image.Image: Take a screenshot of the current screen, scale it and return it """ screenshot = self._agent_os.screenshot() - return scale_image_with_padding(screenshot, self._width, self._height) + return scale_image_to_fit(screenshot, (self._width, self._height)) - def _get_mouse_position_scaled(self) -> str: + def _retrieve_cursor_position(self) -> str: mouse_position: Coordinate = self._agent_os.get_mouse_position() real_screen_width, real_screen_height = self._get_real_screen_resolution() - x, y = scale_coordinates_with_padding( - mouse_position.x, - mouse_position.y, - real_screen_width, - real_screen_height, - self._width, - self._height, + x, y = scale_coordinates( + (mouse_position.x, mouse_position.y), + (real_screen_width, real_screen_height), + (self._width, self._height), ) return f"X={x},Y={y}" diff --git a/src/askui/utils/image_utils.py b/src/askui/utils/image_utils.py index f3a60a5b..d2ca53ea 100644 --- a/src/askui/utils/image_utils.py +++ b/src/askui/utils/image_utils.py @@ -3,11 +3,11 @@ import io import pathlib import re -from io import BytesIO +from dataclasses import dataclass from pathlib import Path -from typing import Any, Literal, Tuple, Union +from typing import Any, Literal, Union -from PIL import Image, ImageDraw, ImageOps, UnidentifiedImageError +from PIL import Image, ImageDraw, 
UnidentifiedImageError from PIL import Image as PILImage from pydantic import ConfigDict, RootModel, field_validator @@ -32,16 +32,14 @@ def load_image(source: Union[str, Path, Image.Image]) -> Image.Image: if isinstance(source, Image.Image): return source - if isinstance(source, Path) or ( - isinstance(source, str) and not source.startswith(("data:", ",")) - ): + if isinstance(source, Path) or (not source.startswith(("data:", ","))): try: return Image.open(source) except (OSError, FileNotFoundError, UnidentifiedImageError) as e: error_msg = f"Could not open image from file path: {source}" raise ValueError(error_msg) from e - if isinstance(source, str): + else: match = _DATA_URL_GENERIC_RE.match(source) if match: try: @@ -73,6 +71,26 @@ def image_to_data_url(image: PILImage.Image) -> str: return f"data:image/png;base64,{image_to_base64(image=image, format_='PNG')}" +def base64_to_image(base64_string: str) -> Image.Image: + """Convert a base64 string to a PIL Image. + + Args: + base64_string (str): The base64 encoded image string. + + Returns: + Image.Image: A PIL Image object. + + Raises: + ValueError: If the base64 string is invalid or the image cannot be decoded. + """ + try: + image_bytes = base64.b64decode(base64_string) + return Image.open(io.BytesIO(image_bytes)) + except (binascii.Error, UnidentifiedImageError) as e: + error_msg = f"Could not convert base64 string to image: {e}" + raise ValueError(error_msg) from e + + def data_url_to_image(data_url: str) -> Image.Image: """Convert a data URL to a PIL Image. @@ -83,13 +101,17 @@ def data_url_to_image(data_url: str) -> Image.Image: Image.Image: A PIL Image object. Raises: - ValueError: If the data URL is invalid or the image cannot be decoded. + ValueError: If the data URL is invalid or the data URL data cannot be decoded + or the image cannot be decoded. 
""" - data_url = data_url.split(",")[1] - while len(data_url) % 4 != 0: - data_url += "=" - image_data = base64.b64decode(data_url) - return Image.open(BytesIO(image_data)) + try: + data_url_data = data_url.split(",")[1] + while len(data_url_data) % 4 != 0: + data_url_data += "=" + return base64_to_image(data_url_data) + except (IndexError, ValueError) as e: + error_msg = f"Could not convert data URL to image: {e}" + raise ValueError(error_msg) from e def draw_point_on_image( @@ -112,22 +134,6 @@ def draw_point_on_image( return img_copy -def base64_to_image(base64_string: str) -> Image.Image: - """Convert a base64 string to a PIL Image. - - Args: - base64_string (str): The base64 encoded image string. - - Returns: - Image.Image: A PIL Image object. - - Raises: - ValueError: If the base64 string is invalid or the image cannot be decoded. - """ - image_bytes = base64.b64decode(base64_string) - return Image.open(io.BytesIO(image_bytes)) - - def image_to_base64( image: Union[pathlib.Path, Image.Image], format_: Literal["PNG", "JPEG"] = "PNG" ) -> str: @@ -145,154 +151,211 @@ def image_to_base64( """ image_bytes: bytes | None = None if isinstance(image, Image.Image): - with io.BytesIO() as _bytes: - image.save(_bytes, format=format_) - image_bytes = _bytes.getvalue() - elif isinstance(image, pathlib.Path): - with Path.open(image, "rb") as f: - image_bytes = f.read() + with io.BytesIO() as buffer: + image.save(buffer, format=format_) + image_bytes = buffer.getvalue() + else: + with Path.open(image, "rb") as file: + image_bytes = file.read() + return base64.b64encode(image_bytes).decode("utf-8") -def scale_image_with_padding( - image: Image.Image, max_width: int, max_height: int -) -> Image.Image: - """Scale an image to fit within specified dimensions while maintaining aspect ratio and adding padding. 
+def _calc_center_offset( + image_size: tuple[int, int], + container_size: tuple[int, int], +) -> tuple[int, int]: + """Calculate the offset to center the image in the container. + + If the image is larger than the container, the offset will be negative. Args: - image (Image.Image): The PIL Image to scale. - max_width (int): The maximum width of the output image. - max_height (int): The maximum height of the output image. + image_size (tuple[int, int]): The size of the image to center (width, height). + container_size (tuple[int, int]): The size of the container to center the image in (width, height). Returns: - Image.Image: A new PIL Image that fits within the specified dimensions with padding. + tuple[int, int]: The offset to center the image in the container. """ - original_width, original_height = image.size - aspect_ratio = original_width / original_height - if (max_width / max_height) > aspect_ratio: - scale_factor = max_height / original_height - else: - scale_factor = max_width / original_width - scaled_width = int(original_width * scale_factor) - scaled_height = int(original_height * scale_factor) - scaled_image = image.resize((scaled_width, scaled_height), Image.Resampling.LANCZOS) - pad_left = (max_width - scaled_width) // 2 - pad_top = (max_height - scaled_height) // 2 - return ImageOps.expand( - scaled_image, - border=( - pad_left, - pad_top, - max_width - scaled_width - pad_left, - max_height - scaled_height - pad_top, - ), - fill=(0, 0, 0), # Black padding + return ( + (container_size[0] - image_size[0]) // 2, + (container_size[1] - image_size[1]) // 2, ) -def scale_coordinates_with_padding( - x: float, - y: float, - original_width: int, - original_height: int, - max_width: int, - max_height: int, -) -> Tuple[float, float]: - """ - Scale coordinates from an original coordinate system to a scaled and padded coordinate system. +@dataclass +class ScalingResults: + """Results of scaling calculations. + Args: + factor (float): The scaling factor applied. 
+ size (tuple[int, int]): The resulting size (width, height). """ - scale_factor, scaled_width, scaled_height = _calculate_aspect_fit_scaling( - original_width, original_height, max_width, max_height - ) - pad_left = (max_width - scaled_width) // 2 - pad_top = (max_height - scaled_height) // 2 + factor: float + size: tuple[int, int] - scaled_x = x * scale_factor + pad_left - scaled_y = y * scale_factor + pad_top - - if scaled_x < 0 or scaled_y < 0 or scaled_x > max_width or scaled_y > max_height: - error_msg = "Coordinates are outside the padded image area" - raise ValueError(error_msg) - return scaled_x, scaled_y +def _calculate_scaling_for_fit( + original_size: tuple[int, int], + target_size: tuple[int, int], +) -> ScalingResults: + """Calculate the scaling factor and size of an image to fit within target size while maintaining aspect ratio. -def _calculate_aspect_fit_scaling( - original_width: int, - original_height: int, - max_width: int, - max_height: int, -) -> Tuple[float, float, float]: - """Calculate the scale factors for an image to fit within specified dimensions while maintaining aspect ratio. + If the image is larger than the target size, the scaling factor will be less than 1. Args: - original_width (int): The width of the original coordinate system. - original_height (int): The height of the original coordinate system. - max_width (int): The maximum width of the output scaled coordinate system. - max_height (int): The maximum height of the output scaled coordinate system. + original_size (tuple[int, int]): The size of the original image (width, height). + target_size (tuple[int, int]): The target size to fit the image into (width, height). Returns: - Tuple[float, float, float]: A tuple of (scale_factor, scaled_width, scaled_height). + ScalingResults: The scaling factor and resulting size. + Raises: + ValueError: If the original size or target size is not positive. 
""" + if original_size[0] <= 0 or original_size[1] <= 0: + error_msg = f"Size must have positive width and height: {original_size}" + raise ValueError(error_msg) + + if target_size[0] <= 0 or target_size[1] <= 0: + error_msg = f"Target size must have positive width and height: {target_size}" + raise ValueError(error_msg) - aspect_ratio = original_width / original_height - if (max_width / max_height) > aspect_ratio: - scale_factor = max_height / original_height - scaled_width = int(original_width * scale_factor) - scaled_height = max_height + aspect_ratio = original_size[0] / original_size[1] + target_aspect_ratio = target_size[0] / target_size[1] + if target_aspect_ratio > aspect_ratio: + factor = target_size[1] / original_size[1] + width = max(1, int(original_size[0] * factor)) # Ensure minimum width of 1 + height = target_size[1] else: - scale_factor = max_width / original_width - scaled_width = max_width - scaled_height = int(original_height * scale_factor) + factor = target_size[0] / original_size[0] + width = target_size[0] + height = max(1, int(original_size[1] * factor)) # Ensure minimum height of 1 + return ScalingResults(factor=factor, size=(width, height)) - return scale_factor, scaled_width, scaled_height +def _center_image_in_background( + image: Image.Image, + background_size: tuple[int, int], + background_color: tuple[int, int, int] = (0, 0, 0), +) -> Image.Image: + """Center an image in a background image. -def scale_coordinates_back( - x: float, - y: float, - original_width: int, - original_height: int, - max_width: int, - max_height: int, -) -> Tuple[float, float]: - """Convert coordinates from a scaled and padded image back to the original image coordinates. + Args: + image (Image.Image): The image to center. + background_size (tuple[int, int]): The size of the background (width, height). + background_color (tuple[int, int, int], optional): The background color. Defaults to `(0, 0, 0)`. 
+ + Returns: + Image.Image: A new image with the input image centered on the background. + """ + background = Image.new("RGB", background_size, background_color) + offset = _calc_center_offset(image.size, background_size) + background.paste(image, offset) + return background + + +def scale_image_to_fit( + image: Image.Image, + target_size: tuple[int, int], +) -> Image.Image: + """Scale an image to fit within specified size while maintaining aspect ratio. + + Use black padding to fill the remaining space. Args: - x (float): The x-coordinate in the scaled image. - y (float): The y-coordinate in the scaled image. - original_width (int): The width of the original image. - original_height (int): The height of the original image. - max_width (int): The maximum width used for scaling. - max_height (int): The maximum height used for scaling. + image (Image.Image): The PIL Image to scale. + target_size (tuple[int, int]): The target size to fit the image into (width, height). Returns: - Tuple[float, float]: A tuple of (original_x, original_y) coordinates. + Image.Image: A new PIL Image that fits within the specified size. + """ + scaling_results = _calculate_scaling_for_fit(image.size, target_size) + scaled_image = image.resize(scaling_results.size, Image.Resampling.LANCZOS) + return _center_image_in_background(scaled_image, target_size) - Raises: - ValueError: If the coordinates are outside the padded image area. + +def _scale_coordinates( + coordinates: tuple[int, int], + offset: tuple[int, int], + factor: float, + inverse: bool, +) -> tuple[int, int]: + """Scale coordinates based on scaling factor and offset. + + Args: + coordinates (tuple[int, int]): The coordinates to scale. + offset (tuple[int, int]): The offset to apply. + factor (float): The scaling factor. + inverse (bool): Whether to apply inverse scaling. + + Returns: + tuple[int, int]: The scaled coordinates. 
""" - scale_factor, scaled_width, scaled_height = _calculate_aspect_fit_scaling( - original_width, original_height, max_width, max_height - ) + if inverse: + result = ( + (coordinates[0] - offset[0]) / factor, + (coordinates[1] - offset[1]) / factor, + ) + else: + result = ( + (coordinates[0]) * factor + offset[0], + (coordinates[1]) * factor + offset[1], + ) + return (int(result[0]), int(result[1])) + + +def _check_coordinates_in_bounds( + coordinates: tuple[float, float], + bounds: tuple[int, int], +) -> None: + """Check if coordinates are within bounds. + + Args: + coordinates (tuple[float, float]): The coordinates to check. + bounds (tuple[int, int]): The bounds (width, height). - pad_left = (max_width - scaled_width) // 2 - pad_top = (max_height - scaled_height) // 2 - adjusted_x = x - pad_left - adjusted_y = y - pad_top + Raises: + ValueError: If coordinates are out of bounds. + """ if ( - adjusted_x < 0 - or adjusted_y < 0 - or adjusted_x > scaled_width - or adjusted_y > scaled_height + coordinates[0] < 0 + or coordinates[1] < 0 + or coordinates[0] > bounds[0] + or coordinates[1] > bounds[1] ): - error_msg = "Coordinates are outside the padded image area" + print(bounds) + error_msg = f"Coordinates {coordinates[0]}, {coordinates[1]} are out of bounds" raise ValueError(error_msg) - original_x = adjusted_x / scale_factor - original_y = adjusted_y / scale_factor - return original_x, original_y + + +def scale_coordinates( + coordinates: tuple[int, int], + original_size: tuple[int, int], + target_size: tuple[int, int], + inverse: bool = False, +) -> tuple[int, int]: + """Scale coordinates between original and scaled image sizes. + + Args: + coordinates (tuple[int, int]): The coordinates to scale. + original_size (tuple[int, int]): The original image size (width, height). + target_size (tuple[int, int]): The target size (width, height). + inverse (bool, optional): Whether to scale from target to original. Defaults to `False`. 
+ + Returns: + tuple[int, int]: The scaled coordinates. + + Raises: + ValueError: If the scaled coordinates are out of bounds. + """ + scaling_results = _calculate_scaling_for_fit(original_size, target_size) + offset = _calc_center_offset(scaling_results.size, target_size) + result = _scale_coordinates(coordinates, offset, scaling_results.factor, inverse) + _check_coordinates_in_bounds( + result, original_size if inverse else scaling_results.size + ) + return result Img = Union[str, Path, PILImage.Image] @@ -306,9 +369,9 @@ def scale_coordinates_back( class ImageSource(RootModel): - """A Pydantic model that represents an image source and provides methods to convert it to different formats. + """A class that represents an image source and provides methods to convert it to different formats. - The model can be initialized with: + The class can be initialized with: - A PIL Image object - A file path (str or pathlib.Path) - A data URL string @@ -346,3 +409,18 @@ def to_base64(self) -> str: str: A base64 encoded string of the image. 
""" return image_to_base64(image=self.root) + + +__all__ = [ + "load_image", + "image_to_data_url", + "data_url_to_image", + "draw_point_on_image", + "base64_to_image", + "image_to_base64", + "scale_image_to_fit", + "scale_coordinates", + "ScalingResults", + "ImageSource", + "Img", +] diff --git a/tests/conftest.py b/tests/conftest.py index e84c3007..efae66ba 100644 --- a/tests/conftest.py +++ b/tests/conftest.py @@ -6,7 +6,7 @@ from pytest_mock import MockerFixture from askui.models.model_router import ModelRouter -from askui.tools.agent_os import AgentOs +from askui.tools.agent_os import AgentOs, Display, DisplaySize from askui.tools.toolbox import AgentToolbox @@ -41,6 +41,10 @@ def path_fixtures_github_com__icon(path_fixtures_images: pathlib.Path) -> pathli def agent_os_mock(mocker: MockerFixture) -> AgentOs: """Fixture providing a mock agent os.""" mock = mocker.MagicMock(spec=AgentOs) + mock.retrieve_active_display.return_value = Display( + id=1, + size=DisplaySize(width=100, height=100), + ) mock.screenshot.return_value = Image.new("RGB", (100, 100), color="white") return cast("AgentOs", mock) diff --git a/tests/unit/utils/test_image_utils.py b/tests/unit/utils/test_image_utils.py index d6c58f5d..c68988dd 100644 --- a/tests/unit/utils/test_image_utils.py +++ b/tests/unit/utils/test_image_utils.py @@ -6,14 +6,15 @@ from askui.utils.image_utils import ( ImageSource, + ScalingResults, base64_to_image, data_url_to_image, draw_point_on_image, image_to_base64, image_to_data_url, load_image, - scale_coordinates_back, - scale_image_with_padding, + scale_coordinates, + scale_image_to_fit, ) @@ -74,6 +75,14 @@ def test_load_image_invalid( img_str = base64.b64encode(img_bytes).decode() load_image(img_str) + def test_load_image_unsupported_type(self) -> None: + with pytest.raises(AttributeError): + load_image(123) # type: ignore + + def test_load_image_nonexistent_file(self) -> None: + with pytest.raises(ValueError, match="Could not open image from file path"): + 
load_image("nonexistent_file.png") + class TestImageSource: def test_image_source(self, path_fixtures_github_com__icon: pathlib.Path) -> None: @@ -135,6 +144,14 @@ def test_data_url_to_image( assert isinstance(img, Image.Image) assert img.size == (128, 125) + def test_data_url_to_image_invalid_format(self) -> None: + with pytest.raises(ValueError): + data_url_to_image("invalid_data_url") + + def test_data_url_to_image_invalid_base64(self) -> None: + with pytest.raises(ValueError): + data_url_to_image("data:image/png;base64,invalid_base64") + class TestPointDrawing: def test_draw_point_on_image( @@ -147,7 +164,31 @@ def test_draw_point_on_image( assert new_img != img # Should be a new image assert isinstance(new_img, Image.Image) # Check that the point was drawn by looking at the pixel color - assert new_img.getpixel((x, y)) == (255, 0, 0, 255) # Red color + pixel_color = new_img.getpixel((x, y)) + assert pixel_color == (255, 0, 0, 255) # Red color + + def test_draw_point_on_image_custom_size( + self, path_fixtures_github_com__icon: pathlib.Path + ) -> None: + img = Image.open(path_fixtures_github_com__icon) + x, y = 64, 62 + size = 5 + new_img = draw_point_on_image(img, x, y, size) + + # Check that the point was drawn with custom size + pixel_color = new_img.getpixel((x, y)) + assert pixel_color == (255, 0, 0, 255) # Red color + + def test_draw_point_on_image_edge_coordinates( + self, path_fixtures_github_com__icon: pathlib.Path + ) -> None: + img = Image.open(path_fixtures_github_com__icon) + x, y = 0, 0 # Edge coordinates + new_img = draw_point_on_image(img, x, y) + + assert new_img != img + pixel_color = new_img.getpixel((x, y)) + assert pixel_color == (255, 0, 0, 255) # Red color class TestBase64Conversion: @@ -162,6 +203,10 @@ def test_base64_to_image( assert isinstance(img, Image.Image) assert img.size == (128, 125) + def test_base64_to_image_invalid(self) -> None: + with pytest.raises(ValueError): + base64_to_image("invalid_base64") + def 
test_image_to_base64( self, path_fixtures_github_com__icon: pathlib.Path ) -> None: @@ -193,6 +238,10 @@ def test_image_to_base64_format( # Verify the images are different (JPEG is lossy) assert png_base64 != jpeg_base64 + def test_image_to_base64_unsupported_type(self) -> None: + with pytest.raises(AttributeError): + image_to_base64(123) # type: ignore + class TestImageScaling: def test_scale_image_with_padding( @@ -201,7 +250,7 @@ def test_scale_image_with_padding( img = Image.open(path_fixtures_github_com__icon) max_width, max_height = 200, 200 - scaled = scale_image_with_padding(img, max_width, max_height) + scaled = scale_image_to_fit(img, (max_width, max_height)) assert isinstance(scaled, Image.Image) assert scaled.size == (max_width, max_height) @@ -213,6 +262,28 @@ def test_scale_image_with_padding( ) / max_height assert abs(original_ratio - scaled_ratio) < 0.01 + def test_scale_image_smaller_than_target( + self, path_fixtures_github_com__icon: pathlib.Path + ) -> None: + img = Image.open(path_fixtures_github_com__icon) + # Target size smaller than original + target_size = (50, 50) + + scaled = scale_image_to_fit(img, target_size) + assert isinstance(scaled, Image.Image) + assert scaled.size == target_size + + def test_scale_image_square_to_rectangle( + self, path_fixtures_github_com__icon: pathlib.Path + ) -> None: + img = Image.open(path_fixtures_github_com__icon) + # Test scaling to a rectangular target + target_size = (300, 100) + + scaled = scale_image_to_fit(img, target_size) + assert isinstance(scaled, Image.Image) + assert scaled.size == target_size + def test_scale_coordinates_back( self, path_fixtures_github_com__icon: pathlib.Path ) -> None: @@ -221,8 +292,11 @@ def test_scale_coordinates_back( # Test coordinates in the center of the scaled image x, y = 100, 100 - original_x, original_y = scale_coordinates_back( - x, y, img.size[0], img.size[1], max_width, max_height + original_x, original_y = scale_coordinates( + (x, y), + img.size, + 
(max_width, max_height), + inverse=True, ) # Coordinates should be within the original image bounds @@ -231,6 +305,99 @@ def test_scale_coordinates_back( # Test coordinates outside the padded area with pytest.raises(ValueError): - scale_coordinates_back( - -10, -10, img.size[0], img.size[1], max_width, max_height + scale_coordinates( + (-10, -10), + img.size, + (max_width, max_height), + inverse=True, ) + + def test_scale_coordinates_forward( + self, path_fixtures_github_com__icon: pathlib.Path + ) -> None: + img = Image.open(path_fixtures_github_com__icon) + target_size = (200, 200) + + # Test scaling coordinates from original to target + original_coords = (64, 62) # Center of original image + scaled_coords = scale_coordinates( + original_coords, + img.size, + target_size, + inverse=False, + ) + + # Coordinates should be within the target bounds + assert 0 <= scaled_coords[0] <= target_size[0] + assert 0 <= scaled_coords[1] <= target_size[1] + + def test_scale_coordinates_out_of_bounds( + self, path_fixtures_github_com__icon: pathlib.Path + ) -> None: + img = Image.open(path_fixtures_github_com__icon) + target_size = (200, 200) + + # Test coordinates outside bounds + with pytest.raises(ValueError, match="are out of bounds"): + scale_coordinates( + (300, 300), # Outside target bounds + img.size, + target_size, + inverse=False, + ) + + +class TestScalingResults: + def test_scaling_results(self) -> None: + factor = 0.5 + size = (100, 50) + results = ScalingResults(factor=factor, size=size) + + assert results.factor == factor + assert results.size == size + + +class TestEdgeCases: + def test_empty_image(self) -> None: + # Create a minimal 1x1 image + img = Image.new("RGB", (1, 1), color="white") + + # Test scaling + scaled = scale_image_to_fit(img, (100, 100)) + assert scaled.size == (100, 100) + + # For a 1x1 image scaled to 100x100: + # - Scaling factor = 100 (both dimensions) + # - Scaled size = (100, 100) + # - Offset = (0, 0) since the scaled image is the same 
size as target + # - Coordinates (0, 0) in original map to (0, 0) in target + coords = scale_coordinates((0, 0), img.size, (100, 100)) + assert coords == (0, 0) # Should map to (0, 0) since no offset + + def test_large_image_scaling(self) -> None: + # Create a large image + img = Image.new("RGB", (1000, 1000), color="white") + target_size = (100, 100) + + scaled = scale_image_to_fit(img, target_size) + assert scaled.size == target_size + + def test_very_small_target( + self, path_fixtures_github_com__icon: pathlib.Path + ) -> None: + img = Image.open(path_fixtures_github_com__icon) + target_size = (1, 1) + + scaled = scale_image_to_fit(img, target_size) + assert scaled.size == target_size + + def test_data_url_edge_cases(self) -> None: + # Test malformed data URLs + with pytest.raises(ValueError): + data_url_to_image("not_a_data_url") + + with pytest.raises(ValueError): + data_url_to_image("data:invalid") + + with pytest.raises(ValueError): + data_url_to_image("data:image/png;base64,") # Empty base64 From 7a19a8fec252fa0347e6bb98381fae4f94755528 Mon Sep 17 00:00:00 2001 From: Adrian Stritzinger Date: Tue, 29 Jul 2025 17:53:08 +0200 Subject: [PATCH 13/13] tests: remove erroneous unit test --- tests/unit/utils/test_image_utils.py | 8 -------- 1 file changed, 8 deletions(-) diff --git a/tests/unit/utils/test_image_utils.py b/tests/unit/utils/test_image_utils.py index c68988dd..f739655a 100644 --- a/tests/unit/utils/test_image_utils.py +++ b/tests/unit/utils/test_image_utils.py @@ -75,10 +75,6 @@ def test_load_image_invalid( img_str = base64.b64encode(img_bytes).decode() load_image(img_str) - def test_load_image_unsupported_type(self) -> None: - with pytest.raises(AttributeError): - load_image(123) # type: ignore - def test_load_image_nonexistent_file(self) -> None: with pytest.raises(ValueError, match="Could not open image from file path"): load_image("nonexistent_file.png") @@ -238,10 +234,6 @@ def test_image_to_base64_format( # Verify the images are different (JPEG 
is lossy) assert png_base64 != jpeg_base64 - def test_image_to_base64_unsupported_type(self) -> None: - with pytest.raises(AttributeError): - image_to_base64(123) # type: ignore - class TestImageScaling: def test_scale_image_with_padding(