Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
11 changes: 9 additions & 2 deletions src/askui/agent.py
Original file line number Diff line number Diff line change
Expand Up @@ -19,6 +19,9 @@
from askui.models.shared.tools import Tool
from askui.tools.computer import Computer20241022Tool, Computer20250124Tool
from askui.tools.exception_tool import ExceptionTool
from askui.tools.list_displays_tool import ListDisplaysTool
from askui.tools.retrieve_active_display_tool import RetrieveActiveDisplayTool
from askui.tools.set_active_display_tool import SetActiveDisplayTool

from .logger import logger
from .models import ModelComposition
Expand All @@ -30,9 +33,10 @@

_SYSTEM_PROMPT = f"""<SYSTEM_CAPABILITY>
* You are utilising a {sys.platform} machine using {platform.machine()} architecture with internet access.
* When you cannot find something (application window, ui element etc.) on the currently selected/active display/screen, check the other available displays by listing them and checking which one is currently active and then going through the other displays one by one until you find it or you have checked all of them.
* When asked to perform web tasks try to open the browser (firefox, chrome, safari, ...) if not already open. Often you can find the browser icons in the toolbars of the operating systems.
* When viewing a page it can be helpful to zoom out so that you can see everything on the page. Either that, or make sure you scroll down to see everything before deciding something isn't available.
* When using your function calls, they take a while to run and send back to you. Where possible/feasible, try to chain multiple of these calls all into one function calls request.
* When viewing a page it can be helpful to zoom out/in so that you can see everything on the page. Either that, or make sure you scroll down/up to see everything before deciding something isn't available.
* When using your function calls, they take a while to run and send back to you. Where possible/feasible, try to chain multiple of these calls all into one function calls request.
* The current date and time is {datetime.now(timezone.utc).strftime("%A, %B %d, %Y %H:%M:%S %z")}.
</SYSTEM_CAPABILITY>

Expand Down Expand Up @@ -115,6 +119,9 @@ def __init__(
models=models,
tools=[
ExceptionTool(),
SetActiveDisplayTool(agent_os=self.tools.os),
RetrieveActiveDisplayTool(agent_os=self.tools.os),
ListDisplaysTool(agent_os=self.tools.os),
]
+ (act_tools or []),
agent_os=self.tools.os,
Expand Down
4 changes: 1 addition & 3 deletions src/askui/locators/serializers.py
Original file line number Diff line number Diff line change
Expand Up @@ -13,9 +13,7 @@
Prompt,
Text,
)
from .locators import (
AiElement as AiElementLocator,
)
from .locators import AiElement as AiElementLocator
from .relatable import (
BoundingRelation,
LogicalRelation,
Expand Down
23 changes: 9 additions & 14 deletions src/askui/models/anthropic/messages_api.py
Original file line number Diff line number Diff line change
Expand Up @@ -45,8 +45,8 @@
from askui.utils.image_utils import (
ImageSource,
image_to_base64,
scale_coordinates_back,
scale_image_with_padding,
scale_coordinates,
scale_image_to_fit,
)

from .utils import extract_click_coordinates
Expand Down Expand Up @@ -156,10 +156,9 @@ def _inference(
system: str,
model_choice: str,
) -> str:
scaled_image = scale_image_with_padding(
scaled_image = scale_image_to_fit(
image.root,
self._settings.resolution[0],
self._settings.resolution[1],
self._settings.resolution,
)
message = self.create_message(
messages=[
Expand Down Expand Up @@ -222,16 +221,12 @@ def locate(
),
model_choice=model_choice,
)
scaled_x, scaled_y = extract_click_coordinates(content)
x, y = scale_coordinates_back(
scaled_x,
scaled_y,
image.root.width,
image.root.height,
screen_width,
screen_height,
return scale_coordinates(
extract_click_coordinates(content),
image.root.size,
self._settings.resolution,
inverse=True,
)
return int(x), int(y)
except (
_UnexpectedResponseError,
ValueError,
Expand Down
2 changes: 2 additions & 0 deletions src/askui/models/shared/tools.py
Original file line number Diff line number Diff line change
Expand Up @@ -7,6 +7,7 @@
from pydantic import BaseModel, Field
from typing_extensions import Self

from askui.logger import logger
from askui.models.shared.agent_message_param import (
Base64ImageSourceParam,
ContentBlockParam,
Expand Down Expand Up @@ -155,6 +156,7 @@ def _run_tool(
except AgentException:
raise
except Exception as e: # noqa: BLE001
logger.error(f"Tool {tool_use_block_param.name} failed: {e}", exc_info=True)
return ToolResultBlockParam(
content=f"Tool {tool_use_block_param.name} failed: {e}",
is_error=True,
Expand Down
38 changes: 37 additions & 1 deletion src/askui/tools/agent_os.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,7 +2,7 @@
from typing import TYPE_CHECKING, Literal

from PIL import Image
from pydantic import BaseModel
from pydantic import BaseModel, ConfigDict, Field

if TYPE_CHECKING:
from askui.tools.askui.askui_ui_controller_grpc.generated.AgentOS_Send_Request_2501 import ( # noqa: E501
Expand Down Expand Up @@ -159,6 +159,26 @@ class Coordinate(BaseModel):
y: int


class DisplaySize(BaseModel):
    """Represents the size of a display in pixels."""

    width: int  # horizontal extent, in pixels
    height: int  # vertical extent, in pixels


class Display(BaseModel):
    """A single display, described by an integer id and its size in pixels.

    Input data is validated from the keys ``displayID`` and ``sizeInPixels``
    (the validation aliases declared below). With ``validate_by_name=True``
    the Python field names ``id`` and ``size`` are accepted as well.
    """

    model_config = ConfigDict(
        validate_by_name=True,
    )

    # Populated from the "displayID" key of the validated input.
    id: int = Field(validation_alias="displayID")
    # Populated from the "sizeInPixels" key of the validated input.
    size: DisplaySize = Field(validation_alias="sizeInPixels")


class DisplaysListResponse(BaseModel):
    """Container for a list of displays, exposed as ``data``.

    The list is validated from the input key ``displays``.
    """

    # NOTE(review): unlike Display, no ``validate_by_name`` is configured here,
    # so presumably only the "displays" key (not "data") is accepted on
    # validation - confirm this asymmetry is intended.
    data: list[Display] = Field(validation_alias="displays")


InputEvent = ClickEvent


Expand Down Expand Up @@ -323,6 +343,22 @@ def keyboard_tap(
"""
raise NotImplementedError

def list_displays(self) -> DisplaysListResponse:
    """
    List all the available displays.

    Returns:
        DisplaysListResponse: The available displays.

    Raises:
        NotImplementedError: Always; this implementation is only a stub.
    """
    raise NotImplementedError

# NOTE(review): this method is declared abstract while list_displays above is
# not - confirm that every concrete subclass implements it.
@abstractmethod
def retrieve_active_display(self) -> Display:
    """
    Retrieve the currently active display/screen.

    Returns:
        Display: The currently active display/screen.

    Raises:
        NotImplementedError: Always; this implementation is only a stub.
    """
    raise NotImplementedError

def set_display(self, display: int = 1) -> None:
"""
Sets the active display for screen interactions.
Expand Down
20 changes: 8 additions & 12 deletions src/askui/tools/android/agent_os_facade.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,7 +4,7 @@

from askui.reporting import Reporter
from askui.tools.android.agent_os import ANDROID_KEY, AndroidAgentOs, AndroidDisplay
from askui.utils.image_utils import scale_coordinates_back, scale_image_with_padding
from askui.utils.image_utils import scale_coordinates, scale_image_to_fit


class AndroidAgentOsFacade(AndroidAgentOs):
Expand Down Expand Up @@ -32,10 +32,9 @@ def disconnect(self) -> None:
def screenshot(self) -> Image.Image:
screenshot = self._agent_os.screenshot()
self._real_screen_resolution = screenshot.size
scaled_image = scale_image_with_padding(
scaled_image = scale_image_to_fit(
screenshot,
self._target_resolution[0],
self._target_resolution[1],
self._target_resolution,
)

self._reporter.add_message("AndroidAgentOS", "Screenshot taken", screenshot)
Expand All @@ -45,15 +44,12 @@ def _scale_coordinates_back(self, x: int, y: int) -> Tuple[int, int]:
if self._real_screen_resolution is None:
self._real_screen_resolution = self._agent_os.screenshot().size

scaled_x, scaled_y = scale_coordinates_back(
x,
y,
self._real_screen_resolution[0],
self._real_screen_resolution[1],
self._target_resolution[0],
self._target_resolution[1],
return scale_coordinates(
(x, y),
self._real_screen_resolution,
self._target_resolution,
inverse=True,
)
return int(scaled_x), int(scaled_y)

def tap(self, x: int, y: int) -> None:
scaled_x, scaled_y = self._scale_coordinates_back(x, y)
Expand Down
47 changes: 38 additions & 9 deletions src/askui/tools/askui/askui_controller.py
Original file line number Diff line number Diff line change
Expand Up @@ -7,13 +7,21 @@
from typing import Literal, Type

import grpc
from google.protobuf.json_format import MessageToDict
from PIL import Image
from typing_extensions import Self, override

from askui.container import telemetry
from askui.logger import logger
from askui.reporting import Reporter
from askui.tools.agent_os import AgentOs, Coordinate, ModifierKey, PcKey
from askui.tools.agent_os import (
AgentOs,
Coordinate,
Display,
DisplaysListResponse,
ModifierKey,
PcKey,
)
from askui.tools.askui.askui_controller_settings import AskUiControllerSettings
from askui.tools.askui.askui_ui_controller_grpc.generated import (
Controller_V1_pb2 as controller_v1_pbs,
Expand Down Expand Up @@ -626,28 +634,49 @@ def run_command(self, command: str, timeout_ms: int = 30000) -> None:
)

@telemetry.record_call()
def get_display_information(
@override
def retrieve_active_display(self) -> Display:
    """
    Look up the display whose id matches ``self._display``.

    The lookup goes through ``self.list_displays()`` and returns the first
    entry with a matching id.

    Returns:
        Display: The currently active display/screen.

    Raises:
        ValueError: If none of the listed displays has the id
            ``self._display``.
    """
    self._reporter.add_message("AgentOS", "retrieve_active_display()")
    active = next(
        (
            candidate
            for candidate in self.list_displays().data
            if candidate.id == self._display
        ),
        None,
    )
    if active is None:
        error_msg = f"Display {self._display} not found"
        raise ValueError(error_msg)
    return active

@telemetry.record_call()
@override
def list_displays(
    self,
) -> DisplaysListResponse:
    """
    List all available displays including virtual screens.

    Calls the controller's ``GetDisplayInformation`` RPC, converts the
    protobuf response into a dict (keeping the proto field names) and
    validates that dict into a ``DisplaysListResponse``.

    Returns:
        DisplaysListResponse: The displays reported by the controller.

    Raises:
        AssertionError: If the gRPC stub is not initialized.
    """
    assert isinstance(self._stub, controller_v1.ControllerAPIStub), (
        "Stub is not initialized"
    )

    self._reporter.add_message("AgentOS", "list_displays()")

    response: controller_v1_pbs.Response_GetDisplayInformation = (
        self._stub.GetDisplayInformation(controller_v1_pbs.Request_Void())
    )

    # Proto field names are preserved so the keys line up with the validation
    # aliases declared on the response models.
    # NOTE(review): MessageToDict leaves out default-valued fields unless asked
    # otherwise, so an empty display list would presumably arrive without a
    # "displays" key and fail validation - confirm against the controller.
    response_dict = MessageToDict(
        response,
        preserving_proto_field_name=True,
    )

    return DisplaysListResponse.model_validate(response_dict)

@telemetry.record_call()
def get_process_list(
Expand Down
Loading