Skip to content
3 changes: 3 additions & 0 deletions src/askui/agent.py
Original file line number Diff line number Diff line change
Expand Up @@ -19,6 +19,7 @@
from askui.models.shared.tools import Tool
from askui.tools.computer import Computer20241022Tool, Computer20250124Tool
from askui.tools.exception_tool import ExceptionTool
from askui.tools.screen_switch_tool import ScreenSwitchTool

from .logger import logger
from .models import ModelComposition
Expand Down Expand Up @@ -401,6 +402,8 @@ def _get_default_settings_for_act(self, model_choice: str) -> ActSettings:

@override
def _get_default_tools_for_act(self, model_choice: str) -> list[Tool]:
self._tools.append(ScreenSwitchTool(agent_os=self.tools.os))
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

WDYT about including it right next to the ExceptionTool in the constructor? Seems easier and more consistent.

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Aren't we missing the tool for listing all available displays?


match model_choice:
case ModelName.ANTHROPIC__CLAUDE__3_5__SONNET__20241022:
return self._tools + [Computer20241022Tool(agent_os=self.tools.os)]
Expand Down
34 changes: 33 additions & 1 deletion src/askui/tools/agent_os.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,7 +2,7 @@
from typing import TYPE_CHECKING, Literal

from PIL import Image
from pydantic import BaseModel
from pydantic import BaseModel, Field

if TYPE_CHECKING:
from askui.tools.askui.askui_ui_controller_grpc.generated.AgentOS_Send_Request_2501 import ( # noqa: E501
Expand Down Expand Up @@ -154,6 +154,26 @@ class ClickEvent(BaseModel):
timestamp: float


class SizeInPixels(BaseModel):
    """Represents the size of a display in pixels."""

    # Horizontal extent, in pixels.
    width: int
    # Vertical extent, in pixels.
    height: int


class DisplayInformation(BaseModel):
    """
    Contains information about a single display.

    Both fields declare a validation alias, so input data is expected to carry
    the keys `displayID` and `sizeInPixels` rather than the snake_case
    attribute names.
    """

    # Identifier of the display; validated from the `displayID` key.
    display_id: int = Field(validation_alias="displayID")
    # Pixel dimensions of the display; validated from the `sizeInPixels` key.
    size_in_pixels: SizeInPixels = Field(validation_alias="sizeInPixels")


class GetDisplayInformationResponse(BaseModel):
    """Response model for display information requests."""

    # One `DisplayInformation` entry per display contained in the response.
    displays: list[DisplayInformation]


class Coordinate(BaseModel):
x: int
y: int
Expand Down Expand Up @@ -333,6 +353,18 @@ def set_display(self, display: int = 1) -> None:
"""
raise NotImplementedError

def get_display_information(self) -> GetDisplayInformationResponse:
    """
    Return information about the available displays.

    This is a placeholder: it performs no work and is meant to be overridden
    by implementations that can actually query displays.

    Returns:
        GetDisplayInformationResponse: The displays reported by the
            implementation.

    Raises:
        NotImplementedError: Always, unless the method is overridden.
    """
    raise NotImplementedError

def get_active_display(self) -> int:
    """
    Return the identifier of the currently active display.

    This is a placeholder: it performs no work and is meant to be overridden
    by implementations that track the active display.

    Returns:
        int: The active display.

    Raises:
        NotImplementedError: Always, unless the method is overridden.
    """
    raise NotImplementedError

def run_command(self, command: str, timeout_ms: int = 30000) -> None:
"""
Executes a shell command.
Expand Down
61 changes: 24 additions & 37 deletions src/askui/tools/askui/askui_controller.py
Original file line number Diff line number Diff line change
Expand Up @@ -7,6 +7,7 @@
from typing import Literal, Type

import grpc
from google.protobuf.json_format import MessageToDict
from PIL import Image
from pydantic import BaseModel, Field, model_validator
from pydantic_settings import BaseSettings, SettingsConfigDict
Expand All @@ -15,7 +16,13 @@
from askui.container import telemetry
from askui.logger import logger
from askui.reporting import Reporter
from askui.tools.agent_os import AgentOs, Coordinate, ModifierKey, PcKey
from askui.tools.agent_os import (
AgentOs,
Coordinate,
GetDisplayInformationResponse,
ModifierKey,
PcKey,
)
from askui.tools.askui.askui_ui_controller_grpc.generated import (
Controller_V1_pb2 as controller_v1_pbs,
)
Expand Down Expand Up @@ -723,6 +730,14 @@ def set_display(self, display: int = 1) -> None:
)
self._display = display

@telemetry.record_call()
@override
def get_active_display(self) -> int:
    """
    Return the display this client currently treats as active.

    The value is read from the locally stored `_display` attribute; this
    method itself sends no request to the controller.

    Returns:
        int: The stored active display.
    """
    active_display: int = self._display
    return active_display
Comment on lines +733 to +739
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Well done :) I think we should either include this info in the display information returned by the listing tool / the GetDisplayInformationResponse or add an extra tool for this.

I think it would be great if this was included in the controller api as I think they maintain the source of truth about which display is currently active (see https://askyourui.slack.com/archives/C091TSQ6KP0/p1753431836333729).


@telemetry.record_call(exclude={"command"})
@override
def run_command(self, command: str, timeout_ms: int = 30000) -> None:
Expand All @@ -747,14 +762,13 @@ def run_command(self, command: str, timeout_ms: int = 30000) -> None:
@telemetry.record_call()
def get_display_information(
self,
) -> controller_v1_pbs.Response_GetDisplayInformation:
) -> GetDisplayInformationResponse:
"""
Get information about all available displays and virtual screen.

Returns:
controller_v1_pbs.Response_GetDisplayInformation:
- displays: List of DisplayInformation objects
- virtualScreenRectangle: Overall virtual screen bounds
GetDisplayInformationResponse: A Pydantic model containing information
about all available displays and the virtual screen.
"""
assert isinstance(self._stub, controller_v1.ControllerAPIStub), (
"Stub is not initialized"
Expand All @@ -765,21 +779,22 @@ def get_display_information(
response: controller_v1_pbs.Response_GetDisplayInformation = (
self._stub.GetDisplayInformation(controller_v1_pbs.Request_Void())
)

return response
response_dict = MessageToDict(
response,
preserving_proto_field_name=True,
)
return GetDisplayInformationResponse.model_validate(response_dict)

@telemetry.record_call()
def get_process_list(
self, get_extended_info: bool = False
) -> controller_v1_pbs.Response_GetProcessList:
"""
Get a list of running processes.

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Why remove the empty lines across all the docstrings? 😆

Args:
get_extended_info (bool, optional): Whether to include
extended process information.
Defaults to `False`.

Returns:
controller_v1_pbs.Response_GetProcessList: Process list response containing:
- processes: List of ProcessInfo objects
Expand All @@ -802,10 +817,8 @@ def get_window_list(
) -> controller_v1_pbs.Response_GetWindowList:
"""
Get a list of windows for a specific process.

Args:
process_id (int): The ID of the process to get windows for.

Returns:
controller_v1_pbs.Response_GetWindowList: Window list response containing:
- windows: List of WindowInfo objects with ID and name
Expand All @@ -828,7 +841,6 @@ def get_automation_target_list(
) -> controller_v1_pbs.Response_GetAutomationTargetList:
"""
Get a list of available automation targets.

Returns:
controller_v1_pbs.Response_GetAutomationTargetList:
Automation target list response:
Expand All @@ -850,7 +862,6 @@ def get_automation_target_list(
def set_mouse_delay(self, delay_ms: int) -> None:
"""
Configure mouse action delay.

Args:
delay_ms (int): The delay in milliseconds to set for mouse actions.
"""
Expand All @@ -870,7 +881,6 @@ def set_mouse_delay(self, delay_ms: int) -> None:
def set_keyboard_delay(self, delay_ms: int) -> None:
"""
Configure keyboard action delay.

Args:
delay_ms (int): The delay in milliseconds to set for keyboard actions.
"""
Expand All @@ -890,7 +900,6 @@ def set_keyboard_delay(self, delay_ms: int) -> None:
def set_active_window(self, process_id: int, window_id: int) -> None:
"""
Set the active window for automation.

Args:
process_id (int): The ID of the process that owns the window.
window_id (int): The ID of the window to set as active.
Expand All @@ -913,7 +922,6 @@ def set_active_window(self, process_id: int, window_id: int) -> None:
def set_active_automation_target(self, target_id: int) -> None:
"""
Set the active automation target.

Args:
target_id (int): The ID of the automation target to set as active.
"""
Expand All @@ -937,13 +945,11 @@ def schedule_batched_action(
) -> controller_v1_pbs.Response_ScheduleBatchedAction:
"""
Schedule an action for batch execution.

Args:
action_class_id (controller_v1_pbs.ActionClassID): The class ID
of the action to schedule.
action_parameters (controller_v1_pbs.ActionParameters):
Parameters for the action.

Returns:
controller_v1_pbs.Response_ScheduleBatchedAction: Response containing
the scheduled action ID.
Expand Down Expand Up @@ -1003,7 +1009,6 @@ def stop_batch_run(self) -> None:
def get_action_count(self) -> controller_v1_pbs.Response_GetActionCount:
"""
Get the count of recorded or batched actions.

Returns:
controller_v1_pbs.Response_GetActionCount: Response
containing the action count.
Expand All @@ -1024,10 +1029,8 @@ def get_action_count(self) -> controller_v1_pbs.Response_GetActionCount:
def get_action(self, action_index: int) -> controller_v1_pbs.Response_GetAction:
"""
Get a specific action by its index.

Args:
action_index (int): The index of the action to retrieve.

Returns:
controller_v1_pbs.Response_GetAction: Action information containing:
- actionID: The action ID
Expand All @@ -1052,7 +1055,6 @@ def get_action(self, action_index: int) -> controller_v1_pbs.Response_GetAction:
def remove_action(self, action_id: int) -> None:
"""
Remove a specific action by its ID.

Args:
action_id (int): The ID of the action to remove.
"""
Expand Down Expand Up @@ -1086,10 +1088,8 @@ def remove_all_actions(self) -> None:
def _send_message(self, message: str) -> controller_v1_pbs.Response_Send:
"""
Send a general message to the controller.

Args:
message (str): The message to send to the controller.

Returns:
controller_v1_pbs.Response_Send: Response containing
the message from the controller.
Expand All @@ -1110,7 +1110,6 @@ def _send_message(self, message: str) -> controller_v1_pbs.Response_Send:
def get_mouse_position(self) -> Coordinate:
"""
Get the mouse cursor position

Returns:
Coordinate: The current position of the mouse cursor.
"""
Expand All @@ -1132,7 +1131,6 @@ def get_mouse_position(self) -> Coordinate:
def set_mouse_position(self, x: int, y: int) -> None:
"""
Set the mouse cursor position to specific coordinates.

Args:
x (int): The horizontal coordinate (in pixels) to set the cursor to.
y (int): The vertical coordinate (in pixels) to set the cursor to.
Expand All @@ -1150,10 +1148,8 @@ def set_mouse_position(self, x: int, y: int) -> None:
def render_quad(self, style: RenderObjectStyle) -> int:
"""
Render a quad object to the display.

Args:
style (RenderObjectStyle): The style properties for the quad.

Returns:
int: Object ID.
"""
Expand All @@ -1174,11 +1170,9 @@ def render_quad(self, style: RenderObjectStyle) -> int:
def render_line(self, style: RenderObjectStyle, points: list[Coordinate]) -> int:
"""
Render a line object to the display.

Args:
style (RenderObjectStyle): The style properties for the line.
points (list[Coordinate]): The points defining the line.

Returns:
int: Object ID.
"""
Expand All @@ -1199,11 +1193,9 @@ def render_line(self, style: RenderObjectStyle, points: list[Coordinate]) -> int
def render_image(self, style: RenderObjectStyle, image_data: str) -> int:
"""
Render an image object to the display.

Args:
style (RenderObjectStyle): The style properties for the image.
image_data (str): The base64-encoded image data.

Returns:
int: Object ID.
"""
Expand All @@ -1225,11 +1217,9 @@ def render_image(self, style: RenderObjectStyle, image_data: str) -> int:
def render_text(self, style: RenderObjectStyle, content: str) -> int:
"""
Render a text object to the display.

Args:
style (RenderObjectStyle): The style properties for the text.
content (str): The text content to display.

Returns:
int: Object ID.
"""
Expand All @@ -1251,11 +1241,9 @@ def render_text(self, style: RenderObjectStyle, content: str) -> int:
def update_render_object(self, object_id: int, style: RenderObjectStyle) -> None:
"""
Update styling properties of an existing render object.

Args:
object_id (int): The ID of the render object to update.
style (RenderObjectStyle): The new style properties.

Returns:
int: Object ID.
"""
Expand All @@ -1274,7 +1262,6 @@ def update_render_object(self, object_id: int, style: RenderObjectStyle) -> None
def delete_render_object(self, object_id: int) -> None:
"""
Delete an existing render object from the display.

Args:
object_id (RenderObjectId): The ID of the render object to delete.
"""
Expand Down
45 changes: 45 additions & 0 deletions src/askui/tools/screen_switch_tool.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,45 @@
from askui.models.shared.tools import Tool
from askui.tools.agent_os import AgentOs, DisplayInformation


class ScreenSwitchTool(Tool):
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Can we

  • call the file display_tools.py
  • add the tool for listing displays
  • rename the tool here to SetDisplayTool

so that we are more consistent with our naming across the code base?

"""
Tool to change the screen.
"""

def __init__(self, agent_os: AgentOs) -> None:
    """
    Build the screen switch tool for the given agent OS.

    The displays are queried once, here, and their count is embedded in the
    tool description so the agent knows how many screens it can cycle through.

    Args:
        agent_os (AgentOs): The agent OS used to list and switch displays.
    """
    available_displays: list[DisplayInformation] = (
        agent_os.get_display_information().displays
    )
    display_count: int = len(available_displays)

    super().__init__(
        name="screen_switch",
        description=f"""
This tool is useful for switching between multiple displays to find
information not present on the current active screen.
If more than one display is available, this tool cycles through them.
Number of displays available: {display_count}.
""",
    )
    self._agent_os: AgentOs = agent_os
    self._displays: list[DisplayInformation] = available_displays
Comment on lines +11 to +26
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Decent idea of listing the displays here and including it in the tool description, although that does not match what is specified in the issue. The problem I see with this is that it is too static, as (the number of) displays may change throughout the execution. So I think going with an extra tool for listing displays is the better approach. WDYT?


def __call__(self) -> None:
    """
    Cycles to the next display if there are multiple displays.
    This tool is useful to switch between multiple displays if some information is
    not found on the current display.

    With zero or one known display the call does nothing. Otherwise the display
    following the active one (in the order captured at construction) is selected,
    wrapping around from the last display to the first. If the active display is
    not among the known displays, the first known display is selected.
    """
    # NOTE(review): a reviewer suggested that this tool instead take the display
    # id as a parameter and forward it directly to `set_display`, so a specific
    # display can be selected in a single call without any cycling logic.
    display_count: int = len(self._displays)
    if display_count <= 1:
        return

    active_display_id: int = self._agent_os.get_active_display()

    # The default of -1 makes an unknown active display resolve to index 0 below.
    # Without a default, next() raises StopIteration when no display matches.
    current_display_index: int = next(
        (
            index
            for index, display in enumerate(self._displays)
            if display.display_id == active_display_id
        ),
        -1,
    )
    # If current_display_index is the last index, wrap around to the first index.
    next_index: int = (current_display_index + 1) % display_count

    self._agent_os.set_display(self._displays[next_index].display_id)