askui · adi-wan-askui · Jul 29, 2025 · Jul 25, 2025 · Jul 25, 2025 · Jul 25, 2025
diff --git a/src/askui/agent.py b/src/askui/agent.py
@@ -19,6 +19,9 @@
 from askui.models.shared.tools import Tool
 from askui.tools.computer import Computer20241022Tool, Computer20250124Tool
 from askui.tools.exception_tool import ExceptionTool
+from askui.tools.list_displays_tool import ListDisplaysTool
+from askui.tools.retrieve_active_display_tool import RetrieveActiveDisplayTool
+from askui.tools.set_active_display_tool import SetActiveDisplayTool
 
 from .logger import logger
 from .models import ModelComposition
@@ -30,9 +33,10 @@
 
 _SYSTEM_PROMPT = f"""<SYSTEM_CAPABILITY>
 * You are utilising a {sys.platform} machine using {platform.machine()} architecture with internet access.
+* When you cannot find something (application window, ui element etc.) on the currently selected/active display/screen, check the other available displays by listing them and checking which one is currently active and then going through the other displays one by one until you find it or you have checked all of them.
 * When asked to perform web tasks try to open the browser (firefox, chrome, safari, ...) if not already open. Often you can find the browser icons in the toolbars of the operating systems.
-* When viewing a page it can be helpful to zoom out so that you can see everything on the page.  Either that, or make sure you scroll down to see everything before deciding something isn't available.
-* When using your function calls, they take a while to run and send back to you.  Where possible/feasible, try to chain multiple of these calls all into one function calls request.
+* When viewing a page it can be helpful to zoom out/in so that you can see everything on the page. Either that, or make sure you scroll down/up to see everything before deciding something isn't available.
+* When using your function calls, they take a while to run and send back to you. Where possible/feasible, try to chain multiple of these calls all into one function calls request.
 * The current date and time is {datetime.now(timezone.utc).strftime("%A, %B %d, %Y %H:%M:%S %z")}.
 </SYSTEM_CAPABILITY>
 
@@ -115,6 +119,9 @@ def __init__(
             models=models,
             tools=[
                 ExceptionTool(),
+                SetActiveDisplayTool(agent_os=self.tools.os),
+                RetrieveActiveDisplayTool(agent_os=self.tools.os),
+                ListDisplaysTool(agent_os=self.tools.os),
             ]
             + (act_tools or []),
             agent_os=self.tools.os,

diff --git a/src/askui/locators/serializers.py b/src/askui/locators/serializers.py
@@ -13,9 +13,7 @@
     Prompt,
     Text,
 )
-from .locators import (
-    AiElement as AiElementLocator,
-)
+from .locators import AiElement as AiElementLocator
 from .relatable import (
     BoundingRelation,
     LogicalRelation,

diff --git a/src/askui/models/anthropic/messages_api.py b/src/askui/models/anthropic/messages_api.py
@@ -45,8 +45,8 @@
 from askui.utils.image_utils import (
     ImageSource,
     image_to_base64,
-    scale_coordinates_back,
-    scale_image_with_padding,
+    scale_coordinates,
+    scale_image_to_fit,
 )
 
 from .utils import extract_click_coordinates
@@ -156,10 +156,9 @@ def _inference(
         system: str,
         model_choice: str,
     ) -> str:
-        scaled_image = scale_image_with_padding(
+        scaled_image = scale_image_to_fit(
             image.root,
-            self._settings.resolution[0],
-            self._settings.resolution[1],
+            self._settings.resolution,
         )
         message = self.create_message(
             messages=[
@@ -222,16 +221,12 @@ def locate(
                 ),
                 model_choice=model_choice,
             )
-            scaled_x, scaled_y = extract_click_coordinates(content)
-            x, y = scale_coordinates_back(
-                scaled_x,
-                scaled_y,
-                image.root.width,
-                image.root.height,
-                screen_width,
-                screen_height,
+            return scale_coordinates(
+                extract_click_coordinates(content),
+                image.root.size,
+                self._settings.resolution,
+                inverse=True,
             )
-            return int(x), int(y)
         except (
             _UnexpectedResponseError,
             ValueError,

diff --git a/src/askui/models/shared/tools.py b/src/askui/models/shared/tools.py
@@ -7,6 +7,7 @@
 from pydantic import BaseModel, Field
 from typing_extensions import Self
 
+from askui.logger import logger
 from askui.models.shared.agent_message_param import (
     Base64ImageSourceParam,
     ContentBlockParam,
@@ -155,6 +156,7 @@ def _run_tool(
         except AgentException:
             raise
         except Exception as e:  # noqa: BLE001
+            logger.error(f"Tool {tool_use_block_param.name} failed: {e}", exc_info=True)
             return ToolResultBlockParam(
                 content=f"Tool {tool_use_block_param.name} failed: {e}",
                 is_error=True,

diff --git a/src/askui/tools/agent_os.py b/src/askui/tools/agent_os.py
@@ -2,7 +2,7 @@
 from typing import TYPE_CHECKING, Literal
 
 from PIL import Image
-from pydantic import BaseModel
+from pydantic import BaseModel, ConfigDict, Field
 
 if TYPE_CHECKING:
     from askui.tools.askui.askui_ui_controller_grpc.generated.AgentOS_Send_Request_2501 import (  # noqa: E501
@@ -159,6 +159,26 @@ class Coordinate(BaseModel):
     y: int
 
 
+class DisplaySize(BaseModel):
+    """
+    Represents the size of a display in pixels.
+
+    Used as the type of `Display.size`.
+    """
+
+    width: int  # width of the display in pixels
+    height: int  # height of the display in pixels
+
+
+class Display(BaseModel):
+    """
+    Represents a single display/screen.
+
+    The fields are validated from the aliased input keys `displayID` and
+    `sizeInPixels`; `validate_by_name=True` additionally allows the Python
+    field names `id` and `size` to be used as input keys.
+    """
+
+    model_config = ConfigDict(
+        validate_by_name=True,
+    )
+
+    id: int = Field(validation_alias="displayID")
+    size: DisplaySize = Field(validation_alias="sizeInPixels")
+
+
+class DisplaysListResponse(BaseModel):
+    """
+    Response returned when listing displays.
+
+    `data` holds the listed `Display` objects and is validated from the
+    aliased input key `displays`.
+    """
+
+    data: list[Display] = Field(validation_alias="displays")
+
+
 InputEvent = ClickEvent
 
 
@@ -323,6 +343,22 @@ def keyboard_tap(
         """
         raise NotImplementedError
 
+    def list_displays(self) -> DisplaysListResponse:
+        """
+        List all the available displays.
+
+        Returns:
+            DisplaysListResponse: The available displays.
+
+        Raises:
+            NotImplementedError: Always in this base implementation; the
+                method is not marked abstract here, so a subclass has to
+                override it to provide display listing.
+        """
+        raise NotImplementedError
+
+    @abstractmethod
+    def retrieve_active_display(self) -> Display:
+        """
+        Retrieve the currently active display/screen.
+
+        Returns:
+            Display: The currently active display/screen.
+
+        Raises:
+            NotImplementedError: If this base implementation is invoked.
+        """
+        # NOTE(review): this method is abstract whereas the neighbouring
+        # `list_displays` and `set_display` are not — confirm that every
+        # concrete subclass is meant to be forced to implement it.
+        raise NotImplementedError
+
     def set_display(self, display: int = 1) -> None:
         """
         Sets the active display for screen interactions.

diff --git a/src/askui/tools/android/agent_os_facade.py b/src/askui/tools/android/agent_os_facade.py
@@ -4,7 +4,7 @@
 
 from askui.reporting import Reporter
 from askui.tools.android.agent_os import ANDROID_KEY, AndroidAgentOs, AndroidDisplay
-from askui.utils.image_utils import scale_coordinates_back, scale_image_with_padding
+from askui.utils.image_utils import scale_coordinates, scale_image_to_fit
 
 
 class AndroidAgentOsFacade(AndroidAgentOs):
@@ -32,10 +32,9 @@ def disconnect(self) -> None:
     def screenshot(self) -> Image.Image:
         screenshot = self._agent_os.screenshot()
         self._real_screen_resolution = screenshot.size
-        scaled_image = scale_image_with_padding(
+        scaled_image = scale_image_to_fit(
             screenshot,
-            self._target_resolution[0],
-            self._target_resolution[1],
+            self._target_resolution,
         )
 
         self._reporter.add_message("AndroidAgentOS", "Screenshot taken", screenshot)
@@ -45,15 +44,12 @@ def _scale_coordinates_back(self, x: int, y: int) -> Tuple[int, int]:
         if self._real_screen_resolution is None:
             self._real_screen_resolution = self._agent_os.screenshot().size
 
-        scaled_x, scaled_y = scale_coordinates_back(
-            x,
-            y,
-            self._real_screen_resolution[0],
-            self._real_screen_resolution[1],
-            self._target_resolution[0],
-            self._target_resolution[1],
+        return scale_coordinates(
+            (x, y),
+            self._real_screen_resolution,
+            self._target_resolution,
+            inverse=True,
         )
-        return int(scaled_x), int(scaled_y)
 
     def tap(self, x: int, y: int) -> None:
         scaled_x, scaled_y = self._scale_coordinates_back(x, y)

diff --git a/src/askui/tools/askui/askui_controller.py b/src/askui/tools/askui/askui_controller.py
@@ -7,13 +7,21 @@
 from typing import Literal, Type
 
 import grpc
+from google.protobuf.json_format import MessageToDict
 from PIL import Image
 from typing_extensions import Self, override
 
 from askui.container import telemetry
 from askui.logger import logger
 from askui.reporting import Reporter
-from askui.tools.agent_os import AgentOs, Coordinate, ModifierKey, PcKey
+from askui.tools.agent_os import (
+    AgentOs,
+    Coordinate,
+    Display,
+    DisplaysListResponse,
+    ModifierKey,
+    PcKey,
+)
 from askui.tools.askui.askui_controller_settings import AskUiControllerSettings
 from askui.tools.askui.askui_ui_controller_grpc.generated import (
     Controller_V1_pb2 as controller_v1_pbs,
@@ -626,28 +634,49 @@ def run_command(self, command: str, timeout_ms: int = 30000) -> None:
         )
 
     @telemetry.record_call()
-    def get_display_information(
+    @override
+    def retrieve_active_display(self) -> Display:
+        """
+        Retrieve the currently active display/screen.
+
+        The active display is the entry returned by `list_displays()` whose
+        `id` equals `self._display`.
+
+        Returns:
+            Display: The currently active display/screen.
+
+        Raises:
+            ValueError: If no listed display has the id `self._display`.
+        """
+        self._reporter.add_message("AgentOS", "retrieve_active_display()")
+        displays_list_response = self.list_displays()
+        for display in displays_list_response.data:
+            if display.id == self._display:
+                return display
+        error_msg = f"Display {self._display} not found"
+        raise ValueError(error_msg)
+
+    @telemetry.record_call()
+    @override
+    def list_displays(
         self,
-    ) -> controller_v1_pbs.Response_GetDisplayInformation:
+    ) -> DisplaysListResponse:
         """
-        Get information about all available displays and virtual screen.
+        List all available displays including virtual screens.
 
         Returns:
-            controller_v1_pbs.Response_GetDisplayInformation:
-                - displays: List of DisplayInformation objects
-                - virtualScreenRectangle: Overall virtual screen bounds
+            DisplaysListResponse: The `GetDisplayInformation` response of the
+                controller, converted to a dict and validated as a
+                `DisplaysListResponse`.
+
+        Raises:
+            AssertionError: If the stub is not an initialized
+                `ControllerAPIStub`.
         """
         assert isinstance(self._stub, controller_v1.ControllerAPIStub), (
             "Stub is not initialized"
         )
 
-        self._reporter.add_message("AgentOS", "get_display_information()")
+        self._reporter.add_message("AgentOS", "list_displays()")
 
         response: controller_v1_pbs.Response_GetDisplayInformation = (
             self._stub.GetDisplayInformation(controller_v1_pbs.Request_Void())
         )
 
-        return response
+        # Keep the original proto field names (rather than the lowerCamelCase
+        # JSON names) so that the dict keys line up with the validation
+        # aliases of the pydantic models (presumably `displays`, `displayID`,
+        # `sizeInPixels` — verify against the .proto definition).
+        response_dict = MessageToDict(
+            response,
+            preserving_proto_field_name=True,
+        )
+
+        return DisplaysListResponse.model_validate(response_dict)
 
     @telemetry.record_call()
     def get_process_list(