Skip to content
Closed
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
13 changes: 9 additions & 4 deletions src/askui/tools/anthropic/computer.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,7 +4,7 @@

from .base import BaseAnthropicTool, ToolError, ToolResult

from ..utils import image_to_base64, scale_image_with_padding, scale_coordinates_back
from ..utils import image_to_base64, scale_coordinates_forward, scale_image_with_padding, scale_coordinates_back


Action = Literal[
Expand Down Expand Up @@ -180,8 +180,13 @@ def __call__(
if action == "screenshot":
return self.screenshot()
elif action == "cursor_position":
# TODO: Implement in the future
return ToolError("cursor_position is not implemented by this agent")
if self.real_screen_height is None or self.real_screen_width is None:
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Shouldn't we also check if the display (index) changed?

screenshot = self.controller_client.screenshot(report=False)
self.real_screen_width = screenshot.width
self.real_screen_height = screenshot.height
mouse_x, mouse_y = self.controller_client.get_cursor_position()
scaled_x, scaled_y = scale_coordinates_forward(mouse_x, mouse_y, self.real_screen_width, self.real_screen_height, self.width, self.height)
return ToolResult(output=f"Cursor position: x={scaled_x}, y={scaled_y}")
Comment on lines +183 to +189
Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

FYI:

All translations between coordinate systems will be handled inside the agent OS in the future.

elif action == "left_click":
self.controller_client.click("left")
return ToolResult()
Expand All @@ -202,6 +207,6 @@ def screenshot(self):
screenshot = self.controller_client.screenshot()
self.real_screen_width = screenshot.width
self.real_screen_height = screenshot.height
scaled_screenshot = scale_image_with_padding(screenshot, 1280, 800)
scaled_screenshot = scale_image_with_padding(screenshot, self.width, self.height)
base64_image = image_to_base64(scaled_screenshot)
return ToolResult(base64_image=base64_image)
30 changes: 30 additions & 0 deletions src/askui/tools/utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -93,3 +93,33 @@ def scale_coordinates_back(x, y, original_width, original_height, max_width, max
original_x = adjusted_x / scale_factor
original_y = adjusted_y / scale_factor
return original_x, original_y

def scale_coordinates_forward(
    x: float,
    y: float,
    original_width: int,
    original_height: int,
    max_width: int,
    max_height: int,
) -> tuple[int, int]:
    """Map a point from the original image into the scaled, padded image.

    The original image is scaled by a single factor (so its aspect ratio is
    kept) until it fits inside ``max_width`` x ``max_height``; the leftover
    space is split evenly as padding on both sides of the shorter axis. The
    given point is scaled by the same factor and shifted by the left/top
    padding.

    Args:
        x: X coordinate in the original image.
        y: Y coordinate in the original image.
        original_width: Width of the original image.
        original_height: Height of the original image.
        max_width: Width of the scaled (padded) image.
        max_height: Height of the scaled (padded) image.

    Returns:
        The ``(x, y)`` coordinates in the scaled, padded image, truncated
        to integers.

    Raises:
        ValueError: If any of the four dimensions is zero or negative.
    """
    # Fail with a descriptive error instead of an opaque ZeroDivisionError
    # (or silently meaningless coordinates for negative sizes).
    if min(original_width, original_height, max_width, max_height) <= 0:
        raise ValueError(
            "Image dimensions must be positive, got "
            f"original={original_width}x{original_height}, "
            f"max={max_width}x{max_height}"
        )

    aspect_ratio = original_width / original_height
    if (max_width / max_height) > aspect_ratio:
        # Target is relatively wider than the source: height is the limiting
        # dimension, padding goes left and right.
        scale_factor = max_height / original_height
        scaled_width = int(original_width * scale_factor)
        scaled_height = max_height
    else:
        # Width is the limiting dimension, padding goes top and bottom.
        scale_factor = max_width / original_width
        scaled_width = max_width
        scaled_height = int(original_height * scale_factor)

    # Padding is centred; integer division puts any odd pixel on the
    # right/bottom side.
    pad_left = (max_width - scaled_width) // 2
    pad_top = (max_height - scaled_height) // 2

    return int(x * scale_factor) + pad_left, int(y * scale_factor) + pad_top