From 0ec01b37dde5fcbadd1a58c110698cbc3d046d3d Mon Sep 17 00:00:00 2001 From: Samir mlika <105347215+mlikasam-askui@users.noreply.github.com> Date: Fri, 20 Jun 2025 13:24:09 +0200 Subject: [PATCH 01/12] refactor android --- src/askui/android_agent.py | 22 ++++++- src/askui/models/model_router.py | 10 ++- src/askui/models/models.py | 19 ++++++ src/askui/models/shared/android_agent.py | 1 + src/askui/models/shared/base_agent.py | 20 +++++- src/askui/models/shared/facade.py | 11 ++++ src/askui/models/shared/tools.py | 10 +++ ...gent_os_handler.py => agent_os_handler.py} | 4 +- src/askui/tools/android/ppadb_agent_os.py | 4 +- src/askui/tools/android/tools.py | 52 ++++++++-------- src/askui/tools/computer.py | 61 ++----------------- 11 files changed, 122 insertions(+), 92 deletions(-) rename src/askui/tools/android/{ppadb_agent_os_handler.py => agent_os_handler.py} (98%) diff --git a/src/askui/android_agent.py b/src/askui/android_agent.py index 48a369a3..be096009 100644 --- a/src/askui/android_agent.py +++ b/src/askui/android_agent.py @@ -10,10 +10,10 @@ from askui.locators.locators import Locator from askui.models.shared.computer_agent_cb_param import OnMessageCb from askui.models.shared.computer_agent_message_param import MessageParam -from askui.models.shared.tools import ToolCollection +from askui.models.shared.tools import Tool, ToolCollection from askui.tools.android.agent_os import ANDROID_KEY +from askui.tools.android.agent_os_handler import AndroidAgentOSHandler from askui.tools.android.ppadb_agent_os import PpadbAgentOs -from askui.tools.android.ppadb_agent_os_handler import PpadbAgentOSHandler from askui.tools.android.tools import ( AndroidDragAndDropTool, AndroidKeyCombinationTool, @@ -60,7 +60,7 @@ def __init__( configure_logging(level=log_level) self.os = PpadbAgentOs() self._reporter = CompositeReporter(reporters=reporters) - android_os_handler = PpadbAgentOSHandler(self.os, self._reporter) + android_os_handler = AndroidAgentOSHandler(self.os, self._reporter) _models = initialize_default_android_model_registry( tool_collection=ToolCollection( tools=[ @@ -604,6 +604,22 @@ def act( ) self._model_router.act(messages, model or self._model_choice["act"], on_message) + def set_act_model_tools(self, tools: list[Tool], model: str | None = None) -> None: + """ + Sets the tools for the act model. + """ + self._model_router.set_act_model_tools( + model or self._model_choice["act"], tools + ) + + def add_tool_to_act_model(self, tool: Tool, model: str | None = None) -> None: + """ + Adds a tool to the act model. + """ + self._model_router.add_tool_to_act_model( + model or self._model_choice["act"], tool + ) + @telemetry.record_call(flush=True) def close(self) -> None: """Disconnects from the Android device.""" diff --git a/src/askui/models/model_router.py b/src/askui/models/model_router.py index affee49a..e1f8bb72 100644 --- a/src/askui/models/model_router.py +++ b/src/askui/models/model_router.py @@ -34,7 +34,7 @@ from askui.models.shared.computer_agent_cb_param import OnMessageCb from askui.models.shared.computer_agent_message_param import MessageParam from askui.models.shared.facade import ModelFacade -from askui.models.shared.tools import ToolCollection +from askui.models.shared.tools import Tool, ToolCollection from askui.models.types.response_schemas import ResponseSchema from askui.reporting import CompositeReporter, Reporter from askui.utils.image_utils import ImageSource @@ -259,3 +259,11 @@ def locate( m = self._get_model(_model_choice, "locate") logger.debug(f"Routing locate prediction to {_model_choice}") return m.locate(locator, screenshot, _model_composition or _model_choice) + + def set_act_model_tools(self, model_choice: str, tools: list[Tool]) -> None: + act_model = self._get_model(model_choice, "act") + act_model.set_tools(tools) + + def add_tool_to_act_model(self, model_choice: str, tool: Tool) -> None: + act_model = self._get_model(model_choice, "act") + act_model.add_tool(tool) diff --git a/src/askui/models/models.py b/src/askui/models/models.py index 1c612d8c..1dd6ddad 100644 --- a/src/askui/models/models.py +++ b/src/askui/models/models.py @@ -10,6 +10,7 @@ from askui.locators.locators import Locator from askui.models.shared.computer_agent_cb_param import OnMessageCb from askui.models.shared.computer_agent_message_param import MessageParam +from askui.models.shared.tools import Tool from askui.models.types.response_schemas import ResponseSchema from askui.utils.image_utils import ImageSource @@ -219,6 +220,24 @@ def act( """ # noqa: E501 raise NotImplementedError + @abc.abstractmethod + def set_tools(self, tools: list[Tool]) -> None: + """Set the tools that the model can use. + + Args: + tools (list[Tool]): The tools that the model can use. + """ + raise NotImplementedError + + @abc.abstractmethod + def add_tool(self, tool: Tool) -> None: + """Add a tool to the model. + + Args: + tool (Tool): The tool to add. + """ + raise NotImplementedError + class GetModel(abc.ABC): """Abstract base class for models that can extract information from images. diff --git a/src/askui/models/shared/android_agent.py b/src/askui/models/shared/android_agent.py index 2f5dfdb5..6d856453 100644 --- a/src/askui/models/shared/android_agent.py +++ b/src/askui/models/shared/android_agent.py @@ -8,6 +8,7 @@ * Autonomy: Operate independently and make informed decisions without requiring user input. +* Never ask for other tasks to be done, only do the task you are given. * Reliability: Ensure actions are repeatable and maintain system stability. * Efficiency: Optimize operations to minimize latency and resource usage. * Safety: Always verify actions before execution, even with full system access. diff --git a/src/askui/models/shared/base_agent.py b/src/askui/models/shared/base_agent.py index e7043bb3..b8267d43 100644 --- a/src/askui/models/shared/base_agent.py +++ b/src/askui/models/shared/base_agent.py @@ -12,7 +12,7 @@ MessageParam, TextBlockParam, ) -from askui.models.shared.tools import ToolCollection +from askui.models.shared.tools import Tool, ToolCollection from askui.reporting import Reporter from ...logger import logger @@ -60,6 +60,24 @@ def __init__( text=system_prompt, ) + @override + def set_tools(self, tools: list[Tool]) -> None: + """Set the tools that the model can use. + + Args: + tools (list[Tool]): The tools that the model can use. + """ + self._tool_collection.set_tools(tools) + + @override + def add_tool(self, tool: Tool) -> None: + """Add a tool to the model. + + Args: + tool (Tool): The tool to add. + """ + self._tool_collection.add_tool(tool) + @abstractmethod def _create_message( self, messages: list[MessageParam], model_choice: str diff --git a/src/askui/models/shared/facade.py b/src/askui/models/shared/facade.py index e4ac7a0f..a3068b3d 100644 --- a/src/askui/models/shared/facade.py +++ b/src/askui/models/shared/facade.py @@ -6,6 +6,7 @@ from askui.models.models import ActModel, GetModel, LocateModel, ModelComposition, Point from askui.models.shared.computer_agent_cb_param import OnMessageCb from askui.models.shared.computer_agent_message_param import MessageParam +from askui.models.shared.tools import Tool from askui.models.types.response_schemas import ResponseSchema from askui.utils.image_utils import ImageSource @@ -34,6 +35,16 @@ def act( on_message=on_message, ) + @override + def set_tools(self, tools: list[Tool]) -> None: + """Set the tools for the act model.""" + self._act_model.set_tools(tools) + + @override + def add_tool(self, tool: Tool) -> None: + """Add a tool to the act model.""" + self._act_model.add_tool(tool) + @override def get( self, diff --git a/src/askui/models/shared/tools.py b/src/askui/models/shared/tools.py index 9bffd1fc..17e13325 100644 --- a/src/askui/models/shared/tools.py +++ b/src/askui/models/shared/tools.py @@ -110,6 +110,16 @@ def to_params( ) -> list[BetaToolUnionParam]: return [tool.to_params() for tool in self._tools] + def add_tool(self, tool: Tool) -> None: + """Add a tool to the collection.""" + self._tools.append(tool) + self._tool_map[tool.to_params()["name"]] = tool + + def set_tools(self, tools: list[Tool]) -> None: + """Set the tools in the collection.""" + self._tools = tools + self._tool_map = {tool.to_params()["name"]: tool for tool in tools} + def run( self, tool_use_block_params: list[ToolUseBlockParam] ) -> list[ContentBlockParam]: diff --git a/src/askui/tools/android/ppadb_agent_os_handler.py b/src/askui/tools/android/agent_os_handler.py similarity index 98% rename from src/askui/tools/android/ppadb_agent_os_handler.py rename to src/askui/tools/android/agent_os_handler.py index dcfae70a..2b9480f9 100644 --- a/src/askui/tools/android/ppadb_agent_os_handler.py +++ b/src/askui/tools/android/agent_os_handler.py @@ -7,7 +7,7 @@ from askui.utils.image_utils import scale_coordinates_back, scale_image_with_padding -class PpadbAgentOSHandler(AndroidAgentOs): +class AndroidAgentOSHandler(AndroidAgentOs): """ This class is used to handle the AndroidAgentOs class. It is used to scale the coordinates to the target resolution @@ -97,7 +97,7 @@ def key_combination( self._os_agent.key_combination(keys, duration_in_ms) self._reporter.add_message( "AndroidAgentOS", - f"Tapped on {keys}", + f"Tapped on Keys: {keys}", ) def shell(self, command: str) -> str: diff --git a/src/askui/tools/android/ppadb_agent_os.py b/src/askui/tools/android/ppadb_agent_os.py index bf1639b8..a064bec1 100644 --- a/src/askui/tools/android/ppadb_agent_os.py +++ b/src/askui/tools/android/ppadb_agent_os.py @@ -194,7 +194,7 @@ def key_tap(self, key: ANDROID_KEY) -> None: raise RuntimeError(error_msg_invalid_key) assert self._selected_display is not None display_index: int = self._selected_display.display_index - self.shell(f"input -d {display_index} keyevent {key.capitalize()}") + self.shell(f"input -d {display_index} keyevent {key.upper()}") def key_combination( self, keys: List[ANDROID_KEY], duration_in_ms: int = 100 @@ -207,7 +207,7 @@ def key_combination( error_msg_too_few: str = "Key combination must contain at least 2 keys" raise RuntimeError(error_msg_too_few) - keys_string = " ".join(keys) + keys_string = " ".join(key.upper() for key in keys) assert self._selected_display is not None display_index: int = self._selected_display.display_index self.shell( diff --git a/src/askui/tools/android/tools.py b/src/askui/tools/android/tools.py index a7edd630..cea9b1d3 100644 --- a/src/askui/tools/android/tools.py +++ b/src/askui/tools/android/tools.py @@ -5,7 +5,7 @@ from askui.models.shared.tools import Tool from askui.tools.android.agent_os import ANDROID_KEY -from askui.tools.android.ppadb_agent_os_handler import PpadbAgentOSHandler +from askui.tools.android.agent_os_handler import AndroidAgentOSHandler class AndroidScreenshotTool(Tool): @@ -13,7 +13,7 @@ class AndroidScreenshotTool(Tool): Takes a screenshot from the currently connected Android device. """ - def __init__(self, os_agent_handler: PpadbAgentOSHandler) -> None: + def __init__(self, os_agent_handler: AndroidAgentOSHandler) -> None: super().__init__( name="android_screenshot_tool", description=( @@ -25,11 +25,11 @@ def __init__(self, os_agent_handler: PpadbAgentOSHandler) -> None: """ ), ) - self.os_agent_handler = os_agent_handler + self._os_agent_handler = os_agent_handler @override def __call__(self) -> tuple[str, Image.Image]: - screenshot = self.os_agent_handler.screenshot() + screenshot = self._os_agent_handler.screenshot() return "Screenshot was taken.", screenshot @@ -41,7 +41,7 @@ class AndroidTapTool(Tool): The top left corner of the screen is (0, 0). """ - def __init__(self, os_agent_handler: PpadbAgentOSHandler) -> None: + def __init__(self, os_agent_handler: AndroidAgentOSHandler) -> None: super().__init__( name="android_tap_tool", description=( @@ -66,11 +66,11 @@ def __init__(self, os_agent_handler: PpadbAgentOSHandler) -> None: "required": ["x", "y"], }, ) - self.os_agent_handler = os_agent_handler + self._os_agent_handler = os_agent_handler @override def __call__(self, x: int, y: int) -> str: - self.os_agent_handler.tap(x, y) + self._os_agent_handler.tap(x, y) return f"Tapped at ({x}, {y})" @@ -79,7 +79,7 @@ class AndroidTypeTool(Tool): Types the given text on the Android device screen. """ - def __init__(self, os_agent_handler: PpadbAgentOSHandler) -> None: + def __init__(self, os_agent_handler: AndroidAgentOSHandler) -> None: super().__init__( name="android_type_tool", description=( @@ -103,11 +103,11 @@ def __init__(self, os_agent_handler: PpadbAgentOSHandler) -> None: "required": ["text"], }, ) - self.os_agent_handler = os_agent_handler + self._os_agent_handler = os_agent_handler @override def __call__(self, text: str) -> str: - self.os_agent_handler.type(text) + self._os_agent_handler.type(text) return f"Typed: {text}" @@ -116,8 +116,8 @@ class AndroidDragAndDropTool(Tool): Performs a drag and drop gesture on the Android device screen. """ - def __init__(self, os_agent_handler: PpadbAgentOSHandler) -> None: - self.os_agent_handler = os_agent_handler + def __init__(self, os_agent_handler: AndroidAgentOSHandler) -> None: + self._os_agent_handler = os_agent_handler super().__init__( name="android_drag_and_drop_tool", description=( @@ -163,12 +163,12 @@ def __init__(self, os_agent_handler: PpadbAgentOSHandler) -> None: @override def __call__(self, x1: int, y1: int, x2: int, y2: int, duration: int = 1000) -> str: - self.os_agent_handler.drag_and_drop(x1, y1, x2, y2, duration) + self._os_agent_handler.drag_and_drop(x1, y1, x2, y2, duration) return f"Dragged and dropped from ({x1}, {y1}) to ({x2}, {y2}) in {duration}ms" class AndroidKeyTapEventTool(Tool): - def __init__(self, os_agent_handler: PpadbAgentOSHandler) -> None: + def __init__(self, os_agent_handler: AndroidAgentOSHandler) -> None: super().__init__( name="android_key_event_tool", description=( @@ -192,12 +192,12 @@ def __init__(self, os_agent_handler: PpadbAgentOSHandler) -> None: "required": ["key_name"], }, ) - self.os_agent_handler = os_agent_handler + self._os_agent_handler = os_agent_handler @override def __call__(self, key_name: ANDROID_KEY) -> str: - self.os_agent_handler.key_tap(key_name) - return f"Tapped on {key_name}" + self._os_agent_handler.key_tap(key_name) + return f"Tapped on Key: {key_name}" class AndroidSwipeTool(Tool): @@ -205,7 +205,7 @@ class AndroidSwipeTool(Tool): Performs a swipe gesture on the Android device screen. """ - def __init__(self, os_agent_handler: PpadbAgentOSHandler) -> None: + def __init__(self, os_agent_handler: AndroidAgentOSHandler) -> None: super().__init__( name="android_swipe_tool", description=( @@ -265,11 +265,11 @@ def __init__(self, os_agent_handler: PpadbAgentOSHandler) -> None: "required": ["x1", "y1", "x2", "y2"], }, ) - self.os_agent_handler = os_agent_handler + self._os_agent_handler = os_agent_handler @override def __call__(self, x1: int, y1: int, x2: int, y2: int, duration: int = 1000) -> str: - self.os_agent_handler.swipe(x1, y1, x2, y2, duration) + self._os_agent_handler.swipe(x1, y1, x2, y2, duration) return f"Swiped from ({x1}, {y1}) to ({x2}, {y2}) in {duration}ms" @@ -278,7 +278,7 @@ class AndroidKeyCombinationTool(Tool): Performs a key combination on the Android device. """ - def __init__(self, os_agent_handler: PpadbAgentOSHandler) -> None: + def __init__(self, os_agent_handler: AndroidAgentOSHandler) -> None: super().__init__( name="android_key_combination_tool", description=( @@ -321,11 +321,11 @@ def __init__(self, os_agent_handler: PpadbAgentOSHandler) -> None: "required": ["keys"], }, ) - self.os_agent_handler = os_agent_handler + self._os_agent_handler = os_agent_handler @override def __call__(self, keys: list[ANDROID_KEY], duration: int = 100) -> str: - self.os_agent_handler.key_combination(keys, duration) + self._os_agent_handler.key_combination(keys, duration) return f"Performed key combination: {keys}" @@ -334,7 +334,7 @@ class AndroidShellTool(Tool): Executes a shell command on the Android device. """ - def __init__(self, os_agent_handler: PpadbAgentOSHandler) -> None: + def __init__(self, os_agent_handler: AndroidAgentOSHandler) -> None: super().__init__( name="android_shell_tool", description=( @@ -363,9 +363,9 @@ def __init__(self, os_agent_handler: PpadbAgentOSHandler) -> None: "required": ["command"], }, ) - self.os_agent_handler = os_agent_handler + self._os_agent_handler = os_agent_handler @override def __call__(self, command: str) -> str: - output = self.os_agent_handler.shell(command) + output = self._os_agent_handler.shell(command) return f"Shell command executed. Output: {output}" diff --git a/src/askui/tools/computer.py b/src/askui/tools/computer.py index 780eca1b..44e1df07 100644 --- a/src/askui/tools/computer.py +++ b/src/askui/tools/computer.py @@ -1,5 +1,5 @@ from abc import ABC -from typing import Annotated, Literal, TypedDict, get_args +from typing import Annotated, Literal, TypedDict from anthropic.types.beta import ( BetaToolComputerUse20241022Param, @@ -13,7 +13,7 @@ from askui.utils.dict_utils import IdentityDefaultDict from askui.utils.image_utils import scale_coordinates_back, scale_image_with_padding -from ..models.shared.tools import InputSchema, Tool +from ..models.shared.tools import Tool Action20241022 = Literal[ "key", @@ -124,12 +124,10 @@ class ComputerToolBase(Tool, ABC): def __init__( self, agent_os: AgentOs, - input_schema: InputSchema, ) -> None: super().__init__( name="computer", description="A tool for interacting with the computer", - input_schema=input_schema, ) self._agent_os = agent_os self._width = 1280 @@ -257,29 +255,7 @@ def __init__( self, agent_os: AgentOs, ) -> None: - super().__init__( - agent_os=agent_os, - input_schema={ - "type": "object", - "properties": { - "action": { - "type": "string", - "enum": list(get_args(Action20241022)), - }, - "text": { - "type": "string", - }, - "coordinate": { - "type": "object", - "properties": { - "x": {"type": "integer", "minimum": 0}, - "y": {"type": "integer", "minimum": 0}, - }, - }, - }, - "required": ["action"], - }, - ) + super().__init__(agent_os=agent_os) @override def to_params( @@ -298,36 +274,7 @@ def __init__( self, agent_os: AgentOs, ) -> None: - super().__init__( - agent_os=agent_os, - input_schema={ - "type": "object", - "properties": { - "action": { - "type": "string", - "enum": list(get_args(Action20250124)), - }, - "text": { - "type": "string", - }, - "coordinate": { - "type": "object", - "properties": { - "x": {"type": "integer", "minimum": 0}, - "y": {"type": "integer", "minimum": 0}, - }, - }, - "scroll_direction": { - "type": "string", - "enum": list(get_args(ScrollDirection)), - }, - "scroll_amount": {"type": "integer", "minimum": 0}, - "duration": {"type": "number", "minimum": 0.0, "maximum": 100.0}, - "key": {"type": "string"}, - }, - "required": ["action"], - }, - ) + super().__init__(agent_os=agent_os) @override def to_params( From f8ed25990ae3b89b967c53eb87a98710ceea2fed Mon Sep 17 00:00:00 2001 From: Samir mlika <105347215+mlikasam-askui@users.noreply.github.com> Date: Fri, 20 Jun 2025 13:24:22 +0200 Subject: [PATCH 02/12] Add android agent to chat --- src/chat/api/assistants/seeds.py | 6 ++++++ src/chat/api/assistants/service.py | 7 ++++++- src/chat/api/runs/runner/runner.py | 28 ++++++++++++++++++++++++- tests/integration/test_custom_models.py | 17 +++++++++++++++ 4 files changed, 56 insertions(+), 2 deletions(-) diff --git a/src/chat/api/assistants/seeds.py b/src/chat/api/assistants/seeds.py index 81e44ed7..ae42940e 100644 --- a/src/chat/api/assistants/seeds.py +++ b/src/chat/api/assistants/seeds.py @@ -11,3 +11,9 @@ name="Human DemonstrationAgent", avatar="data:image/svg+xml;base64,PHN2ZyAgeG1sbnM9Imh0dHA6Ly93d3cudzMub3JnLzIwMDAvc3ZnIgogIHdpZHRoPSIyNCIKICBoZWlnaHQ9IjI0IgogIHZpZXdCb3g9IjAgMCAyNCAyNCIKICBmaWxsPSJub25lIgogIHN0cm9rZT0iIzAwMCIgc3R5bGU9ImJhY2tncm91bmQtY29sb3I6ICNmZmY7IGJvcmRlci1yYWRpdXM6IDJweCIKICBzdHJva2Utd2lkdGg9IjIiCiAgc3Ryb2tlLWxpbmVjYXA9InJvdW5kIgogIHN0cm9rZS1saW5lam9pbj0icm91bmQiCj4KICA8cGF0aCBkPSJNMTkgMjF2LTJhNCA0IDAgMCAwLTQtNEg5YTQgNCAwIDAgMC00IDR2MiIgLz4KICA8Y2lyY2xlIGN4PSIxMiIgY3k9IjciIHI9IjQiIC8+Cjwvc3ZnPgo=", ) + +ANDROID_VISION_AGENT = Assistant( + id="asst_78da09fbf1ed43c7826fb1686f89f541", + name="AskUI Android Vision Agent", + avatar="data:image/svg+xml;base64,PHN2ZyAgeG1sbnM9Imh0dHA6Ly93d3cudzMub3JnLzIwMDAvc3ZnIgogIHdpZHRoPSIyNCIKICBoZWlnaHQ9IjI0IgogIHZpZXdCb3g9IjAgMCAyNCAyNCIKICBmaWxsPSJub25lIgogIHN0cm9rZT0iIzAwMCIgc3R5bGU9ImJhY2tncm91bmQtY29sb3I6ICNmZmY7IGJvcmRlci1yYWRpdXM6IDJweCIKICBzdHJva2Utd2lkdGg9IjIiCiAgc3Ryb2tlLWxpbmVjYXA9InJvdW5kIgogIHN0cm9rZS1saW5lam9pbj0icm91bmQiCj4KICA8cGF0aCBkPSJNMTIgOFY0SDgiIC8+CiAgPHJlY3Qgd2lkdGg9IjE2IiBoZWlnaHQ9IjEyIiB4PSI0IiB5PSI4IiByeD0iMiIgLz4KICA8cGF0aCBkPSJNMiAxNGgyIiAvPgogIDxwYXRoIGQ9Ik0yMCAxNGgyIiAvPgogIDxwYXRoIGQ9Ik0xNSAxM3YyIiAvPgogIDxwYXRoIGQ9Ik05IDEzdjIiIC8+Cjwvc3ZnPgo=", +) diff --git a/src/chat/api/assistants/service.py b/src/chat/api/assistants/service.py index 369698c8..391135dd 100644 --- a/src/chat/api/assistants/service.py +++ b/src/chat/api/assistants/service.py @@ -3,7 +3,11 @@ from pydantic import BaseModel, Field from chat.api.assistants.models import Assistant -from chat.api.assistants.seeds import ASKUI_VISION_AGENT, HUMAN_DEMONSTRATION_AGENT +from chat.api.assistants.seeds import ( + ANDROID_VISION_AGENT, + ASKUI_VISION_AGENT, + HUMAN_DEMONSTRATION_AGENT, +) from chat.api.models import DO_NOT_PATCH, DoNotPatch, ListQuery, ListResponse @@ -162,5 +166,6 @@ def delete(self, assistant_id: str) -> None: def seed(self) -> None: """Seed the assistant service with default assistants.""" + self._save(ANDROID_VISION_AGENT) self._save(ASKUI_VISION_AGENT) self._save(HUMAN_DEMONSTRATION_AGENT) diff --git a/src/chat/api/runs/runner/runner.py b/src/chat/api/runs/runner/runner.py index 9611aa65..c793b719 100644 --- a/src/chat/api/runs/runner/runner.py +++ b/src/chat/api/runs/runner/runner.py @@ -3,9 +3,10 @@ import time from datetime import datetime, timezone from pathlib import Path -from typing import TYPE_CHECKING +from typing import TYPE_CHECKING, Literal from askui.agent import VisionAgent +from askui.android_agent import AndroidVisionAgent from askui.models.shared.computer_agent_cb_param import OnMessageCbParam from askui.models.shared.computer_agent_message_param import ( Base64ImageSourceParam, @@ -35,6 +36,7 @@ ASKUI_VISION_AGENT_ID = "asst_ge3tiojsga3dgnruge3di2u5ov36shedkcslxnmca" +ASKUI_ANDROID_AGENT_ID = "asst_78da09fbf1ed43c7826fb1686f89f541" HUMAN_AGENT_ID = "asst_ge3tiojsga3dgnruge3di2u5ov36shedkcslxnmcb" @@ -138,7 +140,21 @@ def _run_human_agent(self, event_queue: queue.Queue[Events]) -> None: ) ) + def _run_askui_android_agent(self, event_queue: queue.Queue[Events]) -> None: + self._run_agent( + agent_type="android", + event_queue=event_queue, + ) + def _run_askui_vision_agent(self, event_queue: queue.Queue[Events]) -> None: + self._run_agent( + agent_type="vision", + event_queue=event_queue, + ) + + def _run_agent( + self, agent_type: Literal["android", "vision"], event_queue: queue.Queue[Events] + ) -> None: messages: list[MessageParam] = [ MessageParam( role=msg.role, @@ -175,6 +191,14 @@ def on_message( return None return on_message_cb_param.message + if agent_type == "android": + with AndroidVisionAgent() as android_agent: + android_agent.act( + messages, + on_message=on_message, + ) + return + with VisionAgent() as agent: agent.act( messages, @@ -197,6 +221,8 @@ def run( self._run_human_agent(event_queue) elif self._run.assistant_id == ASKUI_VISION_AGENT_ID: self._run_askui_vision_agent(event_queue) + elif self._run.assistant_id == ASKUI_ANDROID_AGENT_ID: + self._run_askui_android_agent(event_queue) updated_run = self._retrieve_run() if updated_run.status == "in_progress": updated_run.completed_at = datetime.now(tz=timezone.utc) diff --git a/tests/integration/test_custom_models.py b/tests/integration/test_custom_models.py index a64525ba..086cf93b 100644 --- a/tests/integration/test_custom_models.py +++ b/tests/integration/test_custom_models.py @@ -19,6 +19,7 @@ from askui.models import ModelComposition, ModelDefinition, ModelName from askui.models.shared.computer_agent_cb_param import OnMessageCb from askui.models.shared.computer_agent_message_param import MessageParam +from askui.models.shared.tools import Tool from askui.tools.toolbox import AgentToolbox from askui.utils.image_utils import ImageSource @@ -40,6 +41,14 @@ def act( self.goals.append([message.model_dump(mode="json") for message in messages]) self.model_choices.append(model_choice) + @override + def add_tool(self, tool: Tool) -> None: + pass + + @override + def set_tools(self, tools: list[Tool]) -> None: + pass + class SimpleGetModel(GetModel): """Simple get model that returns a fixed response.""" @@ -209,6 +218,14 @@ def act( ) -> None: pass + @override + def add_tool(self, tool: Tool) -> None: + pass + + @override + def set_tools(self, tools: list[Tool]) -> None: + pass + registry: ModelRegistry = { "act-1": act_model, "act-2": AnotherActModel(), From d6d7ed334284d47b2ad99af2bdc306e76e4f4697 Mon Sep 17 00:00:00 2001 From: Samir Mlika Date: Fri, 20 Jun 2025 15:57:26 +0200 Subject: [PATCH 03/12] update android agent icon --- src/chat/api/assistants/seeds.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/chat/api/assistants/seeds.py b/src/chat/api/assistants/seeds.py index ae42940e..98da896a 100644 --- a/src/chat/api/assistants/seeds.py +++ b/src/chat/api/assistants/seeds.py @@ -15,5 +15,5 @@ ANDROID_VISION_AGENT = Assistant( id="asst_78da09fbf1ed43c7826fb1686f89f541", name="AskUI Android Vision Agent", - avatar="data:image/svg+xml;base64,PHN2ZyAgeG1sbnM9Imh0dHA6Ly93d3cudzMub3JnLzIwMDAvc3ZnIgogIHdpZHRoPSIyNCIKICBoZWlnaHQ9IjI0IgogIHZpZXdCb3g9IjAgMCAyNCAyNCIKICBmaWxsPSJub25lIgogIHN0cm9rZT0iIzAwMCIgc3R5bGU9ImJhY2tncm91bmQtY29sb3I6ICNmZmY7IGJvcmRlci1yYWRpdXM6IDJweCIKICBzdHJva2Utd2lkdGg9IjIiCiAgc3Ryb2tlLWxpbmVjYXA9InJvdW5kIgogIHN0cm9rZS1saW5lam9pbj0icm91bmQiCj4KICA8cGF0aCBkPSJNMTIgOFY0SDgiIC8+CiAgPHJlY3Qgd2lkdGg9IjE2IiBoZWlnaHQ9IjEyIiB4PSI0IiB5PSI4IiByeD0iMiIgLz4KICA8cGF0aCBkPSJNMiAxNGgyIiAvPgogIDxwYXRoIGQ9Ik0yMCAxNGgyIiAvPgogIDxwYXRoIGQ9Ik0xNSAxM3YyIiAvPgogIDxwYXRoIGQ9Ik05IDEzdjIiIC8+Cjwvc3ZnPgo=", + avatar="data:image/svg+xml;base64,PHN2ZyB3aWR0aD0iMjAwIiBoZWlnaHQ9IjMwMCIgdmlld0JveD0iMCAwIDIwMCAzMDAiIHhtbG5zPSJodHRwOi8vd3d3LnczLm9yZy8yMDAwL3N2ZyIgZmlsbD0ibm9uZSIgc3Ryb2tlPSJibGFjayIgc3Ryb2tlLXdpZHRoPSIzIj4KICA8IS0tIEhlYWQgLS0+CiAgPHJlY3QgeD0iNTAiIHk9IjMwIiB3aWR0aD0iMTAwIiBoZWlnaHQ9IjYwIiByeD0iMTAiIHJ5PSIxMCIgLz4KCiAgPCEtLSBFeWVzIC0tPgogIDxjaXJjbGUgY3g9IjcwIiBjeT0iNTAiIHI9IjUiIC8+CiAgPGNpcmNsZSBjeD0iMTMwIiBjeT0iNTAiIHI9IjUiIC8+CgogIDwhLS0gQW50ZW5uYXMgLS0+CiAgPGxpbmUgeDE9IjYwIiB5MT0iMzAiIHgyPSI0MCIgeTI9IjEwIiAvPgogIDxsaW5lIHgxPSIxNDAiIHkxPSIzMCIgeDI9IjE2MCIgeTI9IjEwIiAvPgoKICA8IS0tIEJvZHkgLS0+CiAgPHJlY3QgeD0iNDAiIHk9IjkwIiB3aWR0aD0iMTIwIiBoZWlnaHQ9IjEyMCIgcng9IjIwIiByeT0iMjAiIC8+CgogIDwhLS0gIkFza1VJIEFuZHJvaWRBZ2VudCIgbGFiZWwgLS0+CiAgPHRleHQgeD0iMTAwIiB5PSIxNjAiIHRleHQtYW5jaG9yPSJtaWRkbGUiIGZvbnQtc2l6ZT0iMTYiIGZpbGw9ImJsYWNrIiBzdHJva2U9Im5vbmUiIGZvbnQtZmFtaWx5PSJBcmlhbCI+QXNrVUk8L3RleHQ+CiAgPHRleHQgeD0iMTAwIiB5PSIxODAiIHRleHQtYW5jaG9yPSJtaWRkbGUiIGZvbnQtc2l6ZT0iMTIiIGZpbGw9ImJsYWNrIiBzdHJva2U9Im5vbmUiIGZvbnQtZmFtaWx5PSJBcmlhbCI+QW5kcm9pZEFnZW50PC90ZXh0PgoKICA8IS0tIEFybXMgLS0+CiAgPHJlY3QgeD0iMTAiIHk9IjkwIiB3aWR0aD0iMjAiIGhlaWdodD0iMTAwIiByeD0iMTAiIC8+CiAgPHJlY3QgeD0iMTcwIiB5PSI5MCIgd2lkdGg9IjIwIiBoZWlnaHQ9IjEwMCIgcng9IjEwIiAvPgoKICA8IS0tIExlZ3MgLS0+CiAgPHJlY3QgeD0iNjAiIHk9IjIxMCIgd2lkdGg9IjIwIiBoZWlnaHQ9IjYwIiByeD0iNSIgLz4KICA8cmVjdCB4PSIxMjAiIHk9IjIxMCIgd2lkdGg9IjIwIiBoZWlnaHQ9IjYwIiByeD0iNSIgLz4KPC9zdmc+", ) From 64fb772e8134a5fbbbe54c532aca0e1765079eb2 Mon Sep 17 00:00:00 2001 From: Samir Mlika Date: Mon, 23 Jun 2025 08:48:44 +0200 Subject: [PATCH 04/12] fix QA findings --- src/askui/android_agent.py | 20 ++++++++++---------- src/askui/tools/android/agent_os.py | 4 ++-- src/askui/tools/android/agent_os_handler.py | 6 +++--- src/askui/tools/android/ppadb_agent_os.py | 14 ++++++++++---- 4 files changed, 25 insertions(+), 19 deletions(-) diff --git a/src/askui/android_agent.py b/src/askui/android_agent.py index be096009..e01c24f4 100644 --- a/src/askui/android_agent.py +++ b/src/askui/android_agent.py @@ -421,8 +421,8 @@ def key_tap( from askui import AndroidVisionAgent with AndroidVisionAgent() as agent: - agent.key_tap("KEYCODE_HOME") # Taps the home key - agent.key_tap("KEYCODE_BACK") # Taps the back key + agent.key_tap("home") # Taps the home key + agent.key_tap("back") # Taps the back key ``` """ self.os.key_tap(key) @@ -446,8 +446,8 @@ def key_combination( from askui import AndroidVisionAgent with AndroidVisionAgent() as agent: - agent.key_combination(["KEYCODE_HOME", "KEYCODE_BACK"]) # Taps the home key and then the back key - agent.key_combination(["KEYCODE_HOME", "KEYCODE_BACK"], duration_in_ms=200) # Taps the home key and then the back key with a 200ms delay + agent.key_combination(["home", "back"]) # Taps the home key and then the back key + agent.key_combination(["home", "back"], duration_in_ms=200) # Taps the home key and then the back key for 200ms. ``` """ self.os.key_combination(keys, duration_in_ms) @@ -536,27 +536,27 @@ def swipe( self.os.swipe(x1, y1, x2, y2, duration_in_ms) @telemetry.record_call( - exclude={"device_name"}, + exclude={"device_serial_number"}, ) @validate_call - def set_device_by_name( + def set_device_by_serial_number( self, - device_name: str, + device_serial_number: str, ) -> None: """ Sets the active device for screen interactions by name. Args: - device_name (str): The name of the device to set as active. + device_serial_number (str): The serial number of the device to set as active. Example: ```python from askui import AndroidVisionAgent with AndroidVisionAgent() as agent: - agent.set_device_by_name("Pixel 6") # Sets the active device to the Pixel 6 + agent.set_device_by_serial_number("Pixel 6") # Sets the active device to the Pixel 6 """ - self.os.set_device_by_name(device_name) + self.os.set_device_by_serial_number(device_serial_number) @telemetry.record_call(exclude={"goal", "on_message"}) @validate_call diff --git a/src/askui/tools/android/agent_os.py b/src/askui/tools/android/agent_os.py index 2a369cb6..87d2f5f3 100644 --- a/src/askui/tools/android/agent_os.py +++ b/src/askui/tools/android/agent_os.py @@ -375,9 +375,9 @@ def set_device_by_index(self, device_index: int = 0) -> None: raise NotImplementedError @abstractmethod - def set_device_by_name(self, device_name: str) -> None: + def set_device_by_serial_number(self, device_serial_number: str) -> None: """ - Sets the active device for screen interactions by name. + Sets the active device for screen interactions by serial number. """ raise NotImplementedError diff --git a/src/askui/tools/android/agent_os_handler.py b/src/askui/tools/android/agent_os_handler.py index 2b9480f9..c5e7cf2b 100644 --- a/src/askui/tools/android/agent_os_handler.py +++ b/src/askui/tools/android/agent_os_handler.py @@ -139,9 +139,9 @@ def set_device_by_index(self, device_index: int = 0) -> None: "AndroidAgentOS", f"Set device by index: {device_index}" ) - def set_device_by_name(self, device_name: str) -> None: - self._os_agent.set_device_by_name(device_name) + def set_device_by_serial_number(self, device_serial_number: str) -> None: + self._os_agent.set_device_by_serial_number(device_serial_number) self._real_screen_resolution = None self._reporter.add_message( - "AndroidAgentOS", f"Set device by name: {device_name}" + "AndroidAgentOS", f"Set device by serial number: {device_serial_number}" ) diff --git a/src/askui/tools/android/ppadb_agent_os.py b/src/askui/tools/android/ppadb_agent_os.py index a064bec1..8b30903f 100644 --- a/src/askui/tools/android/ppadb_agent_os.py +++ b/src/askui/tools/android/ppadb_agent_os.py @@ -1,5 +1,6 @@ import io import re +import shlex import string from typing import List, Optional, get_args @@ -110,14 +111,14 @@ def set_device_by_index(self, device_index: int = 0) -> None: self._device = devices[device_index] self.set_display_by_index(0) - def set_device_by_name(self, device_name: str) -> None: + def set_device_by_serial_number(self, device_serial_number: str) -> None: devices = self._get_connected_devices() for device in devices: - if device.serial == device_name: + if device.serial == device_serial_number: self._device = device self.set_display_by_index(0) return - msg = f"Device name {device_name} not found" + msg = f"Device name {device_serial_number} not found" raise RuntimeError(msg) def screenshot(self) -> Image.Image: @@ -186,7 +187,12 @@ def type(self, text: str) -> None: raise RuntimeError(error_msg_nonprintable) assert self._selected_display is not None display_index: int = self._selected_display.display_index - self.shell(f"input -d {display_index} text {text}") + + escaped_text = shlex.quote(text) + + shell_safe_text = escaped_text.replace(" ", "%s") + + self.shell(f"input -d {display_index} text {shell_safe_text}") def key_tap(self, key: ANDROID_KEY) -> None: if key not in get_args(ANDROID_KEY): From 371c542f121a5a03e42fb3ec5214d2431fb1fd0f Mon Sep 17 00:00:00 2001 From: Samir mlika <105347215+mlikasam-askui@users.noreply.github.com> Date: Mon, 23 Jun 2025 11:04:09 +0200 Subject: [PATCH 05/12] implement review remarks --- src/askui/android_agent.py | 59 ++-- src/askui/models/model_router.py | 10 +- src/askui/models/models.py | 19 -- src/askui/models/shared/base_agent.py | 20 +- src/askui/models/shared/facade.py | 11 - src/askui/models/shared/tools.py | 8 +- src/askui/tools/android/agent_os.py | 358 ++++++++++---------- src/askui/tools/android/agent_os_handler.py | 42 +-- src/askui/tools/android/ppadb_agent_os.py | 4 +- src/askui/tools/android/tools.py | 50 +-- tests/integration/test_custom_models.py | 17 - 11 files changed, 255 insertions(+), 343 deletions(-) diff --git a/src/askui/android_agent.py b/src/askui/android_agent.py index e01c24f4..7869a58c 100644 --- a/src/askui/android_agent.py +++ b/src/askui/android_agent.py @@ -10,9 +10,9 @@ from askui.locators.locators import Locator from askui.models.shared.computer_agent_cb_param import OnMessageCb from askui.models.shared.computer_agent_message_param import MessageParam -from askui.models.shared.tools import Tool, ToolCollection +from askui.models.shared.tools import ToolCollection from askui.tools.android.agent_os import ANDROID_KEY -from askui.tools.android.agent_os_handler import AndroidAgentOSHandler +from askui.tools.android.agent_os_handler import AndroidAgentOsHandler from askui.tools.android.ppadb_agent_os import PpadbAgentOs from askui.tools.android.tools import ( AndroidDragAndDropTool, @@ -60,21 +60,22 @@ def __init__( configure_logging(level=log_level) self.os = PpadbAgentOs() self._reporter = CompositeReporter(reporters=reporters) - android_os_handler = AndroidAgentOSHandler(self.os, self._reporter) + self.act_agent_os_handler = AndroidAgentOsHandler(self.os, self._reporter) + self.act_tool_collection = ToolCollection( + tools=[ + AndroidScreenshotTool(self.act_agent_os_handler), + AndroidTapTool(self.act_agent_os_handler), + AndroidTypeTool(self.act_agent_os_handler), + AndroidDragAndDropTool(self.act_agent_os_handler), + AndroidKeyTapEventTool(self.act_agent_os_handler), + AndroidSwipeTool(self.act_agent_os_handler), + AndroidKeyCombinationTool(self.act_agent_os_handler), + AndroidShellTool(self.act_agent_os_handler), + ExceptionTool(), + ] + ) _models = initialize_default_android_model_registry( - tool_collection=ToolCollection( - tools=[ - AndroidScreenshotTool(android_os_handler), - AndroidTapTool(android_os_handler), - AndroidTypeTool(android_os_handler), - AndroidDragAndDropTool(android_os_handler), - AndroidKeyTapEventTool(android_os_handler), - AndroidSwipeTool(android_os_handler), - AndroidKeyCombinationTool(android_os_handler), - AndroidShellTool(android_os_handler), - ExceptionTool(), - ] - ), + tool_collection=self.act_tool_collection, reporter=self._reporter, ) _models.update(models or {}) @@ -421,8 +422,8 @@ def key_tap( from askui import AndroidVisionAgent with AndroidVisionAgent() as agent: - agent.key_tap("home") # Taps the home key - agent.key_tap("back") # Taps the back key + agent.key_tap("HOME") # Taps the home key + agent.key_tap("BACK") # Taps the back key ``` """ self.os.key_tap(key) @@ -431,7 +432,7 @@ def key_tap( @validate_call def key_combination( self, - keys: Annotated[list[ANDROID_KEY], Field(min_length=1)], + keys: Annotated[list[ANDROID_KEY], Field(min_length=2)], duration_in_ms: int = 100, ) -> None: """ @@ -446,8 +447,8 @@ def key_combination( from askui import AndroidVisionAgent with AndroidVisionAgent() as agent: - agent.key_combination(["home", "back"]) # Taps the home key and then the back key - agent.key_combination(["home", "back"], duration_in_ms=200) # Taps the home key and then the back key for 200ms. + agent.key_combination(["HOME", "BACK"]) # Taps the home key and then the back key + agent.key_combination(["HOME", "BACK"], duration_in_ms=200) # Taps the home key and then the back key for 200ms. ``` """ self.os.key_combination(keys, duration_in_ms) @@ -604,22 +605,6 @@ def act( ) self._model_router.act(messages, model or self._model_choice["act"], on_message) - def set_act_model_tools(self, tools: list[Tool], model: str | None = None) -> None: - """ - Sets the tools for the act model. - """ - self._model_router.set_act_model_tools( - model or self._model_choice["act"], tools - ) - - def add_tool_to_act_model(self, tool: Tool, model: str | None = None) -> None: - """ - Adds a tool to the act model. - """ - self._model_router.add_tool_to_act_model( - model or self._model_choice["act"], tool - ) - @telemetry.record_call(flush=True) def close(self) -> None: """Disconnects from the Android device.""" diff --git a/src/askui/models/model_router.py b/src/askui/models/model_router.py index e1f8bb72..affee49a 100644 --- a/src/askui/models/model_router.py +++ b/src/askui/models/model_router.py @@ -34,7 +34,7 @@ from askui.models.shared.computer_agent_cb_param import OnMessageCb from askui.models.shared.computer_agent_message_param import MessageParam from askui.models.shared.facade import ModelFacade -from askui.models.shared.tools import Tool, ToolCollection +from askui.models.shared.tools import ToolCollection from askui.models.types.response_schemas import ResponseSchema from askui.reporting import CompositeReporter, Reporter from askui.utils.image_utils import ImageSource @@ -259,11 +259,3 @@ def locate( m = self._get_model(_model_choice, "locate") logger.debug(f"Routing locate prediction to {_model_choice}") return m.locate(locator, screenshot, _model_composition or _model_choice) - - def set_act_model_tools(self, model_choice: str, tools: list[Tool]) -> None: - act_model = self._get_model(model_choice, "act") - act_model.set_tools(tools) - - def add_tool_to_act_model(self, model_choice: str, tool: Tool) -> None: - act_model = self._get_model(model_choice, "act") - act_model.add_tool(tool) diff --git a/src/askui/models/models.py b/src/askui/models/models.py index 1dd6ddad..1c612d8c 100644 --- a/src/askui/models/models.py +++ b/src/askui/models/models.py @@ -10,7 +10,6 @@ from askui.locators.locators import Locator from askui.models.shared.computer_agent_cb_param import OnMessageCb from askui.models.shared.computer_agent_message_param import MessageParam -from askui.models.shared.tools import Tool from askui.models.types.response_schemas import ResponseSchema from askui.utils.image_utils import ImageSource @@ -220,24 +219,6 @@ def act( """ # noqa: E501 raise NotImplementedError - @abc.abstractmethod - def set_tools(self, tools: list[Tool]) -> None: - """Set the tools that the model can use. - - Args: - tools (list[Tool]): The tools that the model can use. - """ - raise NotImplementedError - - @abc.abstractmethod - def add_tool(self, tool: Tool) -> None: - """Add a tool to the model. - - Args: - tool (Tool): The tool to add. - """ - raise NotImplementedError - class GetModel(abc.ABC): """Abstract base class for models that can extract information from images. diff --git a/src/askui/models/shared/base_agent.py b/src/askui/models/shared/base_agent.py index b8267d43..e7043bb3 100644 --- a/src/askui/models/shared/base_agent.py +++ b/src/askui/models/shared/base_agent.py @@ -12,7 +12,7 @@ MessageParam, TextBlockParam, ) -from askui.models.shared.tools import Tool, ToolCollection +from askui.models.shared.tools import ToolCollection from askui.reporting import Reporter from ...logger import logger @@ -60,24 +60,6 @@ def __init__( text=system_prompt, ) - @override - def set_tools(self, tools: list[Tool]) -> None: - """Set the tools that the model can use. - - Args: - tools (list[Tool]): The tools that the model can use. - """ - self._tool_collection.set_tools(tools) - - @override - def add_tool(self, tool: Tool) -> None: - """Add a tool to the model. - - Args: - tool (Tool): The tool to add. - """ - self._tool_collection.add_tool(tool) - @abstractmethod def _create_message( self, messages: list[MessageParam], model_choice: str diff --git a/src/askui/models/shared/facade.py b/src/askui/models/shared/facade.py index a3068b3d..e4ac7a0f 100644 --- a/src/askui/models/shared/facade.py +++ b/src/askui/models/shared/facade.py @@ -6,7 +6,6 @@ from askui.models.models import ActModel, GetModel, LocateModel, ModelComposition, Point from askui.models.shared.computer_agent_cb_param import OnMessageCb from askui.models.shared.computer_agent_message_param import MessageParam -from askui.models.shared.tools import Tool from askui.models.types.response_schemas import ResponseSchema from askui.utils.image_utils import ImageSource @@ -35,16 +34,6 @@ def act( on_message=on_message, ) - @override - def set_tools(self, tools: list[Tool]) -> None: - """Set the tools for the act model.""" - self._act_model.set_tools(tools) - - @override - def add_tool(self, tool: Tool) -> None: - """Add a tool to the act model.""" - self._act_model.add_tool(tool) - @override def get( self, diff --git a/src/askui/models/shared/tools.py b/src/askui/models/shared/tools.py index 17e13325..7b8f5037 100644 --- a/src/askui/models/shared/tools.py +++ b/src/askui/models/shared/tools.py @@ -110,13 +110,13 @@ def to_params( ) -> list[BetaToolUnionParam]: return [tool.to_params() for tool in self._tools] - def add_tool(self, tool: Tool) -> None: - """Add a tool to the collection.""" + def append_tool(self, tool: Tool) -> None: + """Append a tool to the collection.""" self._tools.append(tool) self._tool_map[tool.to_params()["name"]] = tool - def set_tools(self, tools: list[Tool]) -> None: - """Set the tools in the collection.""" + def reset_tools(self, tools: list[Tool]) -> None: + """Reset the tools in the collection with new tools.""" self._tools = tools self._tool_map = {tool.to_params()["name"]: tool for tool in tools} diff --git a/src/askui/tools/android/agent_os.py b/src/askui/tools/android/agent_os.py index 87d2f5f3..00419ae6 100644 --- a/src/askui/tools/android/agent_os.py +++ b/src/askui/tools/android/agent_os.py @@ -4,10 +4,10 @@ from PIL import Image ANDROID_KEY = Literal[ # pylint: disable=C0103 - "home", - "back", - "call", - "endcall", + "HOME", + "BACK", + "CALL", + "ENDCALL", "0", "1", "2", @@ -18,181 +18,181 @@ "7", "8", "9", - "star", - "pound", - "dpad_up", - "dpad_down", - "dpad_left", - "dpad_right", - "dpad_center", - "volume_up", - "volume_down", - "power", - "camera", - "clear", - "a", - "b", - "c", - "d", - "e", - "f", - "g", - "h", - "i", - "j", - "k", - "l", - "m", - "n", - "o", - "p", - "q", - "r", - "s", - "t", - "u", - "v", - "w", - "x", - "y", - "z", - "comma", - "period", - "alt_left", - "alt_right", - "shift_left", - "shift_right", - "tab", - "space", - "sym", - "explorer", - "envelope", - "enter", - "del", - "grave", - "minus", - "equals", - "left_bracket", - "right_bracket", - "backslash", - "semicolon", - "apostrophe", - "slash", - "at", - "num", - "headsethook", - "focus", - "plus", - "menu", - "notification", - "search", - "media_play_pause", - "media_stop", - "media_next", - "media_previous", - "media_rewind", - "media_fast_forward", - "mute", - "page_up", - "page_down", - "switch_charset", - "escape", - "forward_del", - "ctrl_left", - "ctrl_right", - "caps_lock", - "scroll_lock", - "function", - "break", - "move_home", - "move_end", - "insert", - "forward", - "media_play", - "media_pause", - "media_close", - "media_eject", - "media_record", - "f1", - "f2", - "f3", - "f4", - "f5", - "f6", - "f7", - "f8", - "f9", - "f10", - "f11", - "f12", - "num_lock", - "numpad_0", - "numpad_1", - "numpad_2", - "numpad_3", - "numpad_4", - "numpad_5", - "numpad_6", - "numpad_7", - "numpad_8", - "numpad_9", - "numpad_divide", - "numpad_multiply", - "numpad_subtract", - "numpad_add", - "numpad_dot", - "numpad_comma", - "numpad_enter", - "numpad_equals", - "numpad_left_paren", - "numpad_right_paren", - "volume_mute", - "info", - "channel_up", - "channel_down", - "zoom_in", - "zoom_out", - "window", - "guide", - "bookmark", - "captions", - "settings", - "app_switch", - "language_switch", - "contacts", - "calendar", - "music", - "calculator", - "assist", - "brightness_down", - "brightness_up", - "media_audio_track", - "sleep", - "wakeup", - "pairing", - "media_top_menu", - "last_channel", - "tv_data_service", - "voice_assist", - "help", - "navigate_previous", - "navigate_next", - "navigate_in", - "navigate_out", - "dpad_up_left", - "dpad_down_left", - "dpad_up_right", - "dpad_down_right", - "media_skip_forward", - "media_skip_backward", - "media_step_forward", - "media_step_backward", - "soft_sleep", - "cut", - "copy", - "paste", - "all_apps", - "refresh", + "STAR", + "POUND", + "DPAD_UP", + "DPAD_DOWN", + "DPAD_LEFT", + "DPAD_RIGHT", + "DPAD_CENTER", + "VOLUME_UP", + "VOLUME_DOWN", + "POWER", + "CAMERA", + "CLEAR", + "A", + "B", + "C", + "D", + "E", + "F", + "G", + "H", + "I", + "J", + "K", + "L", + "M", + "N", + "O", + "P", + "Q", + "R", + "S", + "T", + "U", + "V", + "W", + "X", + "Y", + "Z", + "COMMA", + "PERIOD", + "ALT_LEFT", + "ALT_RIGHT", + "SHIFT_LEFT", + "SHIFT_RIGHT", + "TAB", + "SPACE", + "SYM", + "EXPLORER", + "ENVELOPE", + "ENTER", + "DEL", + "GRAVE", + "MINUS", + "EQUALS", + "LEFT_BRACKET", + "RIGHT_BRACKET", + "BACKSLASH", + "SEMICOLON", + "APOSTROPHE", + "SLASH", + "AT", + "NUM", + "HEADSETHOOK", + "FOCUS", + "PLUS", + "MENU", + "NOTIFICATION", + "SEARCH", + "MEDIA_PLAY_PAUSE", + "MEDIA_STOP", + "MEDIA_NEXT", + "MEDIA_PREVIOUS", + "MEDIA_REWIND", + "MEDIA_FAST_FORWARD", + "MUTE", + "PAGE_UP", + "PAGE_DOWN", + "SWITCH_CHARSET", + "ESCAPE", + "FORWARD_DEL", + "CTRL_LEFT", + "CTRL_RIGHT", + "CAPS_LOCK", + "SCROLL_LOCK", + "FUNCTION", + "BREAK", + "MOVE_HOME", + "MOVE_END", + "INSERT", + "FORWARD", + "MEDIA_PLAY", + "MEDIA_PAUSE", + "MEDIA_CLOSE", + "MEDIA_EJECT", + "MEDIA_RECORD", + "F1", + "F2", + "F3", + "F4", + "F5", + "F6", + "F7", + "F8", + "F9", + "F10", + "F11", + "F12", + "NUM_LOCK", + "NUMPAD_0", + "NUMPAD_1", + "NUMPAD_2", + "NUMPAD_3", + "NUMPAD_4", + "NUMPAD_5", + "NUMPAD_6", + "NUMPAD_7", + "NUMPAD_8", + "NUMPAD_9", + "NUMPAD_DIVIDE", + "NUMPAD_MULTIPLY", + "NUMPAD_SUBTRACT", + "NUMPAD_ADD", + "NUMPAD_DOT", + "NUMPAD_COMMA", + "NUMPAD_ENTER", + "NUMPAD_EQUALS", + "NUMPAD_LEFT_PAREN", + "NUMPAD_RIGHT_PAREN", + "VOLUME_MUTE", + "INFO", + "CHANNEL_UP", + "CHANNEL_DOWN", + "ZOOM_IN", + "ZOOM_OUT", + "WINDOW", + "GUIDE", + "BOOKMARK", + "CAPTIONS", + "SETTINGS", + "APP_SWITCH", + "LANGUAGE_SWITCH", + "CONTACTS", + "CALENDAR", + "MUSIC", + "CALCULATOR", + "ASSIST", + "BRIGHTNESS_DOWN", + "BRIGHTNESS_UP", + "MEDIA_AUDIO_TRACK", + "SLEEP", + "WAKEUP", + "PAIRING", + "MEDIA_TOP_MENU", + "LAST_CHANNEL", + "TV_DATA_SERVICE", + "VOICE_ASSIST", + "HELP", + "NAVIGATE_PREVIOUS", + "NAVIGATE_NEXT", + "NAVIGATE_IN", + "NAVIGATE_OUT", + "DPAD_UP_LEFT", + "DPAD_DOWN_LEFT", + "DPAD_UP_RIGHT", + "DPAD_DOWN_RIGHT", + "MEDIA_SKIP_FORWARD", + "MEDIA_SKIP_BACKWARD", + "MEDIA_STEP_FORWARD", + "MEDIA_STEP_BACKWARD", + "SOFT_SLEEP", + "CUT", + "COPY", + "PASTE", + "ALL_APPS", + "REFRESH", ] diff --git a/src/askui/tools/android/agent_os_handler.py b/src/askui/tools/android/agent_os_handler.py index c5e7cf2b..f1269092 100644 --- a/src/askui/tools/android/agent_os_handler.py +++ b/src/askui/tools/android/agent_os_handler.py @@ -7,30 +7,30 @@ from askui.utils.image_utils import scale_coordinates_back, scale_image_with_padding -class AndroidAgentOSHandler(AndroidAgentOs): +class AndroidAgentOsHandler(AndroidAgentOs): """ This class is used to handle the AndroidAgentOs class. It is used to scale the coordinates to the target resolution and back to the real screen resolution. """ - def __init__(self, os_agent: AndroidAgentOs, reporter: Reporter) -> None: - self._os_agent: AndroidAgentOs = os_agent + def __init__(self, agent_os: AndroidAgentOs, reporter: Reporter) -> None: + self._agent_os: AndroidAgentOs = agent_os self._reporter: Reporter = reporter self._target_resolution: Tuple[int, int] = (1280, 800) self._real_screen_resolution: Optional[Tuple[int, int]] = None def connect(self) -> None: - self._os_agent.connect() + self._agent_os.connect() self._reporter.add_message("AndroidAgentOS", "Connected to device") - self._real_screen_resolution = self._os_agent.screenshot().size + self._real_screen_resolution = self._agent_os.screenshot().size def disconnect(self) -> None: - self._os_agent.disconnect() + self._agent_os.disconnect() self._real_screen_resolution = None def screenshot(self) -> Image.Image: - screenshot = self._os_agent.screenshot() + screenshot = self._agent_os.screenshot() self._real_screen_resolution = screenshot.size scaled_image = scale_image_with_padding( screenshot, @@ -43,7 +43,7 @@ def screenshot(self) -> Image.Image: def _scale_coordinates_back(self, x: int, y: int) -> Tuple[int, int]: if self._real_screen_resolution is None: - self._real_screen_resolution = self._os_agent.screenshot().size + self._real_screen_resolution = self._agent_os.screenshot().size scaled_x, scaled_y = scale_coordinates_back( x, @@ -57,7 +57,7 @@ def _scale_coordinates_back(self, x: int, y: int) -> Tuple[int, int]: def tap(self, x: int, y: int) -> None: scaled_x, scaled_y = self._scale_coordinates_back(x, y) - self._os_agent.tap(scaled_x, scaled_y) + self._agent_os.tap(scaled_x, scaled_y) self._reporter.add_message("AndroidAgentOS", f"Tapped on {x}, {y}") def swipe( @@ -65,7 +65,7 @@ def swipe( ) -> None: scaled_x1, scaled_y1 = self._scale_coordinates_back(x1, y1) scaled_x2, scaled_y2 = self._scale_coordinates_back(x2, y2) - self._os_agent.swipe(scaled_x1, scaled_y1, scaled_x2, scaled_y2, duration_in_ms) + self._agent_os.swipe(scaled_x1, scaled_y1, scaled_x2, scaled_y2, duration_in_ms) self._reporter.add_message( "AndroidAgentOS", f"Swiped from {x1}, {y1} to {x2}, {y2}" ) @@ -75,7 +75,7 @@ def drag_and_drop( ) -> None: scaled_x1, scaled_y1 = self._scale_coordinates_back(x1, y1) scaled_x2, scaled_y2 = self._scale_coordinates_back(x2, y2) - self._os_agent.drag_and_drop( + self._agent_os.drag_and_drop( scaled_x1, scaled_y1, scaled_x2, scaled_y2, duration_in_ms ) self._reporter.add_message( @@ -84,29 +84,29 @@ def drag_and_drop( ) def type(self, text: str) -> None: - self._os_agent.type(text) + self._agent_os.type(text) self._reporter.add_message("AndroidAgentOS", f"Typed {text}") def key_tap(self, key: ANDROID_KEY) -> None: - self._os_agent.key_tap(key) + self._agent_os.key_tap(key) self._reporter.add_message("AndroidAgentOS", f"Tapped on {key}") def key_combination( self, keys: List[ANDROID_KEY], duration_in_ms: int = 100 ) -> None: - self._os_agent.key_combination(keys, duration_in_ms) + self._agent_os.key_combination(keys, duration_in_ms) self._reporter.add_message( "AndroidAgentOS", f"Tapped on Keys: {keys}", ) def shell(self, command: str) -> str: - shell_output = self._os_agent.shell(command) + shell_output = self._agent_os.shell(command) self._reporter.add_message("AndroidAgentOS", f"Ran shell command: {command}") return shell_output def get_connected_displays(self) -> list[AndroidDisplay]: - displays = self._os_agent.get_connected_displays() + displays = self._agent_os.get_connected_displays() self._reporter.add_message( "AndroidAgentOS", f"Retrieved connected displays, length: {len(displays)}", @@ -114,33 +114,33 @@ def get_connected_displays(self) -> list[AndroidDisplay]: return displays def set_display_by_index(self, display_index: int = 0) -> None: - self._os_agent.set_display_by_index(display_index) + self._agent_os.set_display_by_index(display_index) self._real_screen_resolution = None self._reporter.add_message( "AndroidAgentOS", f"Set display by index: {display_index}" ) def set_display_by_id(self, display_id: int) -> None: - self._os_agent.set_display_by_id(display_id) + self._agent_os.set_display_by_id(display_id) self._real_screen_resolution = None self._reporter.add_message("AndroidAgentOS", f"Set display by id: {display_id}") def set_display_by_name(self, display_name: str) -> None: - self._os_agent.set_display_by_name(display_name) + self._agent_os.set_display_by_name(display_name) self._real_screen_resolution = None self._reporter.add_message( "AndroidAgentOS", f"Set display by name: {display_name}" ) def set_device_by_index(self, device_index: int = 0) -> None: - self._os_agent.set_device_by_index(device_index) + self._agent_os.set_device_by_index(device_index) self._real_screen_resolution = None self._reporter.add_message( "AndroidAgentOS", f"Set device by index: {device_index}" ) def set_device_by_serial_number(self, device_serial_number: str) -> None: - self._os_agent.set_device_by_serial_number(device_serial_number) + self._agent_os.set_device_by_serial_number(device_serial_number) self._real_screen_resolution = None self._reporter.add_message( "AndroidAgentOS", f"Set device by serial number: {device_serial_number}" diff --git a/src/askui/tools/android/ppadb_agent_os.py b/src/askui/tools/android/ppadb_agent_os.py index 8b30903f..32ae2561 100644 --- a/src/askui/tools/android/ppadb_agent_os.py +++ b/src/askui/tools/android/ppadb_agent_os.py @@ -200,7 +200,7 @@ def key_tap(self, key: ANDROID_KEY) -> None: raise RuntimeError(error_msg_invalid_key) assert self._selected_display is not None display_index: int = self._selected_display.display_index - self.shell(f"input -d {display_index} keyevent {key.upper()}") + self.shell(f"input -d {display_index} keyevent {key}") def key_combination( self, keys: List[ANDROID_KEY], duration_in_ms: int = 100 @@ -213,7 +213,7 @@ def key_combination( error_msg_too_few: str = "Key combination must contain at least 2 keys" raise RuntimeError(error_msg_too_few) - keys_string = " ".join(key.upper() for key in keys) + keys_string = " ".join(keys) assert self._selected_display is not None display_index: int = self._selected_display.display_index self.shell( diff --git a/src/askui/tools/android/tools.py b/src/askui/tools/android/tools.py index cea9b1d3..b49b27ea 100644 --- a/src/askui/tools/android/tools.py +++ b/src/askui/tools/android/tools.py @@ -5,7 +5,7 @@ from askui.models.shared.tools import Tool from askui.tools.android.agent_os import ANDROID_KEY -from askui.tools.android.agent_os_handler import AndroidAgentOSHandler +from askui.tools.android.agent_os_handler import AndroidAgentOsHandler class AndroidScreenshotTool(Tool): @@ -13,7 +13,7 @@ class AndroidScreenshotTool(Tool): Takes a screenshot from the currently connected Android device. """ - def __init__(self, os_agent_handler: AndroidAgentOSHandler) -> None: + def __init__(self, agent_os_handler: AndroidAgentOsHandler) -> None: super().__init__( name="android_screenshot_tool", description=( @@ -25,11 +25,11 @@ def __init__(self, os_agent_handler: AndroidAgentOSHandler) -> None: """ ), ) - self._os_agent_handler = os_agent_handler + self._agent_os_handler = agent_os_handler @override def __call__(self) -> tuple[str, Image.Image]: - screenshot = self._os_agent_handler.screenshot() + screenshot = self._agent_os_handler.screenshot() return "Screenshot was taken.", screenshot @@ -41,7 +41,7 @@ class AndroidTapTool(Tool): The top left corner of the screen is (0, 0). """ - def __init__(self, os_agent_handler: AndroidAgentOSHandler) -> None: + def __init__(self, agent_os_handler: AndroidAgentOsHandler) -> None: super().__init__( name="android_tap_tool", description=( @@ -66,11 +66,11 @@ def __init__(self, os_agent_handler: AndroidAgentOSHandler) -> None: "required": ["x", "y"], }, ) - self._os_agent_handler = os_agent_handler + self._agent_os_handler = agent_os_handler @override def __call__(self, x: int, y: int) -> str: - self._os_agent_handler.tap(x, y) + self._agent_os_handler.tap(x, y) return f"Tapped at ({x}, {y})" @@ -79,7 +79,7 @@ class AndroidTypeTool(Tool): Types the given text on the Android device screen. """ - def __init__(self, os_agent_handler: AndroidAgentOSHandler) -> None: + def __init__(self, agent_os_handler: AndroidAgentOsHandler) -> None: super().__init__( name="android_type_tool", description=( @@ -103,11 +103,11 @@ def __init__(self, os_agent_handler: AndroidAgentOSHandler) -> None: "required": ["text"], }, ) - self._os_agent_handler = os_agent_handler + self._agent_os_handler = agent_os_handler @override def __call__(self, text: str) -> str: - self._os_agent_handler.type(text) + self._agent_os_handler.type(text) return f"Typed: {text}" @@ -116,8 +116,8 @@ class AndroidDragAndDropTool(Tool): Performs a drag and drop gesture on the Android device screen. """ - def __init__(self, os_agent_handler: AndroidAgentOSHandler) -> None: - self._os_agent_handler = os_agent_handler + def __init__(self, agent_os_handler: AndroidAgentOsHandler) -> None: + self._agent_os_handler = agent_os_handler super().__init__( name="android_drag_and_drop_tool", description=( @@ -163,12 +163,12 @@ def __init__(self, os_agent_handler: AndroidAgentOSHandler) -> None: @override def __call__(self, x1: int, y1: int, x2: int, y2: int, duration: int = 1000) -> str: - self._os_agent_handler.drag_and_drop(x1, y1, x2, y2, duration) + self._agent_os_handler.drag_and_drop(x1, y1, x2, y2, duration) return f"Dragged and dropped from ({x1}, {y1}) to ({x2}, {y2}) in {duration}ms" class AndroidKeyTapEventTool(Tool): - def __init__(self, os_agent_handler: AndroidAgentOSHandler) -> None: + def __init__(self, agent_os_handler: AndroidAgentOsHandler) -> None: super().__init__( name="android_key_event_tool", description=( @@ -192,11 +192,11 @@ def __init__(self, os_agent_handler: AndroidAgentOSHandler) -> None: "required": ["key_name"], }, ) - self._os_agent_handler = os_agent_handler + self._agent_os_handler = agent_os_handler @override def __call__(self, key_name: ANDROID_KEY) -> str: - self._os_agent_handler.key_tap(key_name) + self._agent_os_handler.key_tap(key_name) return f"Tapped on Key: {key_name}" @@ -205,7 +205,7 @@ class AndroidSwipeTool(Tool): Performs a swipe gesture on the Android device screen. """ - def __init__(self, os_agent_handler: AndroidAgentOSHandler) -> None: + def __init__(self, agent_os_handler: AndroidAgentOsHandler) -> None: super().__init__( name="android_swipe_tool", description=( @@ -265,11 +265,11 @@ def __init__(self, os_agent_handler: AndroidAgentOSHandler) -> None: "required": ["x1", "y1", "x2", "y2"], }, ) - self._os_agent_handler = os_agent_handler + self._agent_os_handler = agent_os_handler @override def __call__(self, x1: int, y1: int, x2: int, y2: int, duration: int = 1000) -> str: - self._os_agent_handler.swipe(x1, y1, x2, y2, duration) + self._agent_os_handler.swipe(x1, y1, x2, y2, duration) return f"Swiped from ({x1}, {y1}) to ({x2}, {y2}) in {duration}ms" @@ -278,7 +278,7 @@ class AndroidKeyCombinationTool(Tool): Performs a key combination on the Android device. """ - def __init__(self, os_agent_handler: AndroidAgentOSHandler) -> None: + def __init__(self, agent_os_handler: AndroidAgentOsHandler) -> None: super().__init__( name="android_key_combination_tool", description=( @@ -321,11 +321,11 @@ def __init__(self, os_agent_handler: AndroidAgentOSHandler) -> None: "required": ["keys"], }, ) - self._os_agent_handler = os_agent_handler + self._agent_os_handler = agent_os_handler @override def __call__(self, keys: list[ANDROID_KEY], duration: int = 100) -> str: - self._os_agent_handler.key_combination(keys, duration) + self._agent_os_handler.key_combination(keys, duration) return f"Performed key combination: {keys}" @@ -334,7 +334,7 @@ class AndroidShellTool(Tool): Executes a shell command on the Android device. """ - def __init__(self, os_agent_handler: AndroidAgentOSHandler) -> None: + def __init__(self, agent_os_handler: AndroidAgentOsHandler) -> None: super().__init__( name="android_shell_tool", description=( @@ -363,9 +363,9 @@ def __init__(self, os_agent_handler: AndroidAgentOSHandler) -> None: "required": ["command"], }, ) - self._os_agent_handler = os_agent_handler + self._agent_os_handler = agent_os_handler @override def __call__(self, command: str) -> str: - output = self._os_agent_handler.shell(command) + output = self._agent_os_handler.shell(command) return f"Shell command executed. Output: {output}" diff --git a/tests/integration/test_custom_models.py b/tests/integration/test_custom_models.py index 086cf93b..a64525ba 100644 --- a/tests/integration/test_custom_models.py +++ b/tests/integration/test_custom_models.py @@ -19,7 +19,6 @@ from askui.models import ModelComposition, ModelDefinition, ModelName from askui.models.shared.computer_agent_cb_param import OnMessageCb from askui.models.shared.computer_agent_message_param import MessageParam -from askui.models.shared.tools import Tool from askui.tools.toolbox import AgentToolbox from askui.utils.image_utils import ImageSource @@ -41,14 +40,6 @@ def act( self.goals.append([message.model_dump(mode="json") for message in messages]) self.model_choices.append(model_choice) - @override - def add_tool(self, tool: Tool) -> None: - pass - - @override - def set_tools(self, tools: list[Tool]) -> None: - pass - class SimpleGetModel(GetModel): """Simple get model that returns a fixed response.""" @@ -218,14 +209,6 @@ def act( ) -> None: pass - @override - def add_tool(self, tool: Tool) -> None: - pass - - @override - def set_tools(self, tools: list[Tool]) -> None: - pass - registry: ModelRegistry = { "act-1": act_model, "act-2": AnotherActModel(), From b880ddca473620841ab5bfe42c43827f73740522 Mon Sep 17 00:00:00 2001 From: Samir mlika <105347215+mlikasam-askui@users.noreply.github.com> Date: Mon, 23 Jun 2025 11:12:20 +0200 Subject: [PATCH 06/12] change android agent icon --- src/chat/api/assistants/seeds.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/chat/api/assistants/seeds.py b/src/chat/api/assistants/seeds.py index 98da896a..5d7bf97b 100644 --- a/src/chat/api/assistants/seeds.py +++ b/src/chat/api/assistants/seeds.py @@ -15,5 +15,5 @@ ANDROID_VISION_AGENT = Assistant( id="asst_78da09fbf1ed43c7826fb1686f89f541", name="AskUI Android Vision Agent", - avatar="data:image/svg+xml;base64,PHN2ZyB3aWR0aD0iMjAwIiBoZWlnaHQ9IjMwMCIgdmlld0JveD0iMCAwIDIwMCAzMDAiIHhtbG5zPSJodHRwOi8vd3d3LnczLm9yZy8yMDAwL3N2ZyIgZmlsbD0ibm9uZSIgc3Ryb2tlPSJibGFjayIgc3Ryb2tlLXdpZHRoPSIzIj4KICA8IS0tIEhlYWQgLS0+CiAgPHJlY3QgeD0iNTAiIHk9IjMwIiB3aWR0aD0iMTAwIiBoZWlnaHQ9IjYwIiByeD0iMTAiIHJ5PSIxMCIgLz4KCiAgPCEtLSBFeWVzIC0tPgogIDxjaXJjbGUgY3g9IjcwIiBjeT0iNTAiIHI9IjUiIC8+CiAgPGNpcmNsZSBjeD0iMTMwIiBjeT0iNTAiIHI9IjUiIC8+CgogIDwhLS0gQW50ZW5uYXMgLS0+CiAgPGxpbmUgeDE9IjYwIiB5MT0iMzAiIHgyPSI0MCIgeTI9IjEwIiAvPgogIDxsaW5lIHgxPSIxNDAiIHkxPSIzMCIgeDI9IjE2MCIgeTI9IjEwIiAvPgoKICA8IS0tIEJvZHkgLS0+CiAgPHJlY3QgeD0iNDAiIHk9IjkwIiB3aWR0aD0iMTIwIiBoZWlnaHQ9IjEyMCIgcng9IjIwIiByeT0iMjAiIC8+CgogIDwhLS0gIkFza1VJIEFuZHJvaWRBZ2VudCIgbGFiZWwgLS0+CiAgPHRleHQgeD0iMTAwIiB5PSIxNjAiIHRleHQtYW5jaG9yPSJtaWRkbGUiIGZvbnQtc2l6ZT0iMTYiIGZpbGw9ImJsYWNrIiBzdHJva2U9Im5vbmUiIGZvbnQtZmFtaWx5PSJBcmlhbCI+QXNrVUk8L3RleHQ+CiAgPHRleHQgeD0iMTAwIiB5PSIxODAiIHRleHQtYW5jaG9yPSJtaWRkbGUiIGZvbnQtc2l6ZT0iMTIiIGZpbGw9ImJsYWNrIiBzdHJva2U9Im5vbmUiIGZvbnQtZmFtaWx5PSJBcmlhbCI+QW5kcm9pZEFnZW50PC90ZXh0PgoKICA8IS0tIEFybXMgLS0+CiAgPHJlY3QgeD0iMTAiIHk9IjkwIiB3aWR0aD0iMjAiIGhlaWdodD0iMTAwIiByeD0iMTAiIC8+CiAgPHJlY3QgeD0iMTcwIiB5PSI5MCIgd2lkdGg9IjIwIiBoZWlnaHQ9IjEwMCIgcng9IjEwIiAvPgoKICA8IS0tIExlZ3MgLS0+CiAgPHJlY3QgeD0iNjAiIHk9IjIxMCIgd2lkdGg9IjIwIiBoZWlnaHQ9IjYwIiByeD0iNSIgLz4KICA8cmVjdCB4PSIxMjAiIHk9IjIxMCIgd2lkdGg9IjIwIiBoZWlnaHQ9IjYwIiByeD0iNSIgLz4KPC9zdmc+", + avatar="data:image/svg+xml;base64,PHN2ZyB4bWxucz0iaHR0cDovL3d3dy53My5vcmcvMjAwMC9zdmciICB2aWV3Qm94PSIwIDAgNDggNDgiIHdpZHRoPSIyNXB4IiBoZWlnaHQ9IjI1cHgiPjxwYXRoIGQ9Ik0gMzIuNTE5NTMxIDAuOTgyNDIxODggQSAxLjUwMDE1IDEuNTAwMTUgMCAwIDAgMzEuMjc5Mjk3IDEuNjI4OTA2MiBMIDI5LjQzNzUgNC4yMDg5ODQ0IEMgMjcuNzgwMjA3IDMuNDQwNTAwNiAyNS45NDE5MSAzIDI0IDMgQyAyMi4wNTgwOSAzIDIwLjIxOTc5MyAzLjQ0MDUwMDYgMTguNTYyNSA0LjIwODk4NDQgTCAxNi43MjA3MDMgMS42Mjg5MDYyIEEgMS41MDAxNSAxLjUwMDE1IDAgMCAwIDE1LjQzNTU0NyAwLjk4NDM3NSBBIDEuNTAwMTUgMS41MDAxNSAwIDAgMCAxNC4yNzkyOTcgMy4zNzEwOTM4IEwgMTYgNS43NzkyOTY5IEMgMTMuMTM4ODk2IDguMDI0NzU4MiAxMS4yNDUxODggMTEuNDM2MDIgMTEuMDM1MTU2IDE1LjI5MTAxNiBDIDEwLjU1MzI2IDE1LjExMjgxOCAxMC4wNDA0MDggMTUgOS41IDE1IEMgNy4wMzI0OTkxIDE1IDUgMTcuMDMyNDk5IDUgMTkuNSBMIDUgMzAuNSBDIDUgMzIuOTY3NTAxIDcuMDMyNDk5MSAzNSA5LjUgMzUgQyAxMC4wOTAzMTMgMzUgMTAuNjUzMjI5IDM0Ljg3ODc0OSAxMS4xNzE4NzUgMzQuNjY3OTY5IEMgMTEuNTY0MzM2IDM2LjA3MjEwNSAxMi42MzEzMzMgMzcuMTk2OTk0IDE0IDM3LjY5MzM1OSBMIDE0IDQxLjUgQyAxNCA0My45Njc1MDEgMTYuMDMyNDk5IDQ2IDE4LjUgNDYgQyAyMC45Njc1MDEgNDYgMjMgNDMuOTY3NTAxIDIzIDQxLjUgTCAyMyAzOCBMIDI1IDM4IEwgMjUgNDEuNSBDIDI1IDQzLjk2NzUwMSAyNy4wMzI0OTkgNDYgMjkuNSA0NiBDIDMxLjk2NzUwMSA0NiAzNCA0My45Njc1MDEgMzQgNDEuNSBMIDM0IDM3LjY5MzM1OSBDIDM1LjM2ODY2NyAzNy4xOTY5OTQgMzYuNDM1NjY0IDM2LjA3MjEwNSAzNi44MjgxMjUgMzQuNjY3OTY5IEMgMzcuMzQ2NzcxIDM0Ljg3ODc0OSAzNy45MDk2ODcgMzUgMzguNSAzNSBDIDQwLjk2NzUwMSAzNSA0MyAzMi45Njc1MDEgNDMgMzAuNSBMIDQzIDE5LjUgQyA0MyAxNy4wMzI0OTkgNDAuOTY3NTAxIDE1IDM4LjUgMTUgQyAzNy45NTk1OTIgMTUgMzcuNDQ2NzQgMTUuMTEyODE4IDM2Ljk2NDg0NCAxNS4yOTEwMTYgQyAzNi43NTQ4MTIgMTEuNDM2MDIgMzQuODYxMTA0IDguMDI0NzU4MiAzMiA1Ljc3OTI5NjkgTCAzMy43MjA3MDMgMy4zNzEwOTM4IEEgMS41MDAxNSAxLjUwMDE1IDAgMCAwIDMyLjUxOTUzMSAwLjk4MjQyMTg4IHogTSAyNCA2IEMgMjkuMTg1MTI3IDYgMzMuMjc2NzI3IDkuOTU3NTEzMiAzMy43OTg4MjggMTUgTCAxNC4yMDExNzIgMTUgQyAxNC43MjMyNzMgOS45NTc1MTMyIDE4LjgxNDg3MyA2IDI0IDYgeiBNIDE5LjUgMTAgQSAxLjUgMS41IDAgMCAwIDE5LjUgMTMgQSAxLjUgMS41IDAgMCAwIDE5LjUgMTAgeiBNIDI4LjUgMTAgQSAxLjUgMS41IDAgMCAwIDI4LjUgMTMgQSAxLjUgMS41IDAgMCAwIDI4LjUgMTAgeiBNIDkuNSAxOCBDIDEwLjM0NjQ5OSAxOCAxMSAxOC42NTM1MDEgMTEgMTkuNSBMIDExIDMwLjUgQyAxMSAzMS4zNDY0OTkgMTAuMzQ2NDk5IDMyIDkuNSAzMiBDIDguNjUzNTAwOSAzMiA4IDMxLjM0NjQ5OSA4IDMwLjUgTCA4IDE5LjUgQyA4IDE4LjY1MzUwMSA4LjY1MzUwMDkgMTggOS41IDE4IHogTSAxNCAxOCBMIDM0IDE4IEwgMzQgMTkuNSBMIDM0IDMwLjUgTCAzNCAzMy41IEMgMzQgMzQuMzQ2NDk5IDMzLjM0NjQ5OSAzNSAzMi41IDM1IEwgMjUgMzUgTCAyMyAzNSBMIDE1LjUgMzUgQyAxNC42NTM1MDEgMzUgMTQgMzQuMzQ2NDk5IDE0IDMzLjUgTCAxNCAzMC41IEwgMTQgMTkuNSBMIDE0IDE4IHogTSAzOC41IDE4IEMgMzkuMzQ2NDk5IDE4IDQwIDE4LjY1MzUwMSA0MCAxOS41IEwgNDAgMzAuNSBDIDQwIDMxLjM0NjQ5OSAzOS4zNDY0OTkgMzIgMzguNSAzMiBDIDM3LjY1MzUwMSAzMiAzNyAzMS4zNDY0OTkgMzcgMzAuNSBMIDM3IDE5LjUgQyAzNyAxOC42NTM1MDEgMzcuNjUzNTAxIDE4IDM4LjUgMTggeiBNIDE3IDM4IEwgMjAgMzggTCAyMCA0MS41IEMgMjAgNDIuMzQ2NDk5IDE5LjM0NjQ5OSA0MyAxOC41IDQzIEMgMTcuNjUzNTAxIDQzIDE3IDQyLjM0NjQ5OSAxNyA0MS41IEwgMTcgMzggeiBNIDI4IDM4IEwgMzEgMzggTCAzMSA0MS41IEMgMzEgNDIuMzQ2NDk5IDMwLjM0NjQ5OSA0MyAyOS41IDQzIEMgMjguNjUzNTAxIDQzIDI4IDQyLjM0NjQ5OSAyOCA0MS41IEwgMjggMzggeiIvPjwvc3ZnPg==", ) From 77deb5458ca41e11b90949f03c8ea7d4ca71b629 Mon Sep 17 00:00:00 2001 From: Samir mlika <105347215+mlikasam-askui@users.noreply.github.com> Date: Mon, 23 Jun 2025 11:15:10 +0200 Subject: [PATCH 07/12] rename device serial number to device_sn --- src/askui/android_agent.py | 8 ++++---- src/askui/tools/android/agent_os.py | 2 +- src/askui/tools/android/agent_os_handler.py | 6 +++--- src/askui/tools/android/ppadb_agent_os.py | 8 +++----- 4 files changed, 11 insertions(+), 13 deletions(-) diff --git a/src/askui/android_agent.py b/src/askui/android_agent.py index 7869a58c..cd504a19 100644 --- a/src/askui/android_agent.py +++ b/src/askui/android_agent.py @@ -537,18 +537,18 @@ def swipe( self.os.swipe(x1, y1, x2, y2, duration_in_ms) @telemetry.record_call( - exclude={"device_serial_number"}, + exclude={"device_sn"}, ) @validate_call def set_device_by_serial_number( self, - device_serial_number: str, + device_sn: str, ) -> None: """ Sets the active device for screen interactions by name. Args: - device_serial_number (str): The serial number of the device to set as active. + device_sn (str): The serial number of the device to set as active. Example: ```python @@ -557,7 +557,7 @@ def set_device_by_serial_number( with AndroidVisionAgent() as agent: agent.set_device_by_serial_number("Pixel 6") # Sets the active device to the Pixel 6 """ - self.os.set_device_by_serial_number(device_serial_number) + self.os.set_device_by_serial_number(device_sn) @telemetry.record_call(exclude={"goal", "on_message"}) @validate_call diff --git a/src/askui/tools/android/agent_os.py b/src/askui/tools/android/agent_os.py index 00419ae6..7fecc2fb 100644 --- a/src/askui/tools/android/agent_os.py +++ b/src/askui/tools/android/agent_os.py @@ -375,7 +375,7 @@ def set_device_by_index(self, device_index: int = 0) -> None: raise NotImplementedError @abstractmethod - def set_device_by_serial_number(self, device_serial_number: str) -> None: + def set_device_by_serial_number(self, device_sn: str) -> None: """ Sets the active device for screen interactions by serial number. """ diff --git a/src/askui/tools/android/agent_os_handler.py b/src/askui/tools/android/agent_os_handler.py index f1269092..d40283cf 100644 --- a/src/askui/tools/android/agent_os_handler.py +++ b/src/askui/tools/android/agent_os_handler.py @@ -139,9 +139,9 @@ def set_device_by_index(self, device_index: int = 0) -> None: "AndroidAgentOS", f"Set device by index: {device_index}" ) - def set_device_by_serial_number(self, device_serial_number: str) -> None: - self._agent_os.set_device_by_serial_number(device_serial_number) + def set_device_by_serial_number(self, device_sn: str) -> None: + self._agent_os.set_device_by_serial_number(device_sn) self._real_screen_resolution = None self._reporter.add_message( - "AndroidAgentOS", f"Set device by serial number: {device_serial_number}" + "AndroidAgentOS", f"Set device by serial number: {device_sn}" ) diff --git a/src/askui/tools/android/ppadb_agent_os.py b/src/askui/tools/android/ppadb_agent_os.py index 32ae2561..117db355 100644 --- a/src/askui/tools/android/ppadb_agent_os.py +++ b/src/askui/tools/android/ppadb_agent_os.py @@ -111,14 +111,14 @@ def set_device_by_index(self, device_index: int = 0) -> None: self._device = devices[device_index] self.set_display_by_index(0) - def set_device_by_serial_number(self, device_serial_number: str) -> None: + def set_device_by_serial_number(self, device_sn: str) -> None: devices = self._get_connected_devices() for device in devices: - if device.serial == device_serial_number: + if device.serial == device_sn: self._device = device self.set_display_by_index(0) return - msg = f"Device name {device_serial_number} not found" + msg = f"Device name {device_sn} not found" raise RuntimeError(msg) def screenshot(self) -> Image.Image: @@ -189,9 +189,7 @@ def type(self, text: str) -> None: display_index: int = self._selected_display.display_index escaped_text = shlex.quote(text) - shell_safe_text = escaped_text.replace(" ", "%s") - self.shell(f"input -d {display_index} text {shell_safe_text}") def key_tap(self, key: ANDROID_KEY) -> None: From 3153861209569687393c0f5922d9316c181f04de Mon Sep 17 00:00:00 2001 From: Samir mlika <105347215+mlikasam-askui@users.noreply.github.com> Date: Mon, 23 Jun 2025 11:24:22 +0200 Subject: [PATCH 08/12] hide act_agent_os_handler --- src/askui/android_agent.py | 18 +++++++++--------- 1 file changed, 9 insertions(+), 9 deletions(-) diff --git a/src/askui/android_agent.py b/src/askui/android_agent.py index cd504a19..7631d422 100644 --- a/src/askui/android_agent.py +++ b/src/askui/android_agent.py @@ -60,17 +60,17 @@ def __init__( configure_logging(level=log_level) self.os = PpadbAgentOs() self._reporter = CompositeReporter(reporters=reporters) - self.act_agent_os_handler = AndroidAgentOsHandler(self.os, self._reporter) + self._act_agent_os_handler = AndroidAgentOsHandler(self.os, self._reporter) self.act_tool_collection = ToolCollection( tools=[ - AndroidScreenshotTool(self.act_agent_os_handler), - AndroidTapTool(self.act_agent_os_handler), - AndroidTypeTool(self.act_agent_os_handler), - AndroidDragAndDropTool(self.act_agent_os_handler), - AndroidKeyTapEventTool(self.act_agent_os_handler), - AndroidSwipeTool(self.act_agent_os_handler), - AndroidKeyCombinationTool(self.act_agent_os_handler), - AndroidShellTool(self.act_agent_os_handler), + AndroidScreenshotTool(self._act_agent_os_handler), + AndroidTapTool(self._act_agent_os_handler), + AndroidTypeTool(self._act_agent_os_handler), + AndroidDragAndDropTool(self._act_agent_os_handler), + AndroidKeyTapEventTool(self._act_agent_os_handler), + AndroidSwipeTool(self._act_agent_os_handler), + AndroidKeyCombinationTool(self._act_agent_os_handler), + AndroidShellTool(self._act_agent_os_handler), ExceptionTool(), ] ) From bc52d6d51b81b082af116b0403a2cd5d9eb4d832 Mon Sep 17 00:00:00 2001 From: Samir Mlika Date: Mon, 23 Jun 2025 16:07:15 +0200 Subject: [PATCH 09/12] rename AndroidAgentOsHandler to AndroidAgentOsFacade --- src/askui/android_agent.py | 4 +- ...agent_os_handler.py => agent_os_facade.py} | 4 +- src/askui/tools/android/tools.py | 50 +++++++++---------- 3 files changed, 29 insertions(+), 29 deletions(-) rename src/askui/tools/android/{agent_os_handler.py => agent_os_facade.py} (98%) diff --git a/src/askui/android_agent.py b/src/askui/android_agent.py index 7631d422..cc42506c 100644 --- a/src/askui/android_agent.py +++ b/src/askui/android_agent.py @@ -12,7 +12,7 @@ from askui.models.shared.computer_agent_message_param import MessageParam from askui.models.shared.tools import ToolCollection from askui.tools.android.agent_os import ANDROID_KEY -from askui.tools.android.agent_os_handler import AndroidAgentOsHandler +from askui.tools.android.agent_os_facade import AndroidAgentOsFacade from askui.tools.android.ppadb_agent_os import PpadbAgentOs from askui.tools.android.tools import ( AndroidDragAndDropTool, @@ -60,7 +60,7 @@ def __init__( configure_logging(level=log_level) self.os = PpadbAgentOs() self._reporter = CompositeReporter(reporters=reporters) - self._act_agent_os_handler = AndroidAgentOsHandler(self.os, self._reporter) + self._act_agent_os_handler = AndroidAgentOsFacade(self.os, self._reporter) self.act_tool_collection = ToolCollection( tools=[ AndroidScreenshotTool(self._act_agent_os_handler), diff --git a/src/askui/tools/android/agent_os_handler.py b/src/askui/tools/android/agent_os_facade.py similarity index 98% rename from src/askui/tools/android/agent_os_handler.py rename to src/askui/tools/android/agent_os_facade.py index d40283cf..85ec1712 100644 --- a/src/askui/tools/android/agent_os_handler.py +++ b/src/askui/tools/android/agent_os_facade.py @@ -7,9 +7,9 @@ from askui.utils.image_utils import scale_coordinates_back, scale_image_with_padding -class AndroidAgentOsHandler(AndroidAgentOs): +class AndroidAgentOsFacade(AndroidAgentOs): """ - This class is used to handle the AndroidAgentOs class. + This class is a facade for the AndroidAgentOs class. It is used to scale the coordinates to the target resolution and back to the real screen resolution. """ diff --git a/src/askui/tools/android/tools.py b/src/askui/tools/android/tools.py index b49b27ea..9ba91658 100644 --- a/src/askui/tools/android/tools.py +++ b/src/askui/tools/android/tools.py @@ -5,7 +5,7 @@ from askui.models.shared.tools import Tool from askui.tools.android.agent_os import ANDROID_KEY -from askui.tools.android.agent_os_handler import AndroidAgentOsHandler +from askui.tools.android.agent_os_facade import AndroidAgentOsFacade class AndroidScreenshotTool(Tool): @@ -13,7 +13,7 @@ class AndroidScreenshotTool(Tool): Takes a screenshot from the currently connected Android device. """ - def __init__(self, agent_os_handler: AndroidAgentOsHandler) -> None: + def __init__(self, agent_os_facade: AndroidAgentOsFacade) -> None: super().__init__( name="android_screenshot_tool", description=( @@ -25,11 +25,11 @@ def __init__(self, agent_os_handler: AndroidAgentOsHandler) -> None: """ ), ) - self._agent_os_handler = agent_os_handler + self._agent_os_facade = agent_os_facade @override def __call__(self) -> tuple[str, Image.Image]: - screenshot = self._agent_os_handler.screenshot() + screenshot = self._agent_os_facade.screenshot() return "Screenshot was taken.", screenshot @@ -41,7 +41,7 @@ class AndroidTapTool(Tool): The top left corner of the screen is (0, 0). """ - def __init__(self, agent_os_handler: AndroidAgentOsHandler) -> None: + def __init__(self, agent_os_facade: AndroidAgentOsFacade) -> None: super().__init__( name="android_tap_tool", description=( @@ -66,11 +66,11 @@ def __init__(self, agent_os_handler: AndroidAgentOsHandler) -> None: "required": ["x", "y"], }, ) - self._agent_os_handler = agent_os_handler + self._agent_os_facade = agent_os_facade @override def __call__(self, x: int, y: int) -> str: - self._agent_os_handler.tap(x, y) + self._agent_os_facade.tap(x, y) return f"Tapped at ({x}, {y})" @@ -79,7 +79,7 @@ class AndroidTypeTool(Tool): Types the given text on the Android device screen. """ - def __init__(self, agent_os_handler: AndroidAgentOsHandler) -> None: + def __init__(self, agent_os_facade: AndroidAgentOsFacade) -> None: super().__init__( name="android_type_tool", description=( @@ -103,11 +103,11 @@ def __init__(self, agent_os_handler: AndroidAgentOsHandler) -> None: "required": ["text"], }, ) - self._agent_os_handler = agent_os_handler + self._agent_os_facade = agent_os_facade @override def __call__(self, text: str) -> str: - self._agent_os_handler.type(text) + self._agent_os_facade.type(text) return f"Typed: {text}" @@ -116,8 +116,8 @@ class AndroidDragAndDropTool(Tool): Performs a drag and drop gesture on the Android device screen. """ - def __init__(self, agent_os_handler: AndroidAgentOsHandler) -> None: - self._agent_os_handler = agent_os_handler + def __init__(self, agent_os_facade: AndroidAgentOsFacade) -> None: + self._agent_os_facade = agent_os_facade super().__init__( name="android_drag_and_drop_tool", description=( @@ -163,12 +163,12 @@ def __init__(self, agent_os_handler: AndroidAgentOsHandler) -> None: @override def __call__(self, x1: int, y1: int, x2: int, y2: int, duration: int = 1000) -> str: - self._agent_os_handler.drag_and_drop(x1, y1, x2, y2, duration) + self._agent_os_facade.drag_and_drop(x1, y1, x2, y2, duration) return f"Dragged and dropped from ({x1}, {y1}) to ({x2}, {y2}) in {duration}ms" class AndroidKeyTapEventTool(Tool): - def __init__(self, agent_os_handler: AndroidAgentOsHandler) -> None: + def __init__(self, agent_os_facade: AndroidAgentOsFacade) -> None: super().__init__( name="android_key_event_tool", description=( @@ -192,11 +192,11 @@ def __init__(self, agent_os_handler: AndroidAgentOsHandler) -> None: "required": ["key_name"], }, ) - self._agent_os_handler = agent_os_handler + self._agent_os_facade = agent_os_facade @override def __call__(self, key_name: ANDROID_KEY) -> str: - self._agent_os_handler.key_tap(key_name) + self._agent_os_facade.key_tap(key_name) return f"Tapped on Key: {key_name}" @@ -205,7 +205,7 @@ class AndroidSwipeTool(Tool): Performs a swipe gesture on the Android device screen. """ - def __init__(self, agent_os_handler: AndroidAgentOsHandler) -> None: + def __init__(self, agent_os_facade: AndroidAgentOsFacade) -> None: super().__init__( name="android_swipe_tool", description=( @@ -265,11 +265,11 @@ def __init__(self, agent_os_handler: AndroidAgentOsHandler) -> None: "required": ["x1", "y1", "x2", "y2"], }, ) - self._agent_os_handler = agent_os_handler + self._agent_os_facade = agent_os_facade @override def __call__(self, x1: int, y1: int, x2: int, y2: int, duration: int = 1000) -> str: - self._agent_os_handler.swipe(x1, y1, x2, y2, duration) + self._agent_os_facade.swipe(x1, y1, x2, y2, duration) return f"Swiped from ({x1}, {y1}) to ({x2}, {y2}) in {duration}ms" @@ -278,7 +278,7 @@ class AndroidKeyCombinationTool(Tool): Performs a key combination on the Android device. """ - def __init__(self, agent_os_handler: AndroidAgentOsHandler) -> None: + def __init__(self, agent_os_facade: AndroidAgentOsFacade) -> None: super().__init__( name="android_key_combination_tool", description=( @@ -321,11 +321,11 @@ def __init__(self, agent_os_handler: AndroidAgentOsHandler) -> None: "required": ["keys"], }, ) - self._agent_os_handler = agent_os_handler + self._agent_os_facade = agent_os_facade @override def __call__(self, keys: list[ANDROID_KEY], duration: int = 100) -> str: - self._agent_os_handler.key_combination(keys, duration) + self._agent_os_facade.key_combination(keys, duration) return f"Performed key combination: {keys}" @@ -334,7 +334,7 @@ class AndroidShellTool(Tool): Executes a shell command on the Android device. """ - def __init__(self, agent_os_handler: AndroidAgentOsHandler) -> None: + def __init__(self, agent_os_facade: AndroidAgentOsFacade) -> None: super().__init__( name="android_shell_tool", description=( @@ -363,9 +363,9 @@ def __init__(self, agent_os_handler: AndroidAgentOsHandler) -> None: "required": ["command"], }, ) - self._agent_os_handler = agent_os_handler + self._agent_os_facade = agent_os_facade @override def __call__(self, command: str) -> str: - output = self._agent_os_handler.shell(command) + output = self._agent_os_facade.shell(command) return f"Shell command executed. Output: {output}" From ce5ea57e24beda3b074f084646b464cd66ff3bf2 Mon Sep 17 00:00:00 2001 From: Samir Mlika <105347215+mlikasam-askui@users.noreply.github.com> Date: Mon, 23 Jun 2025 16:14:55 +0200 Subject: [PATCH 10/12] Apply suggestions from code review --- src/askui/tools/android/ppadb_agent_os.py | 1 - 1 file changed, 1 deletion(-) diff --git a/src/askui/tools/android/ppadb_agent_os.py b/src/askui/tools/android/ppadb_agent_os.py index 117db355..d521e852 100644 --- a/src/askui/tools/android/ppadb_agent_os.py +++ b/src/askui/tools/android/ppadb_agent_os.py @@ -187,7 +187,6 @@ def type(self, text: str) -> None: raise RuntimeError(error_msg_nonprintable) assert self._selected_display is not None display_index: int = self._selected_display.display_index - escaped_text = shlex.quote(text) shell_safe_text = escaped_text.replace(" ", "%s") self.shell(f"input -d {display_index} text {shell_safe_text}") From d0c542ca69b81e78f32de3272fa0ff567e0466bb Mon Sep 17 00:00:00 2001 From: Samir Mlika Date: Mon, 23 Jun 2025 17:01:46 +0200 Subject: [PATCH 11/12] revert computer tool --- src/askui/tools/computer.py | 61 ++++++++++++++++++++++++++++++++++--- 1 file changed, 57 insertions(+), 4 deletions(-) diff --git a/src/askui/tools/computer.py b/src/askui/tools/computer.py index 44e1df07..780eca1b 100644 --- a/src/askui/tools/computer.py +++ b/src/askui/tools/computer.py @@ -1,5 +1,5 @@ from abc import ABC -from typing import Annotated, Literal, TypedDict +from typing import Annotated, Literal, TypedDict, get_args from anthropic.types.beta import ( BetaToolComputerUse20241022Param, @@ -13,7 +13,7 @@ from askui.utils.dict_utils import IdentityDefaultDict from askui.utils.image_utils import scale_coordinates_back, scale_image_with_padding -from ..models.shared.tools import Tool +from ..models.shared.tools import InputSchema, Tool Action20241022 = Literal[ "key", @@ -124,10 +124,12 @@ class ComputerToolBase(Tool, ABC): def __init__( self, agent_os: AgentOs, + input_schema: InputSchema, ) -> None: super().__init__( name="computer", description="A tool for interacting with the computer", + input_schema=input_schema, ) self._agent_os = agent_os self._width = 1280 @@ -255,7 +257,29 @@ def __init__( self, agent_os: AgentOs, ) -> None: - super().__init__(agent_os=agent_os) + super().__init__( + agent_os=agent_os, + input_schema={ + "type": "object", + "properties": { + "action": { + "type": "string", + "enum": list(get_args(Action20241022)), + }, + "text": { + "type": "string", + }, + "coordinate": { + "type": "object", + "properties": { + "x": {"type": "integer", "minimum": 0}, + "y": {"type": "integer", "minimum": 0}, + }, + }, + }, + "required": ["action"], + }, + ) @override def to_params( @@ -274,7 +298,36 @@ def __init__( self, agent_os: AgentOs, ) -> None: - super().__init__(agent_os=agent_os) + super().__init__( + agent_os=agent_os, + input_schema={ + "type": "object", + "properties": { + "action": { + "type": "string", + "enum": list(get_args(Action20250124)), + }, + "text": { + "type": "string", + }, + "coordinate": { + "type": "object", + "properties": { + "x": {"type": "integer", "minimum": 0}, + "y": {"type": "integer", "minimum": 0}, + }, + }, + "scroll_direction": { + "type": "string", + "enum": list(get_args(ScrollDirection)), + }, + "scroll_amount": {"type": "integer", "minimum": 0}, + "duration": {"type": "number", "minimum": 0.0, "maximum": 100.0}, + "key": {"type": "string"}, + }, + "required": ["action"], + }, + ) @override def to_params( From 33c49bf2d9162ec0c8b5a204dc657479ecc3b4c4 Mon Sep 17 00:00:00 2001 From: Samir Mlika Date: Mon, 23 Jun 2025 17:03:22 +0200 Subject: [PATCH 12/12] rename _act_agent_os_handler to _act_agent_os_facade --- src/askui/android_agent.py | 18 +++++++++--------- 1 file changed, 9 insertions(+), 9 deletions(-) diff --git a/src/askui/android_agent.py b/src/askui/android_agent.py index cc42506c..05a66068 100644 --- a/src/askui/android_agent.py +++ b/src/askui/android_agent.py @@ -60,17 +60,17 @@ def __init__( configure_logging(level=log_level) self.os = PpadbAgentOs() self._reporter = CompositeReporter(reporters=reporters) - self._act_agent_os_handler = AndroidAgentOsFacade(self.os, self._reporter) + self._act_agent_os_facade = AndroidAgentOsFacade(self.os, self._reporter) self.act_tool_collection = ToolCollection( tools=[ - AndroidScreenshotTool(self._act_agent_os_handler), - AndroidTapTool(self._act_agent_os_handler), - AndroidTypeTool(self._act_agent_os_handler), - AndroidDragAndDropTool(self._act_agent_os_handler), - AndroidKeyTapEventTool(self._act_agent_os_handler), - AndroidSwipeTool(self._act_agent_os_handler), - AndroidKeyCombinationTool(self._act_agent_os_handler), - AndroidShellTool(self._act_agent_os_handler), + AndroidScreenshotTool(self._act_agent_os_facade), + AndroidTapTool(self._act_agent_os_facade), + AndroidTypeTool(self._act_agent_os_facade), + AndroidDragAndDropTool(self._act_agent_os_facade), + AndroidKeyTapEventTool(self._act_agent_os_facade), + AndroidSwipeTool(self._act_agent_os_facade), + AndroidKeyCombinationTool(self._act_agent_os_facade), + AndroidShellTool(self._act_agent_os_facade), ExceptionTool(), ] )