-
Notifications
You must be signed in to change notification settings - Fork 54
refactor ui-tar-api #44
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Changes from all commits
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
| Original file line number | Diff line number | Diff line change |
|---|---|---|
| @@ -1,5 +1,7 @@ | ||
| import time | ||
| from askui.tools.agent_os import AgentOs | ||
| from pydantic import BaseModel, Field | ||
| from typing import Optional, Literal, Union, Tuple | ||
| from typing import Literal, Union | ||
| import re | ||
|
|
||
| class BoxCoordinate(BaseModel): | ||
|
|
@@ -15,55 +17,99 @@ def parse(cls, coord_str: str) -> "BoxCoordinate": | |
| raise ValueError(f"Invalid coordinate format: {coord_str}") | ||
| return cls(x=int(match.group(1)), y=int(match.group(2))) | ||
|
|
||
| class ClickAction(BaseModel): | ||
| """Click action with start box coordinates.""" | ||
| class BaseAction(BaseModel): | ||
| action_type: str | ||
|
|
||
| def execute(self, agent_os: AgentOs) -> None: | ||
| raise NotImplementedError(f"Action '{self.action_type}' must implement execute method.") | ||
|
|
||
|
|
||
| class ClickAction(BaseAction): | ||
| action_type: Literal["click"] = "click" | ||
| start_box: BoxCoordinate | ||
|
|
||
| class DoubleClickAction(BaseModel): | ||
| """Double left click action with start box coordinates.""" | ||
| def execute(self, agent_os: AgentOs) -> None: | ||
| agent_os.mouse(x=self.start_box.x, y=self.start_box.y) | ||
| time.sleep(0.2) | ||
| agent_os.click("left") | ||
|
|
||
| class DoubleClickAction(BaseAction): | ||
| action_type: Literal["left_double"] = "left_double" | ||
| start_box: BoxCoordinate | ||
|
|
||
| class RightClickAction(BaseModel): | ||
| """Right click action with start box coordinates.""" | ||
| def execute(self, agent_os: AgentOs) -> None: | ||
| agent_os.mouse(x=self.start_box.x, y=self.start_box.y) | ||
| time.sleep(0.2) | ||
| agent_os.click('left') | ||
|
|
||
| class RightClickAction(BaseAction): | ||
| action_type: Literal["right_single"] = "right_single" | ||
| start_box: BoxCoordinate | ||
|
|
||
| class DragAction(BaseModel): | ||
| """Drag action with start and end box coordinates.""" | ||
| def execute(self, agent_os: AgentOs) -> None: | ||
| agent_os.mouse(x=self.start_box.x, y=self.start_box.y) | ||
| time.sleep(0.2) | ||
| agent_os.click("right") | ||
|
|
||
| class DragAction(BaseAction): | ||
| action_type: Literal["drag"] = "drag" | ||
| start_box: BoxCoordinate | ||
| end_box: BoxCoordinate | ||
|
|
||
| class HotkeyAction(BaseModel): | ||
| """Hotkey action with key combination.""" | ||
| def execute(self, agent_os: AgentOs) -> None: | ||
| agent_os.mouse(x=self.start_box.x, y=self.start_box.y) | ||
| time.sleep(0.2) | ||
|
Contributor
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. I would move this into the […]. Also, I would make this configurable through the constructor of […].
Contributor
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. FYI: I just saw that there are already 2 properties ( |
||
| agent_os.mouse_down() | ||
| time.sleep(0.2) | ||
| agent_os.mouse(x=self.end_box.x, y=self.end_box.y) | ||
| time.sleep(0.2) | ||
| agent_os.mouse_up() | ||
|
|
||
|
|
||
| class HotkeyAction(BaseAction): | ||
| action_type: Literal["hotkey"] = "hotkey" | ||
| key: str | ||
|
|
||
| class TypeAction(BaseModel): | ||
| """Type action with content.""" | ||
| def execute(self, agent_os: AgentOs) -> None: | ||
| agent_os.keyboard_pressed(self.key) | ||
| agent_os.keyboard_release(self.key) | ||
|
|
||
| class TypeAction(BaseAction): | ||
| action_type: Literal["type"] = "type" | ||
| content: str | ||
|
|
||
| class ScrollAction(BaseModel): | ||
| """Scroll action with direction and start box.""" | ||
| def execute(self, agent_os: AgentOs) -> None: | ||
| agent_os.click("left") | ||
| agent_os.type(self.content) | ||
|
|
||
| class ScrollAction(BaseAction): | ||
| action_type: Literal["scroll"] = "scroll" | ||
| start_box: BoxCoordinate | ||
| direction: Literal["up", "down", "left", "right"] | ||
|
|
||
| class WaitAction(BaseModel): | ||
| """Wait action.""" | ||
| def execute(self, agent_os: AgentOs) -> None: | ||
| dx, dy = self.start_box.x, self.start_box.y | ||
| if self.direction == "left": | ||
| dx = -1 * dx | ||
| if self.direction == "up": | ||
| dy = -1 * dy | ||
| agent_os.mouse_scroll(dx, dy) | ||
|
|
||
| class WaitAction(BaseAction): | ||
| action_type: Literal["wait"] = "wait" | ||
|
|
||
| class FinishedAction(BaseModel): | ||
| """Finished action.""" | ||
| def execute(self, _agent_os: AgentOs) -> None: | ||
| time.sleep(5) | ||
|
Contributor
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. Just thinking out loud about where we could take this in the future (not now): if we handed over other tools instead of only the AgentOs, e.g., one for waiting, we would be able to handle this here. |
||
|
|
||
| class FinishedAction(BaseAction): | ||
| action_type: Literal["finished"] = "finished" | ||
|
|
||
| class CallUserAction(BaseModel): | ||
| """Call user action.""" | ||
| class CallUserAction(BaseAction): | ||
| action_type: Literal["call_user"] = "call_user" | ||
|
|
||
| def execute(self, _agent_os: AgentOs) -> None: | ||
| raise Exception("Call user action executed. This should be handled by the agent's logic, not directly here.") | ||
|
Comment on lines
+110
to
+111
Contributor
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. Just thinking out loud about where we could take this in the future (not now): if we handed over other tools instead of only the |
||
|
|
||
| ActionType = Union[ | ||
| ClickAction, DoubleClickAction, RightClickAction, DragAction, | ||
| HotkeyAction, TypeAction, ScrollAction, WaitAction, | ||
|
|
||
| Original file line number | Diff line number | Diff line change |
|---|---|---|
|
|
@@ -2,6 +2,7 @@ | |
| import os | ||
| import pathlib | ||
| from typing import Any, Union | ||
| from askui.logger import logger | ||
| from openai import OpenAI | ||
| from askui.reporting import Reporter | ||
| from askui.tools.agent_os import AgentOs | ||
|
|
@@ -10,7 +11,7 @@ | |
|
|
||
| from askui.utils.image_utils import ImageSource | ||
| from .prompts import PROMPT, PROMPT_QA | ||
| from .parser import UITarsEPMessage | ||
| from .parser import CallUserAction, FinishedAction, UITarsEPMessage | ||
| import time | ||
|
|
||
|
|
||
|
|
@@ -107,6 +108,7 @@ def act(self, goal: str) -> None: | |
| self.execute_act(self.act_history) | ||
|
|
||
| def add_screenshot_to_history(self, message_history): | ||
| time.sleep(0.5) | ||
|
Contributor
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. Why was this necessary? |
||
| screenshot = self._agent_os.screenshot() | ||
| message_history.append( | ||
| { | ||
|
|
@@ -177,14 +179,14 @@ def execute_act(self, message_history): | |
| presence_penalty=None | ||
| ) | ||
| raw_message = chat_completion.choices[-1].message.content | ||
| print(raw_message) | ||
| logger.debug(f"Raw message: {raw_message}") | ||
|
Contributor
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. Great :) |
||
|
|
||
| if self._reporter is not None: | ||
| self._reporter.add_message("UI-TARS", raw_message) | ||
|
|
||
| try: | ||
| message = UITarsEPMessage.parse_message(raw_message) | ||
| print(message) | ||
| logger.debug(f"Parsed message: {message}") | ||
| except Exception as e: | ||
| message_history.append( | ||
| { | ||
|
|
@@ -201,24 +203,13 @@ def execute_act(self, message_history): | |
| return | ||
|
|
||
| action = message.parsed_action | ||
| if action.action_type == "click": | ||
| self._agent_os.mouse(action.start_box.x, action.start_box.y) | ||
| self._agent_os.click("left") | ||
| time.sleep(1) | ||
| if action.action_type == "type": | ||
| self._agent_os.click("left") | ||
| self._agent_os.type(action.content) | ||
| time.sleep(0.5) | ||
| if action.action_type == "hotkey": | ||
| self._agent_os.keyboard_pressed(action.key) | ||
| self._agent_os.keyboard_release(action.key) | ||
| time.sleep(0.5) | ||
| if action.action_type == "call_user": | ||
| time.sleep(1) | ||
| if action.action_type == "wait": | ||
| time.sleep(2) | ||
| if action.action_type == "finished": | ||
| if isinstance(action, CallUserAction): | ||
| raise Exception(f'Agent is stuck. Call user action executed. Here is the thought: {message.thought}') | ||
|
|
||
| if isinstance(action, FinishedAction): | ||
| return | ||
|
|
||
| action.execute(self._agent_os) | ||
|
Contributor
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. Beautiful refactoring :) Using pydantic for parsing and the strategy pattern for executing actions. |
||
|
|
||
| self.add_screenshot_to_history(message_history) | ||
| self.execute_act(message_history) | ||
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.