From f480d79942f8ddf4c33806144714c6694bf797fc Mon Sep 17 00:00:00 2001 From: Samir Mlika Date: Thu, 24 Apr 2025 09:57:39 +0100 Subject: [PATCH] refactor ui-tar-api --- src/askui/models/ui_tars_ep/parser.py | 88 ++++++++++++++++------ src/askui/models/ui_tars_ep/ui_tars_api.py | 31 +++----- 2 files changed, 78 insertions(+), 41 deletions(-) diff --git a/src/askui/models/ui_tars_ep/parser.py b/src/askui/models/ui_tars_ep/parser.py index 052d9da7..76f7e8f3 100644 --- a/src/askui/models/ui_tars_ep/parser.py +++ b/src/askui/models/ui_tars_ep/parser.py @@ -1,5 +1,7 @@ +import time +from askui.tools.agent_os import AgentOs from pydantic import BaseModel, Field -from typing import Optional, Literal, Union, Tuple +from typing import Literal, Union import re class BoxCoordinate(BaseModel): @@ -15,55 +17,99 @@ def parse(cls, coord_str: str) -> "BoxCoordinate": raise ValueError(f"Invalid coordinate format: {coord_str}") return cls(x=int(match.group(1)), y=int(match.group(2))) -class ClickAction(BaseModel): - """Click action with start box coordinates.""" +class BaseAction(BaseModel): + action_type: str + + def execute(self, agent_os: AgentOs) -> None: + raise NotImplementedError(f"Action '{self.action_type}' must implement execute method.") + + +class ClickAction(BaseAction): action_type: Literal["click"] = "click" start_box: BoxCoordinate -class DoubleClickAction(BaseModel): - """Double left click action with start box coordinates.""" + def execute(self, agent_os: AgentOs) -> None: + agent_os.mouse(x=self.start_box.x, y=self.start_box.y) + time.sleep(0.2) + agent_os.click("left") + +class DoubleClickAction(BaseAction): action_type: Literal["left_double"] = "left_double" start_box: BoxCoordinate -class RightClickAction(BaseModel): - """Right click action with start box coordinates.""" + def execute(self, agent_os: AgentOs) -> None: + agent_os.mouse(x=self.start_box.x, y=self.start_box.y) + time.sleep(0.2) + agent_os.click('left') + +class RightClickAction(BaseAction): action_type: Literal["right_single"] = "right_single" start_box: BoxCoordinate -class DragAction(BaseModel): - """Drag action with start and end box coordinates.""" + def execute(self, agent_os: AgentOs) -> None: + agent_os.mouse(x=self.start_box.x, y=self.start_box.y) + time.sleep(0.2) + agent_os.click("right") + +class DragAction(BaseAction): action_type: Literal["drag"] = "drag" start_box: BoxCoordinate end_box: BoxCoordinate -class HotkeyAction(BaseModel): - """Hotkey action with key combination.""" + def execute(self, agent_os: AgentOs) -> None: + agent_os.mouse(x=self.start_box.x, y=self.start_box.y) + time.sleep(0.2) + agent_os.mouse_down() + time.sleep(0.2) + agent_os.mouse(x=self.end_box.x, y=self.end_box.y) + time.sleep(0.2) + agent_os.mouse_up() + + +class HotkeyAction(BaseAction): action_type: Literal["hotkey"] = "hotkey" key: str -class TypeAction(BaseModel): - """Type action with content.""" + def execute(self, agent_os: AgentOs) -> None: + agent_os.keyboard_pressed(self.key) + agent_os.keyboard_release(self.key) + +class TypeAction(BaseAction): action_type: Literal["type"] = "type" content: str -class ScrollAction(BaseModel): - """Scroll action with direction and start box.""" + def execute(self, agent_os: AgentOs) -> None: + agent_os.click("left") + agent_os.type(self.content) + +class ScrollAction(BaseAction): action_type: Literal["scroll"] = "scroll" start_box: BoxCoordinate direction: Literal["up", "down", "left", "right"] -class WaitAction(BaseModel): - """Wait action.""" + def execute(self, agent_os: AgentOs) -> None: + dx, dy = self.start_box.x, self.start_box.y + if self.direction == "left": + dx = -1 * dx + if self.direction == "up": + dy = -1 * dy + agent_os.mouse_scroll(dx, dy) + +class WaitAction(BaseAction): action_type: Literal["wait"] = "wait" -class FinishedAction(BaseModel): - """Finished action.""" + def execute(self, _agent_os: AgentOs) -> None: + time.sleep(5) + +class FinishedAction(BaseAction): action_type: Literal["finished"] = "finished" -class CallUserAction(BaseModel): - """Call user action.""" +class CallUserAction(BaseAction): action_type: Literal["call_user"] = "call_user" + def execute(self, _agent_os: AgentOs) -> None: + raise Exception("Call user action executed. This should be handled by the agent's logic, not directly here.") + ActionType = Union[ ClickAction, DoubleClickAction, RightClickAction, DragAction, HotkeyAction, TypeAction, ScrollAction, WaitAction, diff --git a/src/askui/models/ui_tars_ep/ui_tars_api.py b/src/askui/models/ui_tars_ep/ui_tars_api.py index dd7135ea..ac3f6be3 100644 --- a/src/askui/models/ui_tars_ep/ui_tars_api.py +++ b/src/askui/models/ui_tars_ep/ui_tars_api.py @@ -2,6 +2,7 @@ import os import pathlib from typing import Any, Union +from askui.logger import logger from openai import OpenAI from askui.reporting import Reporter from askui.tools.agent_os import AgentOs @@ -10,7 +11,7 @@ from askui.utils.image_utils import ImageSource from .prompts import PROMPT, PROMPT_QA -from .parser import UITarsEPMessage +from .parser import CallUserAction, FinishedAction, UITarsEPMessage import time @@ -107,6 +108,7 @@ def act(self, goal: str) -> None: self.execute_act(self.act_history) def add_screenshot_to_history(self, message_history): + time.sleep(0.5) screenshot = self._agent_os.screenshot() message_history.append( { @@ -177,14 +179,14 @@ def execute_act(self, message_history): presence_penalty=None ) raw_message = chat_completion.choices[-1].message.content - print(raw_message) + logger.debug(f"Raw message: {raw_message}") if self._reporter is not None: self._reporter.add_message("UI-TARS", raw_message) try: message = UITarsEPMessage.parse_message(raw_message) - print(message) + logger.debug(f"Parsed message: {message}") except Exception as e: message_history.append( { @@ -201,24 +203,13 @@ def execute_act(self, message_history): return action = message.parsed_action - if action.action_type == "click": - self._agent_os.mouse(action.start_box.x, action.start_box.y) - self._agent_os.click("left") - time.sleep(1) - if action.action_type == "type": - self._agent_os.click("left") - self._agent_os.type(action.content) - time.sleep(0.5) - if action.action_type == "hotkey": - self._agent_os.keyboard_pressed(action.key) - self._agent_os.keyboard_release(action.key) - time.sleep(0.5) - if action.action_type == "call_user": - time.sleep(1) - if action.action_type == "wait": - time.sleep(2) - if action.action_type == "finished": + if isinstance(action, CallUserAction): + raise Exception(f'Agent is stuck. Call user action executed. Here is the thought: {message.thought}') + + if isinstance(action, FinishedAction): return + + action.execute(self._agent_os) self.add_screenshot_to_history(message_history) self.execute_act(message_history)