Skip to content
Closed
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
88 changes: 67 additions & 21 deletions src/askui/models/ui_tars_ep/parser.py
Original file line number Diff line number Diff line change
@@ -1,5 +1,7 @@
import time
from askui.tools.agent_os import AgentOs
from pydantic import BaseModel, Field
from typing import Optional, Literal, Union, Tuple
from typing import Literal, Union
import re

class BoxCoordinate(BaseModel):
Expand All @@ -15,55 +17,99 @@ def parse(cls, coord_str: str) -> "BoxCoordinate":
raise ValueError(f"Invalid coordinate format: {coord_str}")
return cls(x=int(match.group(1)), y=int(match.group(2)))

class ClickAction(BaseModel):
"""Click action with start box coordinates."""
class BaseAction(BaseModel):
    # Common parent of the action models in this module; each one carries an
    # ``action_type`` tag and an ``execute`` hook.
    action_type: str

    def execute(self, agent_os: AgentOs) -> None:
        """Fallback implementation used when a subclass does not override it.

        Args:
            agent_os: Unused here; subclasses drive it to perform the action.

        Raises:
            NotImplementedError: always, naming the offending ``action_type``.
        """
        message = f"Action '{self.action_type}' must implement execute method."
        raise NotImplementedError(message)
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Suggested change
raise NotImplementedError(f"Action '{self.action_type}' must implement execute method.")
raise NotImplementedError(f"Action '{self.action_type}' not implemented yet")



class ClickAction(BaseAction):
action_type: Literal["click"] = "click"
start_box: BoxCoordinate

class DoubleClickAction(BaseModel):
"""Double left click action with start box coordinates."""
def execute(self, agent_os: AgentOs) -> None:
agent_os.mouse(x=self.start_box.x, y=self.start_box.y)
time.sleep(0.2)
agent_os.click("left")

class DoubleClickAction(BaseAction):
action_type: Literal["left_double"] = "left_double"
start_box: BoxCoordinate

class RightClickAction(BaseModel):
"""Right click action with start box coordinates."""
def execute(self, agent_os: AgentOs) -> None:
agent_os.mouse(x=self.start_box.x, y=self.start_box.y)
time.sleep(0.2)
agent_os.click('left')

class RightClickAction(BaseAction):
action_type: Literal["right_single"] = "right_single"
start_box: BoxCoordinate

class DragAction(BaseModel):
"""Drag action with start and end box coordinates."""
def execute(self, agent_os: AgentOs) -> None:
agent_os.mouse(x=self.start_box.x, y=self.start_box.y)
time.sleep(0.2)
agent_os.click("right")

class DragAction(BaseAction):
action_type: Literal["drag"] = "drag"
start_box: BoxCoordinate
end_box: BoxCoordinate

class HotkeyAction(BaseModel):
"""Hotkey action with key combination."""
def execute(self, agent_os: AgentOs) -> None:
agent_os.mouse(x=self.start_box.x, y=self.start_box.y)
time.sleep(0.2)
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I would move this into the AskUiControllerClient: from my perspective it depends on the underlying AgentOs implementation and is the general sleep/wait time between actions that gives the AgentOs implementation, the OS, the application, etc. time to react. It is generally model-independent.

Also, I would make this configurable through the constructor of AskUiControllerClient, as it is highly dependent on the OS, the application being automated, etc.

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

FYI: I just saw that there are already two properties (pre_action_wait and post_action_wait) that may only need their values adjusted and to be exposed through the constructor.

agent_os.mouse_down()
time.sleep(0.2)
agent_os.mouse(x=self.end_box.x, y=self.end_box.y)
time.sleep(0.2)
agent_os.mouse_up()


class HotkeyAction(BaseAction):
action_type: Literal["hotkey"] = "hotkey"
key: str

class TypeAction(BaseModel):
"""Type action with content."""
def execute(self, agent_os: AgentOs) -> None:
agent_os.keyboard_pressed(self.key)
agent_os.keyboard_release(self.key)

class TypeAction(BaseAction):
action_type: Literal["type"] = "type"
content: str

class ScrollAction(BaseModel):
"""Scroll action with direction and start box."""
def execute(self, agent_os: AgentOs) -> None:
agent_os.click("left")
agent_os.type(self.content)

class ScrollAction(BaseAction):
action_type: Literal["scroll"] = "scroll"
start_box: BoxCoordinate
direction: Literal["up", "down", "left", "right"]

class WaitAction(BaseModel):
"""Wait action."""
def execute(self, agent_os: AgentOs) -> None:
dx, dy = self.start_box.x, self.start_box.y
if self.direction == "left":
dx = -1 * dx
if self.direction == "up":
dy = -1 * dy
agent_os.mouse_scroll(dx, dy)

class WaitAction(BaseAction):
action_type: Literal["wait"] = "wait"

class FinishedAction(BaseModel):
"""Finished action."""
def execute(self, _agent_os: AgentOs) -> None:
time.sleep(5)
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Just thinking out loud where we could develop this in the future (not now): If we handed over other tools instead of only the AgentOs, e.g., one for waiting, we would be able to handle this here.


# Action model whose ``action_type`` tag is fixed to "finished". It declares no
# ``execute`` of its own in the lines shown here, so calling ``execute`` would
# use whatever BaseAction provides.
class FinishedAction(BaseAction):
action_type: Literal["finished"] = "finished"

class CallUserAction(BaseModel):
"""Call user action."""
class CallUserAction(BaseAction):
    # Action model whose ``action_type`` tag is fixed to "call_user".
    action_type: Literal["call_user"] = "call_user"

    def execute(self, _agent_os: AgentOs) -> None:
        """Always raise; this action is not meant to be executed directly.

        Args:
            _agent_os: Ignored.

        Raises:
            Exception: unconditionally, stating that the agent's logic is
                expected to handle this action instead.
        """
        reason = "Call user action executed. This should be handled by the agent's logic, not directly here."
        raise Exception(reason)
Comment on lines +110 to +111
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Just thinking out loud about where we could take this in the future (not now): if we handed over other tools instead of only the AgentOs — e.g., one for getting data from the user through a console, or one that hands control over to the user and busy-waits until the user confirms that he/she has helped out (potentially with some explanation of how he/she helped out) — we would be able to handle this here.


ActionType = Union[
ClickAction, DoubleClickAction, RightClickAction, DragAction,
HotkeyAction, TypeAction, ScrollAction, WaitAction,
Expand Down
31 changes: 11 additions & 20 deletions src/askui/models/ui_tars_ep/ui_tars_api.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,6 +2,7 @@
import os
import pathlib
from typing import Any, Union
from askui.logger import logger
from openai import OpenAI
from askui.reporting import Reporter
from askui.tools.agent_os import AgentOs
Expand All @@ -10,7 +11,7 @@

from askui.utils.image_utils import ImageSource
from .prompts import PROMPT, PROMPT_QA
from .parser import UITarsEPMessage
from .parser import CallUserAction, FinishedAction, UITarsEPMessage
import time


Expand Down Expand Up @@ -107,6 +108,7 @@ def act(self, goal: str) -> None:
self.execute_act(self.act_history)

def add_screenshot_to_history(self, message_history):
time.sleep(0.5)
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Why was this necessary?

screenshot = self._agent_os.screenshot()
message_history.append(
{
Expand Down Expand Up @@ -177,14 +179,14 @@ def execute_act(self, message_history):
presence_penalty=None
)
raw_message = chat_completion.choices[-1].message.content
print(raw_message)
logger.debug(f"Raw message: {raw_message}")
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Great :)


if self._reporter is not None:
self._reporter.add_message("UI-TARS", raw_message)

try:
message = UITarsEPMessage.parse_message(raw_message)
print(message)
logger.debug(f"Parsed message: {message}")
except Exception as e:
message_history.append(
{
Expand All @@ -201,24 +203,13 @@ def execute_act(self, message_history):
return

action = message.parsed_action
if action.action_type == "click":
self._agent_os.mouse(action.start_box.x, action.start_box.y)
self._agent_os.click("left")
time.sleep(1)
if action.action_type == "type":
self._agent_os.click("left")
self._agent_os.type(action.content)
time.sleep(0.5)
if action.action_type == "hotkey":
self._agent_os.keyboard_pressed(action.key)
self._agent_os.keyboard_release(action.key)
time.sleep(0.5)
if action.action_type == "call_user":
time.sleep(1)
if action.action_type == "wait":
time.sleep(2)
if action.action_type == "finished":
if isinstance(action, CallUserAction):
raise Exception(f'Agent is stuck. Call user action executed. Here is the thought: {message.thought}')

if isinstance(action, FinishedAction):
return

action.execute(self._agent_os)
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Beautiful refactoring :)

Using pydantic for parsing + using the strategy pattern


self.add_screenshot_to_history(message_history)
self.execute_act(message_history)