From f480d79942f8ddf4c33806144714c6694bf797fc Mon Sep 17 00:00:00 2001
From: Samir Mlika <samir.mlika@askui.com>
Date: Thu, 24 Apr 2025 09:57:39 +0100
Subject: [PATCH] refactor ui-tars-api

---
 src/askui/models/ui_tars_ep/parser.py      | 88 ++++++++++++++++------
 src/askui/models/ui_tars_ep/ui_tars_api.py | 31 +++-----
 2 files changed, 78 insertions(+), 41 deletions(-)

diff --git a/src/askui/models/ui_tars_ep/parser.py b/src/askui/models/ui_tars_ep/parser.py
index 052d9da7..76f7e8f3 100644
--- a/src/askui/models/ui_tars_ep/parser.py
+++ b/src/askui/models/ui_tars_ep/parser.py
@@ -1,5 +1,7 @@
+import time
+from askui.tools.agent_os import AgentOs
 from pydantic import BaseModel, Field
-from typing import Optional, Literal, Union, Tuple
+from typing import Literal, Union
 import re
 
 class BoxCoordinate(BaseModel):
@@ -15,55 +17,99 @@ def parse(cls, coord_str: str) -> "BoxCoordinate":
             raise ValueError(f"Invalid coordinate format: {coord_str}")
         return cls(x=int(match.group(1)), y=int(match.group(2)))
 
-class ClickAction(BaseModel):
-    """Click action with start box coordinates."""
+class BaseAction(BaseModel):  # Common base for the action models in this module.
+    action_type: str  # Subclasses narrow this to a Literal tag such as "click".
+
+    def execute(self, agent_os: AgentOs) -> None:  # Default implementation always raises; subclasses override it.
+        raise NotImplementedError(f"Action '{self.action_type}' must implement execute method.")
+
+        
+class ClickAction(BaseAction):
     action_type: Literal["click"] = "click"
     start_box: BoxCoordinate
 
-class DoubleClickAction(BaseModel):
-    """Double left click action with start box coordinates."""
+    def execute(self, agent_os: AgentOs) -> None:  # Left-click at start_box.
+        agent_os.mouse(x=self.start_box.x, y=self.start_box.y)  # presumably moves the pointer to start_box; confirm against AgentOs
+        time.sleep(0.2)  # short pause between the move and the click
+        agent_os.click("left")
+
+class DoubleClickAction(BaseAction):
     action_type: Literal["left_double"] = "left_double"
     start_box: BoxCoordinate
 
-class RightClickAction(BaseModel):
-    """Right click action with start box coordinates."""
+    def execute(self, agent_os: AgentOs) -> None:  # Double left-click at start_box.
+        agent_os.mouse(x=self.start_box.x, y=self.start_box.y)
+        time.sleep(0.2)  # short pause between the move and the clicks
+        agent_os.click("left", 2)  # second argument is the click count; a single click would duplicate ClickAction
+
+class RightClickAction(BaseAction):
     action_type: Literal["right_single"] = "right_single"
     start_box: BoxCoordinate
 
-class DragAction(BaseModel):
-    """Drag action with start and end box coordinates."""
+    def execute(self, agent_os: AgentOs) -> None:  # Right-click at start_box.
+        agent_os.mouse(x=self.start_box.x, y=self.start_box.y)  # presumably moves the pointer to start_box; confirm against AgentOs
+        time.sleep(0.2)  # short pause between the move and the click
+        agent_os.click("right")
+
+class DragAction(BaseAction):
     action_type: Literal["drag"] = "drag"
     start_box: BoxCoordinate
     end_box: BoxCoordinate
 
-class HotkeyAction(BaseModel):
-    """Hotkey action with key combination."""
+    def execute(self, agent_os: AgentOs) -> None:  # Drag from start_box to end_box.
+        agent_os.mouse(x=self.start_box.x, y=self.start_box.y)  # go to the drag origin
+        time.sleep(0.2)  # 0.2 s pause between every step of the gesture
+        agent_os.mouse_down()  # called without arguments; NOTE(review): which button is held depends on AgentOs defaults
+        time.sleep(0.2)
+        agent_os.mouse(x=self.end_box.x, y=self.end_box.y)  # go to the drop target while the button is down
+        time.sleep(0.2)
+        agent_os.mouse_up()  # release to finish the drag
+
+
+class HotkeyAction(BaseAction):
     action_type: Literal["hotkey"] = "hotkey"
     key: str
 
-class TypeAction(BaseModel):
-    """Type action with content."""
+    def execute(self, agent_os: AgentOs) -> None:  # Press self.key, then release it.
+        agent_os.keyboard_pressed(self.key)  # NOTE(review): key is forwarded as one string; confirm multi-key combos are accepted
+        agent_os.keyboard_release(self.key)
+
+class TypeAction(BaseAction):
     action_type: Literal["type"] = "type"
     content: str
 
-class ScrollAction(BaseModel):
-    """Scroll action with direction and start box."""
+    def execute(self, agent_os: AgentOs) -> None:  # Left-click, then type self.content.
+        agent_os.click("left")  # no pointer move precedes this click; presumably it focuses the current target - confirm
+        agent_os.type(self.content)
+
+class ScrollAction(BaseAction):
     action_type: Literal["scroll"] = "scroll"
     start_box: BoxCoordinate
     direction: Literal["up", "down", "left", "right"]
 
-class WaitAction(BaseModel):
-    """Wait action."""
+    def execute(self, agent_os: AgentOs) -> None:  # Scroll along the axis named by self.direction only.
+        # Magnitude is still taken from start_box (NOTE(review): confirm intended scroll amount); the other axis is zeroed.
+        horizontal = self.direction in ("left", "right")
+        sign = -1 if self.direction in ("left", "up") else 1  # left/up are negative, as before
+        dx = sign * self.start_box.x if horizontal else 0
+        dy = 0 if horizontal else sign * self.start_box.y
+        agent_os.mouse_scroll(dx, dy)
+
+class WaitAction(BaseAction):
     action_type: Literal["wait"] = "wait"
 
-class FinishedAction(BaseModel):
-    """Finished action."""
+    def execute(self, _agent_os: AgentOs) -> None:  # Block for a fixed 5 seconds; the agent OS argument is unused.
+        time.sleep(5)
+
+class FinishedAction(BaseAction):
     action_type: Literal["finished"] = "finished"
 
-class CallUserAction(BaseModel):
-    """Call user action."""
+class CallUserAction(BaseAction):
     action_type: Literal["call_user"] = "call_user"
 
+    def execute(self, _agent_os: AgentOs) -> None:  # Always raises; the agent OS argument is unused.
+        raise Exception("Call user action executed. This should be handled by the agent's logic, not directly here.")
+
 ActionType = Union[
     ClickAction, DoubleClickAction, RightClickAction, DragAction,
     HotkeyAction, TypeAction, ScrollAction, WaitAction,
diff --git a/src/askui/models/ui_tars_ep/ui_tars_api.py b/src/askui/models/ui_tars_ep/ui_tars_api.py
index dd7135ea..ac3f6be3 100644
--- a/src/askui/models/ui_tars_ep/ui_tars_api.py
+++ b/src/askui/models/ui_tars_ep/ui_tars_api.py
@@ -2,6 +2,7 @@
 import os
 import pathlib
 from typing import Any, Union
+from askui.logger import logger
 from openai import OpenAI
 from askui.reporting import Reporter
 from askui.tools.agent_os import AgentOs
@@ -10,7 +11,7 @@
 
 from askui.utils.image_utils import ImageSource
 from .prompts import PROMPT, PROMPT_QA
-from .parser import UITarsEPMessage
+from .parser import CallUserAction, FinishedAction, UITarsEPMessage
 import time
 
 
@@ -107,6 +108,7 @@ def act(self, goal: str) -> None:
         self.execute_act(self.act_history)
 
     def add_screenshot_to_history(self, message_history):
+        time.sleep(0.5)
         screenshot = self._agent_os.screenshot()
         message_history.append(
             {
@@ -177,14 +179,14 @@ def execute_act(self, message_history):
             presence_penalty=None
         )
         raw_message = chat_completion.choices[-1].message.content
-        print(raw_message)
+        logger.debug(f"Raw message: {raw_message}")
 
         if self._reporter is not None: 
             self._reporter.add_message("UI-TARS", raw_message)
 
         try:
             message = UITarsEPMessage.parse_message(raw_message)
-            print(message)
+            logger.debug(f"Parsed message: {message}")
         except Exception as e:
             message_history.append(
                 {
@@ -201,24 +203,13 @@ def execute_act(self, message_history):
             return
 
         action = message.parsed_action
-        if action.action_type == "click":
-            self._agent_os.mouse(action.start_box.x, action.start_box.y)
-            self._agent_os.click("left")
-            time.sleep(1)
-        if action.action_type == "type":
-            self._agent_os.click("left")
-            self._agent_os.type(action.content)
-            time.sleep(0.5)
-        if action.action_type == "hotkey":
-            self._agent_os.keyboard_pressed(action.key)
-            self._agent_os.keyboard_release(action.key)
-            time.sleep(0.5)
-        if action.action_type == "call_user":
-            time.sleep(1)
-        if action.action_type == "wait":
-            time.sleep(2)
-        if action.action_type == "finished":
+        if  isinstance(action, CallUserAction):
+            raise Exception(f'Agent is stuck. Call user action executed. Here is the thought: {message.thought}')
+        
+        if isinstance(action, FinishedAction):
             return
+        
+        action.execute(self._agent_os)
 
         self.add_screenshot_to_history(message_history)
         self.execute_act(message_history)