- Runner H: APIs (api.runnerh.com/v1) do not exist publicly - they appear to be in private beta only
a/pyproject.toml +++ b/pyproject.toml @@ -24,6 +24,8 @@ dependencies = [ "httpx>=0.28.1", "fastmcp>=2.3.4", "pure-python-adb>=0.3.0.dev0", + "transformers>=4.45.0", + "torch>=2.1.0", ] requires-python = ">=3.10" readme = "README.md" diff --git a/src/askui/models/huggingface/holo1.py b/src/askui/models/huggingface/holo1.py new file mode 100644 index 00000000..93e895bb --- /dev/null +++ b/src/askui/models/huggingface/holo1.py @@ -0,0 +1,224 @@ +"""Holo-1 Vision Language Model implementation for element location. + +This module provides the Holo1LocateModel class that uses the Holo-1 VLM +for locating UI elements on screen based on natural language descriptions. +""" + +import json + +from typing_extensions import override + +from askui.exceptions import AutomationError, ElementNotFoundError +from askui.locators.locators import Locator +from askui.locators.serializers import VlmLocatorSerializer +from askui.logger import logger +from askui.models.models import LocateModel, ModelComposition, Point +from askui.utils.image_utils import ImageSource + + +class Holo1LocateModel(LocateModel): + """Holo-1 model implementation for locating UI elements. + + This model uses the Holo-1 Vision Language Model for element detection + and supports both GPU and CPU inference. + + Attributes: + _model_name: The Hugging Face model identifier + _device: The device to run inference on (cuda/cpu) + _locator_serializer: Serializer for converting locators to prompts + """ + + def __init__( + self, + locator_serializer: VlmLocatorSerializer, + model_name: str = "Hcompany/Holo1-7B", + device: str | None = None, + ) -> None: + """Initialize the Holo-1 model. + + Args: + locator_serializer: Serializer for converting locators to prompts + model_name: The Hugging Face model identifier + device: Device to run inference on. 
If None, auto-detects GPU availability + """ + self._model_name = model_name + self._locator_serializer = locator_serializer + self._model = None + self._processor = None + + # Lazy import to avoid loading heavy dependencies + import torch + + if device is None: + self._device = "cuda" if torch.cuda.is_available() else "cpu" + else: + self._device = device + + logger.info(f"Holo-1 model will use device: {self._device}") + + def _load_model(self) -> None: + """Lazy load the model and processor.""" + if self._model is not None: + return + + logger.info(f"Loading Holo-1 model from {self._model_name}") + + try: + from transformers import AutoModelForImageTextToText, AutoProcessor + + self._processor = AutoProcessor.from_pretrained(self._model_name) + self._model = AutoModelForImageTextToText.from_pretrained( + self._model_name, + torch_dtype="auto", + device_map=self._device if self._device != "cpu" else None, + ) + + # Set to evaluation mode + self._model.eval() + + logger.info("Holo-1 model loaded successfully") + + except Exception as e: + error_msg = f"Failed to load Holo-1 model: {e}" + logger.error(error_msg) + raise AutomationError(error_msg) from e + + def _parse_model_output( + self, output: str, _image_width: int, _image_height: int + ) -> Point: + """Parse the model output to extract coordinates. 
coord_pattern = r"\((\d+),\s*(\d+)\)"
+ + Args: + locator: Element description or locator object + image: Screenshot to analyze + model_choice: Model selection (ignored for single model) + + Returns: + Coordinates of the located element as (x, y) tuple + + Raises: + AutomationError: If model inference fails + ElementNotFoundError: If element cannot be found + """ + if isinstance(model_choice, ModelComposition): + error_msg = "Model composition is not supported for Holo-1" + raise NotImplementedError(error_msg) + + # Ensure model is loaded + self._load_model() + + # Serialize locator if needed + serialized_locator = ( + self._locator_serializer.serialize(locator) + if isinstance(locator, Locator) + else locator + ) + + # Prepare messages for chat template + messages = [ + {"role": "user", "content": f"Locate the UI element: {serialized_locator}"} + ] + + try: + # Apply chat template and process + text = self._processor.apply_chat_template( + messages, tokenize=False, add_generation_prompt=True + ) + + inputs = self._processor( + text=[text], images=image.root, return_tensors="pt" + ) + + # Move to device if not CPU + if self._device != "cpu": + inputs = inputs.to(self._device) + + # Generate response + import torch + + with torch.no_grad(): + generated_ids = self._model.generate( + **inputs, + max_new_tokens=128, + ) + + # Trim generated tokens and decode + generated_ids_trimmed = [ + out_ids[len(in_ids) :] + for in_ids, out_ids in zip( + inputs.input_ids, generated_ids, strict=False + ) + ] + + response = self._processor.batch_decode( + generated_ids_trimmed, skip_special_tokens=True + )[0] + + logger.debug(f"Holo-1 response: {response}") + + # Parse coordinates from response + return self._parse_model_output( + response, image.root.width, image.root.height + ) + + except Exception as e: + if isinstance(e, (ElementNotFoundError, NotImplementedError)): + raise + error_msg = f"Holo-1 inference failed: {e}" + logger.error(error_msg) + raise AutomationError(error_msg) from e diff --git 
a/src/askui/models/huggingface/settings.py b/src/askui/models/huggingface/settings.py new file mode 100644 index 00000000..83ff427e --- /dev/null +++ b/src/askui/models/huggingface/settings.py @@ -0,0 +1,32 @@ +from pydantic import Field +from pydantic_settings import BaseSettings + + +class Holo1Settings(BaseSettings): + """Settings for Holo-1 model configuration. + + Environment variables: + HOLO1_MODEL_NAME: Hugging Face model identifier (default: Hcompany/Holo1-7B) + HOLO1_DEVICE: Device to run inference on (default: auto-detect) + HOLO1_MAX_NEW_TOKENS: Maximum tokens to generate (default: 128) + HOLO1_TEMPERATURE: Sampling temperature (default: 0.1) + """ + + model_name: str = Field( + default="Hcompany/Holo1-7B", + description="Hugging Face model identifier", + ) + device: str | None = Field( + default=None, + description="Device to run inference on (cuda/cpu, auto-detect if None)", + ) + max_new_tokens: int = Field( + default=128, + description="Maximum number of tokens to generate", + ) + temperature: float = Field( + default=0.1, + description="Sampling temperature for generation", + ) + + model_config = {"env_prefix": "HOLO1_"} diff --git a/src/askui/models/model_router.py b/src/askui/models/model_router.py index affee49a..0330dc32 100644 --- a/src/askui/models/model_router.py +++ b/src/askui/models/model_router.py @@ -19,6 +19,8 @@ AskUiComputerAgentSettings, ) from askui.models.exceptions import ModelNotFoundError, ModelTypeMismatchError +from askui.models.huggingface.holo1 import Holo1LocateModel +from askui.models.huggingface.settings import Holo1Settings from askui.models.huggingface.spaces_api import HFSpacesHandler from askui.models.models import ( MODEL_TYPES, @@ -116,6 +118,15 @@ def hf_spaces_handler() -> HFSpacesHandler: locator_serializer=vlm_locator_serializer(), ) + @functools.cache + def holo1_locate_model() -> Holo1LocateModel: + settings = Holo1Settings() + return Holo1LocateModel( + locator_serializer=vlm_locator_serializer(), + 
model_name=settings.model_name, + device=settings.device, + ) + return { ModelName.ASKUI: askui_facade, ModelName.ASKUI__AI_ELEMENT: askui_model_router, @@ -128,6 +139,7 @@ def hf_spaces_handler() -> HFSpacesHandler: ModelName.HF__SPACES__QWEN__QWEN2_VL_7B_INSTRUCT: hf_spaces_handler, ModelName.HF__SPACES__OS_COPILOT__OS_ATLAS_BASE_7B: hf_spaces_handler, ModelName.HF__SPACES__SHOWUI__2B: hf_spaces_handler, + ModelName.HF__HOLO_1: holo1_locate_model, } diff --git a/src/askui/models/models.py b/src/askui/models/models.py index 1c612d8c..99acf558 100644 --- a/src/askui/models/models.py +++ b/src/askui/models/models.py @@ -35,6 +35,7 @@ class ModelName(str, Enum): HF__SPACES__QWEN__QWEN2_VL_7B_INSTRUCT = "Qwen/Qwen2-VL-7B-Instruct" HF__SPACES__SHOWUI__2B = "showlab/ShowUI-2B" TARS = "tars" + HF__HOLO_1 = "holo-1" ANTHROPIC_MODEL_NAME_MAPPING = { diff --git a/tests/integration/models/__init__.py b/tests/integration/models/__init__.py new file mode 100644 index 00000000..b3da0c2c --- /dev/null +++ b/tests/integration/models/__init__.py @@ -0,0 +1 @@ +"""Tests for model implementations.""" \ No newline at end of file diff --git a/tests/integration/models/test_holo1.py b/tests/integration/models/test_holo1.py new file mode 100644 index 00000000..5379641e --- /dev/null +++ b/tests/integration/models/test_holo1.py @@ -0,0 +1,229 @@ +"""Integration tests for Holo-1 model implementation.""" + +import pytest +from PIL import Image +from pytest_mock import MockerFixture + +from askui.locators.locators import Button, Locator +from askui.locators.serializers import VlmLocatorSerializer +from askui.models.huggingface.holo1 import Holo1LocateModel +from askui.models.huggingface.settings import Holo1Settings +from askui.utils.image_utils import Img + + +class TestHolo1Integration: + """Integration tests for Holo-1 model.""" + + @pytest.fixture + def mock_settings(self) -> Holo1Settings: + """Create mock settings for testing.""" + return Holo1Settings( + 
model_name="Hcompany/Holo1-7B", + device="cpu", + max_new_tokens=50, + temperature=0.1, + ) + + @pytest.fixture + def mock_locator_serializer(self) -> VlmLocatorSerializer: + """Create a mock locator serializer.""" + return VlmLocatorSerializer() + + @pytest.fixture + def sample_image(self) -> Img: + """Create a sample image for testing.""" + # Create a simple test image + img = Image.new("RGB", (800, 600), color="white") + return Img(img) + + def test_holo1_initialization( + self, + mock_locator_serializer: VlmLocatorSerializer, + mock_settings: Holo1Settings, + ) -> None: + """Test Holo-1 model initialization.""" + model = Holo1LocateModel( + locator_serializer=mock_locator_serializer, + model_name=mock_settings.model_name, + device=mock_settings.device, + ) + + assert model._model_name == "Hcompany/Holo1-7B" + assert model._device == "cpu" + assert model._model is None # Lazy loading + assert model._processor is None + + def test_holo1_locate_with_string( + self, + mocker: MockerFixture, + mock_locator_serializer: VlmLocatorSerializer, + sample_image: Img, + ) -> None: + """Test locating an element with a string description.""" + # Mock the model loading and inference + mock_processor = mocker.MagicMock() + mock_model = mocker.MagicMock() + + mocker.patch( + "transformers.AutoProcessor.from_pretrained", return_value=mock_processor + ) + mocker.patch( + "transformers.AutoModelForImageTextToText.from_pretrained", + return_value=mock_model, + ) + + # Mock the model output + mock_processor.batch_decode.return_value = ['{"bbox": [100, 200, 150, 250]}'] + mock_model.generate.return_value = [[1, 2, 3]] # Mock token output + + model = Holo1LocateModel( + locator_serializer=mock_locator_serializer, + model_name="Hcompany/Holo1-7B", + device="cpu", + ) + + result = model.locate( + locator="Submit button", + image=sample_image, + model_choice="holo-1", + ) + + assert result == (125, 225) # Center of bbox [100, 200, 150, 250] + + def test_holo1_locate_with_locator_object( + 
locator = Button("Submit")
mock_processor = mocker.MagicMock() + mock_model = mocker.MagicMock() + + mocker.patch( + "transformers.AutoProcessor.from_pretrained", return_value=mock_processor + ) + mocker.patch( + "transformers.AutoModelForImageTextToText.from_pretrained", + return_value=mock_model, + ) + + # Mock the model output without valid coordinates + mock_processor.batch_decode.return_value = ["No element found"] + mock_model.generate.return_value = [[1, 2, 3]] + + model = Holo1LocateModel( + locator_serializer=mock_locator_serializer, + model_name="Hcompany/Holo1-7B", + device="cpu", + ) + + with pytest.raises(ElementNotFoundError): + model.locate( + locator="Submit button", + image=sample_image, + model_choice="holo-1", + ) + + def test_holo1_model_loading_error( + self, + mocker: MockerFixture, + mock_locator_serializer: VlmLocatorSerializer, + sample_image: Img, + ) -> None: + """Test handling of model loading errors.""" + from askui.exceptions import AutomationError + + # Mock model loading to fail + mocker.patch( + "transformers.AutoProcessor.from_pretrained", + side_effect=Exception("Model not found"), + ) + + model = Holo1LocateModel( + locator_serializer=mock_locator_serializer, + model_name="invalid-model", + device="cpu", + ) + + with pytest.raises(AutomationError, match="Failed to load Holo-1 model"): + model.locate( + locator="button", + image=sample_image, + model_choice="holo-1", + )