- Runner H: APIs (api.runnerh.com/v1) do not exist publicly - they appear to be in private beta only
a/pyproject.toml +++ b/pyproject.toml @@ -24,6 +24,8 @@ dependencies = [ "httpx>=0.28.1", "fastmcp>=2.3.4", "pure-python-adb>=0.3.0.dev0", + "transformers>=4.45.0", + "torch>=2.1.0", ] requires-python = ">=3.10" readme = "README.md" diff --git a/src/askui/models/huggingface/holo1.py b/src/askui/models/huggingface/holo1.py new file mode 100644 index 00000000..93e895bb --- /dev/null +++ b/src/askui/models/huggingface/holo1.py @@ -0,0 +1,224 @@ +"""Holo-1 Vision Language Model implementation for element location. + +This module provides the Holo1LocateModel class that uses the Holo-1 VLM +for locating UI elements on screen based on natural language descriptions. +""" + +import json + +from typing_extensions import override + +from askui.exceptions import AutomationError, ElementNotFoundError +from askui.locators.locators import Locator +from askui.locators.serializers import VlmLocatorSerializer +from askui.logger import logger +from askui.models.models import LocateModel, ModelComposition, Point +from askui.utils.image_utils import ImageSource + + +class Holo1LocateModel(LocateModel): + """Holo-1 model implementation for locating UI elements. + + This model uses the Holo-1 Vision Language Model for element detection + and supports both GPU and CPU inference. + + Attributes: + _model_name: The Hugging Face model identifier + _device: The device to run inference on (cuda/cpu) + _locator_serializer: Serializer for converting locators to prompts + """ + + def __init__( + self, + locator_serializer: VlmLocatorSerializer, + model_name: str = "Hcompany/Holo1-7B", + device: str | None = None, + ) -> None: + """Initialize the Holo-1 model. + + Args: + locator_serializer: Serializer for converting locators to prompts + model_name: The Hugging Face model identifier + device: Device to run inference on. 
If None, auto-detects GPU availability + """ + self._model_name = model_name + self._locator_serializer = locator_serializer + self._model = None + self._processor = None + + # Lazy import to avoid loading heavy dependencies + import torch + + if device is None: + self._device = "cuda" if torch.cuda.is_available() else "cpu" + else: + self._device = device + + logger.info(f"Holo-1 model will use device: {self._device}") + + def _load_model(self) -> None: + """Lazy load the model and processor.""" + if self._model is not None: + return + + logger.info(f"Loading Holo-1 model from {self._model_name}") + + try: + from transformers import AutoModelForImageTextToText, AutoProcessor + + self._processor = AutoProcessor.from_pretrained(self._model_name) + self._model = AutoModelForImageTextToText.from_pretrained( + self._model_name, + torch_dtype="auto", + device_map=self._device if self._device != "cpu" else None, + ) + + # Set to evaluation mode + self._model.eval() + + logger.info("Holo-1 model loaded successfully") + + except Exception as e: + error_msg = f"Failed to load Holo-1 model: {e}" + logger.error(error_msg) + raise AutomationError(error_msg) from e + + def _parse_model_output( + self, output: str, _image_width: int, _image_height: int + ) -> Point: + """Parse the model output to extract coordinates. 
coord_pattern = r"\((\d+),\s*(\d+)\)"
+ + Args: + locator: Element description or locator object + image: Screenshot to analyze + model_choice: Model selection (ignored for single model) + + Returns: + Coordinates of the located element as (x, y) tuple + + Raises: + AutomationError: If model inference fails + ElementNotFoundError: If element cannot be found + """ + if isinstance(model_choice, ModelComposition): + error_msg = "Model composition is not supported for Holo-1" + raise NotImplementedError(error_msg) + + # Ensure model is loaded + self._load_model() + + # Serialize locator if needed + serialized_locator = ( + self._locator_serializer.serialize(locator) + if isinstance(locator, Locator) + else locator + ) + + # Prepare messages for chat template + messages = [ + {"role": "user", "content": f"Locate the UI element: {serialized_locator}"} + ] + + try: + # Apply chat template and process + text = self._processor.apply_chat_template( + messages, tokenize=False, add_generation_prompt=True + ) + + inputs = self._processor( + text=[text], images=image.root, return_tensors="pt" + ) + + # Move to device if not CPU + if self._device != "cpu": + inputs = inputs.to(self._device) + + # Generate response + import torch + + with torch.no_grad(): + generated_ids = self._model.generate( + **inputs, + max_new_tokens=128, + ) + + # Trim generated tokens and decode + generated_ids_trimmed = [ + out_ids[len(in_ids) :] + for in_ids, out_ids in zip( + inputs.input_ids, generated_ids, strict=False + ) + ] + + response = self._processor.batch_decode( + generated_ids_trimmed, skip_special_tokens=True + )[0] + + logger.debug(f"Holo-1 response: {response}") + + # Parse coordinates from response + return self._parse_model_output( + response, image.root.width, image.root.height + ) + + except Exception as e: + if isinstance(e, (ElementNotFoundError, NotImplementedError)): + raise + error_msg = f"Holo-1 inference failed: {e}" + logger.error(error_msg) + raise AutomationError(error_msg) from e diff --git 
a/src/askui/models/huggingface/settings.py b/src/askui/models/huggingface/settings.py new file mode 100644 index 00000000..83ff427e --- /dev/null +++ b/src/askui/models/huggingface/settings.py @@ -0,0 +1,32 @@ +from pydantic import Field +from pydantic_settings import BaseSettings + + +class Holo1Settings(BaseSettings): + """Settings for Holo-1 model configuration. + + Environment variables: + HOLO1_MODEL_NAME: Hugging Face model identifier (default: Hcompany/Holo1-7B) + HOLO1_DEVICE: Device to run inference on (default: auto-detect) + HOLO1_MAX_NEW_TOKENS: Maximum tokens to generate (default: 128) + HOLO1_TEMPERATURE: Sampling temperature (default: 0.1) + """ + + model_name: str = Field( + default="Hcompany/Holo1-7B", + description="Hugging Face model identifier", + ) + device: str | None = Field( + default=None, + description="Device to run inference on (cuda/cpu, auto-detect if None)", + ) + max_new_tokens: int = Field( + default=128, + description="Maximum number of tokens to generate", + ) + temperature: float = Field( + default=0.1, + description="Sampling temperature for generation", + ) + + model_config = {"env_prefix": "HOLO1_"} diff --git a/src/askui/models/model_router.py b/src/askui/models/model_router.py index affee49a..0330dc32 100644 --- a/src/askui/models/model_router.py +++ b/src/askui/models/model_router.py @@ -19,6 +19,8 @@ AskUiComputerAgentSettings, ) from askui.models.exceptions import ModelNotFoundError, ModelTypeMismatchError +from askui.models.huggingface.holo1 import Holo1LocateModel +from askui.models.huggingface.settings import Holo1Settings from askui.models.huggingface.spaces_api import HFSpacesHandler from askui.models.models import ( MODEL_TYPES, @@ -116,6 +118,15 @@ def hf_spaces_handler() -> HFSpacesHandler: locator_serializer=vlm_locator_serializer(), ) + @functools.cache + def holo1_locate_model() -> Holo1LocateModel: + settings = Holo1Settings() + return Holo1LocateModel( + locator_serializer=vlm_locator_serializer(), + 
model_name=settings.model_name, + device=settings.device, + ) + return { ModelName.ASKUI: askui_facade, ModelName.ASKUI__AI_ELEMENT: askui_model_router, @@ -128,6 +139,7 @@ def hf_spaces_handler() -> HFSpacesHandler: ModelName.HF__SPACES__QWEN__QWEN2_VL_7B_INSTRUCT: hf_spaces_handler, ModelName.HF__SPACES__OS_COPILOT__OS_ATLAS_BASE_7B: hf_spaces_handler, ModelName.HF__SPACES__SHOWUI__2B: hf_spaces_handler, + ModelName.HF__HOLO_1: holo1_locate_model, } diff --git a/src/askui/models/models.py b/src/askui/models/models.py index 1c612d8c..99acf558 100644 --- a/src/askui/models/models.py +++ b/src/askui/models/models.py @@ -35,6 +35,7 @@ class ModelName(str, Enum): HF__SPACES__QWEN__QWEN2_VL_7B_INSTRUCT = "Qwen/Qwen2-VL-7B-Instruct" HF__SPACES__SHOWUI__2B = "showlab/ShowUI-2B" TARS = "tars" + HF__HOLO_1 = "holo-1" ANTHROPIC_MODEL_NAME_MAPPING = { diff --git a/tests/integration/models/__init__.py b/tests/integration/models/__init__.py new file mode 100644 index 00000000..b3da0c2c --- /dev/null +++ b/tests/integration/models/__init__.py @@ -0,0 +1 @@ +"""Tests for model implementations.""" \ No newline at end of file diff --git a/tests/integration/models/test_holo1.py b/tests/integration/models/test_holo1.py new file mode 100644 index 00000000..5379641e --- /dev/null +++ b/tests/integration/models/test_holo1.py @@ -0,0 +1,229 @@ +"""Integration tests for Holo-1 model implementation.""" + +import pytest +from PIL import Image +from pytest_mock import MockerFixture + +from askui.locators.locators import Button, Locator +from askui.locators.serializers import VlmLocatorSerializer +from askui.models.huggingface.holo1 import Holo1LocateModel +from askui.models.huggingface.settings import Holo1Settings +from askui.utils.image_utils import Img + + +class TestHolo1Integration: + """Integration tests for Holo-1 model.""" + + @pytest.fixture + def mock_settings(self) -> Holo1Settings: + """Create mock settings for testing.""" + return Holo1Settings( + 
model_name="Hcompany/Holo1-7B", + device="cpu", + max_new_tokens=50, + temperature=0.1, + ) + + @pytest.fixture + def mock_locator_serializer(self) -> VlmLocatorSerializer: + """Create a mock locator serializer.""" + return VlmLocatorSerializer() + + @pytest.fixture + def sample_image(self) -> Img: + """Create a sample image for testing.""" + # Create a simple test image + img = Image.new("RGB", (800, 600), color="white") + return Img(img) + + def test_holo1_initialization( + self, + mock_locator_serializer: VlmLocatorSerializer, + mock_settings: Holo1Settings, + ) -> None: + """Test Holo-1 model initialization.""" + model = Holo1LocateModel( + locator_serializer=mock_locator_serializer, + model_name=mock_settings.model_name, + device=mock_settings.device, + ) + + assert model._model_name == "Hcompany/Holo1-7B" + assert model._device == "cpu" + assert model._model is None # Lazy loading + assert model._processor is None + + def test_holo1_locate_with_string( + self, + mocker: MockerFixture, + mock_locator_serializer: VlmLocatorSerializer, + sample_image: Img, + ) -> None: + """Test locating an element with a string description.""" + # Mock the model loading and inference + mock_processor = mocker.MagicMock() + mock_model = mocker.MagicMock() + + mocker.patch( + "transformers.AutoProcessor.from_pretrained", return_value=mock_processor + ) + mocker.patch( + "transformers.AutoModelForImageTextToText.from_pretrained", + return_value=mock_model, + ) + + # Mock the model output + mock_processor.batch_decode.return_value = ['{"bbox": [100, 200, 150, 250]}'] + mock_model.generate.return_value = [[1, 2, 3]] # Mock token output + + model = Holo1LocateModel( + locator_serializer=mock_locator_serializer, + model_name="Hcompany/Holo1-7B", + device="cpu", + ) + + result = model.locate( + locator="Submit button", + image=sample_image, + model_choice="holo-1", + ) + + assert result == (125, 225) # Center of bbox [100, 200, 150, 250] + + def test_holo1_locate_with_locator_object( + 
locator = Button("Submit")
mock_processor = mocker.MagicMock() + mock_model = mocker.MagicMock() + + mocker.patch( + "transformers.AutoProcessor.from_pretrained", return_value=mock_processor + ) + mocker.patch( + "transformers.AutoModelForImageTextToText.from_pretrained", + return_value=mock_model, + ) + + # Mock the model output without valid coordinates + mock_processor.batch_decode.return_value = ["No element found"] + mock_model.generate.return_value = [[1, 2, 3]] + + model = Holo1LocateModel( + locator_serializer=mock_locator_serializer, + model_name="Hcompany/Holo1-7B", + device="cpu", + ) + + with pytest.raises(ElementNotFoundError): + model.locate( + locator="Submit button", + image=sample_image, + model_choice="holo-1", + ) + + def test_holo1_model_loading_error( + self, + mocker: MockerFixture, + mock_locator_serializer: VlmLocatorSerializer, + sample_image: Img, + ) -> None: + """Test handling of model loading errors.""" + from askui.exceptions import AutomationError + + # Mock model loading to fail + mocker.patch( + "transformers.AutoProcessor.from_pretrained", + side_effect=Exception("Model not found"), + ) + + model = Holo1LocateModel( + locator_serializer=mock_locator_serializer, + model_name="invalid-model", + device="cpu", + ) + + with pytest.raises(AutomationError, match="Failed to load Holo-1 model"): + model.locate( + locator="button", + image=sample_image, + model_choice="holo-1", + )