Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
6 changes: 2 additions & 4 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -731,10 +731,8 @@ with VisionAgent() as agent:
```

**⚠️ Limitations:**
- Not all models support response schemas or all kinds of properties that a response schema can have at the moment
- Default values are not supported, e.g., `url: str = "github.com"` or `url: str | None = None`. This includes `default_factory`
and `default` args of `pydantic.Field` as well, e.g., `url: str = Field(default="github.com")` or
`url: str = Field(default_factory=lambda: "github.com")`.
- The support for response schemas varies among models. Currently, the `askui` model provides the best support for response schemas,
as we try different models under the hood with your schema to see which one works best.

## What is AskUI Vision Agent?

Expand Down
88 changes: 87 additions & 1 deletion pdm.lock

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

1 change: 1 addition & 0 deletions pyproject.toml
Original file line number Diff line number Diff line change
Expand Up @@ -23,6 +23,7 @@ dependencies = [
"tenacity>=9.1.2",
"jsonref>=1.1.0",
"protobuf>=6.31.1",
"google-genai>=1.20.0",
]
requires-python = ">=3.10"
readme = "README.md"
Expand Down
72 changes: 72 additions & 0 deletions src/askui/models/askui/get_model.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,72 @@
from typing import Type

from google.genai.errors import ClientError
from typing_extensions import override

from askui.logger import logger
from askui.models.askui.google_genai_api import AskUiGoogleGenAiApi
from askui.models.askui.inference_api import AskUiInferenceApi
from askui.models.exceptions import QueryNoResponseError, QueryUnexpectedResponseError
from askui.models.models import GetModel
from askui.models.types.response_schemas import ResponseSchema
from askui.utils.image_utils import ImageSource


class AskUiGetModel(GetModel):
    """Composite `GetModel` combining the available AskUI models to be as
    comprehensive and powerful as possible.

    The Google GenAI API is always tried first. When it fails in a way that
    another backend may handle — no response, unexpected response, an
    unsupported feature (`NotImplementedError`), or a client error with HTTP
    status 400 — the same query is retried against the AskUI Inference API.
    Any other `ClientError` is surfaced to the caller unchanged.

    Args:
        google_genai_api (AskUiGoogleGenAiApi): The Google GenAI API instance to use
            as primary.
        inference_api (AskUiInferenceApi): The Inference API instance to use as
            fallback.
    """

    def __init__(
        self,
        google_genai_api: AskUiGoogleGenAiApi,
        inference_api: AskUiInferenceApi,
    ) -> None:
        self._google_genai_api = google_genai_api
        self._inference_api = inference_api

    def _get_with_fallback(
        self,
        error: Exception,
        query: str,
        image: ImageSource,
        response_schema: Type[ResponseSchema] | None,
        model_choice: str,
    ) -> ResponseSchema | str:
        # Retry against the Inference API: the error that brought us here may
        # be specific to the Google GenAI backend.
        logger.debug(
            f"Google GenAI API failed with error that may not occur with other "
            f"models/apis: {error}"
            ". Falling back to Inference API..."
        )
        return self._inference_api.get(
            query=query,
            image=image,
            response_schema=response_schema,
            model_choice=model_choice,
        )

    @override
    def get(
        self,
        query: str,
        image: ImageSource,
        response_schema: Type[ResponseSchema] | None,
        model_choice: str,
    ) -> ResponseSchema | str:
        try:
            logger.debug("Attempting to use Google GenAI API")
            return self._google_genai_api.get(
                query=query,
                image=image,
                response_schema=response_schema,
                model_choice=model_choice,
            )
        except ClientError as error:
            # Only a 400 (bad request, e.g. rejected schema) is worth retrying
            # with another backend; everything else is re-raised as-is.
            if error.code != 400:
                raise
            return self._get_with_fallback(
                error, query, image, response_schema, model_choice
            )
        except (
            QueryNoResponseError,
            QueryUnexpectedResponseError,
            NotImplementedError,
        ) as error:
            return self._get_with_fallback(
                error, query, image, response_schema, model_choice
            )
92 changes: 92 additions & 0 deletions src/askui/models/askui/google_genai_api.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,92 @@
import json as json_lib
from typing import Type

import google.genai as genai
from google.genai import types as genai_types
from pydantic import ValidationError
from typing_extensions import override

from askui.logger import logger
from askui.models.askui.inference_api import AskUiInferenceApiSettings
from askui.models.exceptions import QueryNoResponseError, QueryUnexpectedResponseError
from askui.models.models import GetModel, ModelName
from askui.models.shared.prompts import SYSTEM_PROMPT_GET
from askui.models.types.response_schemas import ResponseSchema, to_response_schema
from askui.utils.image_utils import ImageSource

# Namespace prefix used by AskUI-branded model choices, e.g. "askui/gemini-2.5-pro".
ASKUI_MODEL_CHOICE_PREFIX = "askui/"
ASKUI_MODEL_CHOICE_PREFIX_LEN = len(ASKUI_MODEL_CHOICE_PREFIX)


def _extract_model_id(model_choice: str) -> str:
    """Map a user-facing model choice to the bare Google model id.

    The generic `"askui"` choice resolves to Gemini 2.5 Flash; an `"askui/"`
    namespace prefix is stripped; anything else is passed through unchanged.
    """
    if model_choice == ModelName.ASKUI:
        return ModelName.GEMINI__2_5__FLASH
    # str.removeprefix only strips when the prefix is present, so this covers
    # both the prefixed and the pass-through case.
    return model_choice.removeprefix(ASKUI_MODEL_CHOICE_PREFIX)


class AskUiGoogleGenAiApi(GetModel):
    """`GetModel` implementation that extracts information from images using
    Google GenAI models reached through the AskUI proxy (`/proxy/vertexai`).

    Args:
        settings (AskUiInferenceApiSettings | None, optional): Connection
            settings; a default instance is created when omitted.
    """

    def __init__(self, settings: AskUiInferenceApiSettings | None = None) -> None:
        self._settings = settings or AskUiInferenceApiSettings()
        # The genai client insists on an api_key even though authentication is
        # actually performed via the Authorization header of the AskUI proxy.
        self._client = genai.Client(
            vertexai=True,
            api_key="DummyValueRequiredByGenaiClient",
            http_options=genai_types.HttpOptions(
                base_url=f"{self._settings.base_url}/proxy/vertexai",
                headers={
                    "Authorization": self._settings.authorization_header,
                },
            ),
        )

    @override
    def get(
        self,
        query: str,
        image: ImageSource,
        response_schema: Type[ResponseSchema] | None,
        model_choice: str,
    ) -> ResponseSchema | str:
        """Answer `query` about `image`, optionally validating the model's
        JSON output against `response_schema`.

        Raises:
            QueryNoResponseError: The model returned no text.
            QueryUnexpectedResponseError: The model's JSON did not validate
                against the response schema.
            NotImplementedError: The response schema is recursive (its JSON
                schema generation hits a `RecursionError`).
        """
        try:
            # Wrap the user schema (or plain-string default) into a validating
            # root model.
            _response_schema = to_response_schema(response_schema)
            json_schema = _response_schema.model_json_schema()
            logger.debug(f"json_schema:\n{json_lib.dumps(json_schema)}")
            content = genai_types.Content(
                parts=[
                    genai_types.Part.from_bytes(
                        data=image.to_bytes(),
                        mime_type="image/png",
                    ),
                    genai_types.Part.from_text(text=query),
                ],
                role="user",
            )
            generate_content_response = self._client.models.generate_content(
                model=f"models/{_extract_model_id(model_choice)}",
                contents=content,
                config={
                    "response_mime_type": "application/json",
                    "response_schema": _response_schema,
                    "system_instruction": SYSTEM_PROMPT_GET,
                },
            )
            json_str = generate_content_response.text
            if json_str is None:
                raise QueryNoResponseError(
                    message="No response from the model", query=query
                )
            try:
                # Validate and unwrap the root model to the caller's type.
                return _response_schema.model_validate_json(json_str).root
            except ValidationError as e:
                error_message = str(e.errors())
                raise QueryUnexpectedResponseError(
                    message=f"Unexpected response from the model: {error_message}",
                    query=query,
                    response=json_str,
                ) from e
        except RecursionError as e:
            # model_json_schema() recurses infinitely on self-referential
            # schemas; translate that into an explicit unsupported-feature error.
            error_message = (
                "Recursive response schemas are not supported by AskUiGoogleGenAiApi"
            )
            raise NotImplementedError(error_message) from e
17 changes: 16 additions & 1 deletion src/askui/models/model_router.py
Original file line number Diff line number Diff line change
Expand Up @@ -7,6 +7,8 @@
from askui.locators.serializers import AskUiLocatorSerializer, VlmLocatorSerializer
from askui.models.anthropic.messages_api import AnthropicMessagesApi
from askui.models.askui.ai_element_utils import AiElementCollection
from askui.models.askui.get_model import AskUiGetModel
from askui.models.askui.google_genai_api import AskUiGoogleGenAiApi
from askui.models.askui.model_router import AskUiModelRouter
from askui.models.exceptions import ModelNotFoundError, ModelTypeMismatchError
from askui.models.huggingface.spaces_api import HFSpacesHandler
Expand Down Expand Up @@ -57,6 +59,10 @@ def anthropic_facade() -> ModelFacade:
locate_model=messages_api,
)

@functools.cache
def askui_google_genai_api() -> AskUiGoogleGenAiApi:
    # Cached factory: all callers share a single Google GenAI API instance.
    return AskUiGoogleGenAiApi()

@functools.cache
def askui_inference_api() -> AskUiInferenceApi:
return AskUiInferenceApi(
Expand All @@ -72,6 +78,13 @@ def askui_model_router() -> AskUiModelRouter:
inference_api=askui_inference_api(),
)

@functools.cache
def askui_get_model() -> AskUiGetModel:
    # Cached factory for the composite get model: Google GenAI API is the
    # primary backend, the AskUI Inference API the fallback.
    return AskUiGetModel(
        google_genai_api=askui_google_genai_api(),
        inference_api=askui_inference_api(),
    )

@functools.cache
def askui_facade() -> ModelFacade:
computer_agent = Agent(
Expand All @@ -80,7 +93,7 @@ def askui_facade() -> ModelFacade:
)
return ModelFacade(
act_model=computer_agent,
get_model=askui_inference_api(),
get_model=askui_get_model(),
locate_model=askui_model_router(),
)

Expand All @@ -93,6 +106,8 @@ def hf_spaces_handler() -> HFSpacesHandler:
return {
ModelName.ANTHROPIC__CLAUDE__3_5__SONNET__20241022: anthropic_facade,
ModelName.ASKUI: askui_facade,
ModelName.ASKUI__GEMINI__2_5__FLASH: askui_google_genai_api,
ModelName.ASKUI__GEMINI__2_5__PRO: askui_google_genai_api,
ModelName.ASKUI__AI_ELEMENT: askui_model_router,
ModelName.ASKUI__COMBO: askui_model_router,
ModelName.ASKUI__OCR: askui_model_router,
Expand Down
4 changes: 4 additions & 0 deletions src/askui/models/models.py
Original file line number Diff line number Diff line change
Expand Up @@ -27,11 +27,15 @@ class ModelName(str, Enum):

ANTHROPIC__CLAUDE__3_5__SONNET__20241022 = "anthropic-claude-3-5-sonnet-20241022"
ASKUI = "askui"
ASKUI__GEMINI__2_5__FLASH = "askui/gemini-2.5-flash"
ASKUI__GEMINI__2_5__PRO = "askui/gemini-2.5-pro"
ASKUI__AI_ELEMENT = "askui-ai-element"
ASKUI__COMBO = "askui-combo"
ASKUI__OCR = "askui-ocr"
ASKUI__PTA = "askui-pta"
CLAUDE__SONNET__4__20250514 = "claude-sonnet-4-20250514"
GEMINI__2_5__FLASH = "gemini-2.5-flash"
GEMINI__2_5__PRO = "gemini-2.5-pro"
HF__SPACES__ASKUI__PTA_1 = "AskUI/PTA-1"
HF__SPACES__OS_COPILOT__OS_ATLAS_BASE_7B = "OS-Copilot/OS-Atlas-Base-7B"
HF__SPACES__QWEN__QWEN2_VL_2B_INSTRUCT = "Qwen/Qwen2-VL-2B-Instruct"
Expand Down
10 changes: 10 additions & 0 deletions src/askui/utils/image_utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -410,6 +410,16 @@ def to_base64(self) -> str:
"""
return image_to_base64(image=self.root)

def to_bytes(self) -> bytes:
    """Serialize the underlying image to PNG-encoded bytes.

    Returns:
        bytes: The PNG-encoded image data.
    """
    with io.BytesIO() as buffer:
        self.root.save(buffer, format="PNG")
        return buffer.getvalue()


__all__ = [
"load_image",
Expand Down
Loading