Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
4 changes: 2 additions & 2 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -446,15 +446,15 @@ You can use Vision Agent with [OpenRouter](https://openrouter.ai/) to access a w
```python
from askui import VisionAgent
from askui.models import (
OpenRouterGetModel,
OpenRouterModel,
OpenRouterSettings,
ModelRegistry,
)


# Register OpenRouter model in the registry
custom_models: ModelRegistry = {
"my-custom-model": OpenRouterGetModel(
"my-custom-model": OpenRouterModel(
OpenRouterSettings(
model="anthropic/claude-opus-4",
)
Expand Down
6 changes: 4 additions & 2 deletions src/askui/models/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -11,7 +11,7 @@
OnMessageCb,
Point,
)
from .openrouter.handler import OpenRouterGetModel
from .openrouter.model import OpenRouterModel
from .openrouter.settings import OpenRouterSettings
from .shared.computer_agent_message_param import (
Base64ImageSourceParam,
Expand All @@ -28,6 +28,7 @@
ToolUseBlockParam,
UrlImageSourceParam,
)
from .shared.settings import ChatCompletionsCreateSettings

__all__ = [
"ActModel",
Expand All @@ -54,6 +55,7 @@
"ToolResultBlockParam",
"ToolUseBlockParam",
"UrlImageSourceParam",
"OpenRouterGetModel",
"OpenRouterModel",
"OpenRouterSettings",
"ChatCompletionsCreateSettings",
]
Original file line number Diff line number Diff line change
Expand Up @@ -21,6 +21,7 @@
ModelName,
Point,
)
from askui.models.shared.prompts import SYSTEM_PROMPT_GET, build_system_prompt_locate
from askui.models.types.response_schemas import ResponseSchema
from askui.utils.image_utils import (
ImageSource,
Expand All @@ -47,8 +48,8 @@ def _inference(
) -> list[anthropic.types.ContentBlock]:
message = self._client.messages.create(
model=model,
max_tokens=self._settings.max_tokens,
temperature=self._settings.temperature,
max_tokens=self._settings.chat_completions_create_settings.max_tokens,
temperature=self._settings.chat_completions_create_settings.temperature,
system=system_prompt,
messages=[
{
Expand Down Expand Up @@ -87,12 +88,11 @@ def locate(
prompt = f"Click on {locator_serialized}"
screen_width = self._settings.resolution[0]
screen_height = self._settings.resolution[1]
system_prompt = f"Use a mouse and keyboard to interact with a computer, and take screenshots.\n* This is an interface to a desktop GUI. You do not have access to a terminal or applications menu. You must click on desktop icons to start applications.\n* Some applications may take time to start or process actions, so you may need to wait and take successive screenshots to see the results of your actions. E.g. if you click on Firefox and a window doesn't open, try taking another screenshot.\n* The screen's resolution is {screen_width}x{screen_height}.\n* The display number is 0\n* Whenever you intend to move the cursor to click on an element like an icon, you should consult a screenshot to determine the coordinates of the element before moving the cursor.\n* If you tried clicking on a program or link but it failed to load, even after waiting, try adjusting your cursor position so that the tip of the cursor visually falls on the element that you want to click.\n* Make sure to click any buttons, links, icons, etc with the cursor tip in the center of the element. Don't click boxes on their edges unless asked.\n" # noqa: E501
scaled_image = scale_image_with_padding(image.root, screen_width, screen_height)
response = self._inference(
image_to_base64(scaled_image),
prompt,
system_prompt,
build_system_prompt_locate(str(screen_width), str(screen_height)),
model=ANTHROPIC_MODEL_NAME_MAPPING[ModelName(model_choice)],
)
assert len(response) > 0
Expand Down Expand Up @@ -129,11 +129,10 @@ def get(
max_width=self._settings.resolution[0],
max_height=self._settings.resolution[1],
)
system_prompt = "You are an agent to process screenshots and answer questions about things on the screen or extract information from it. Answer only with the response to the question and keep it short and precise." # noqa: E501
response = self._inference(
base64_image=image_to_base64(scaled_image),
prompt=query,
system_prompt=system_prompt,
system_prompt=SYSTEM_PROMPT_GET,
model=ANTHROPIC_MODEL_NAME_MAPPING[ModelName(model_choice)],
)
if len(response) == 0:
Expand Down
8 changes: 6 additions & 2 deletions src/askui/models/anthropic/settings.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,12 +2,14 @@
from pydantic_settings import BaseSettings

from askui.models.shared.computer_agent import ComputerAgentSettingsBase
from askui.models.shared.settings import ChatCompletionsCreateSettings

COMPUTER_USE_BETA_FLAG = "computer-use-2024-10-22"


class AnthropicSettings(BaseSettings):
api_key: SecretStr = Field(
default=...,
min_length=1,
validation_alias="ANTHROPIC_API_KEY",
)
Expand All @@ -19,8 +21,10 @@ class ClaudeSettingsBase(BaseModel):

class ClaudeSettings(ClaudeSettingsBase):
resolution: tuple[int, int] = Field(default_factory=lambda: (1280, 800))
max_tokens: int = 1000
temperature: float = 0.0
chat_completions_create_settings: ChatCompletionsCreateSettings = Field(
default_factory=ChatCompletionsCreateSettings,
description="Settings for ChatCompletions",
)


class ClaudeComputerAgentSettings(ComputerAgentSettingsBase, ClaudeSettingsBase):
Expand Down
2 changes: 1 addition & 1 deletion src/askui/models/model_router.py
Original file line number Diff line number Diff line change
Expand Up @@ -37,7 +37,7 @@

from ..logger import logger
from .anthropic.computer_agent import ClaudeComputerAgent
from .anthropic.handler import ClaudeHandler
from .anthropic.model import ClaudeHandler
from .askui.inference_api import AskUiInferenceApi, AskUiSettings
from .ui_tars_ep.ui_tars_api import UiTarsApiHandler, UiTarsApiHandlerSettings

Expand Down
77 changes: 0 additions & 77 deletions src/askui/models/openrouter/handler.py

This file was deleted.

185 changes: 185 additions & 0 deletions src/askui/models/openrouter/model.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,185 @@
import json
from typing import TYPE_CHECKING, Any, Optional, Type

import openai
from openai import OpenAI
from typing_extensions import override

from askui.logger import logger
from askui.models.exceptions import QueryNoResponseError
from askui.models.models import GetModel
from askui.models.shared.prompts import SYSTEM_PROMPT_GET
from askui.models.types.response_schemas import ResponseSchema, to_response_schema
from askui.utils.image_utils import ImageSource

from .settings import OpenRouterSettings

if TYPE_CHECKING:
from openai.types.chat.completion_create_params import ResponseFormat


def _clean_schema_refs(schema: dict[str, Any] | list[Any]) -> None:
"""Remove title fields that are at the same level as $ref fields as they are not supported by OpenAI.""" # noqa: E501
if isinstance(schema, dict):
if "$ref" in schema and "title" in schema:
del schema["title"]
for value in schema.values():
if isinstance(value, (dict, list)):
_clean_schema_refs(value)
elif isinstance(schema, list):
for item in schema:
if isinstance(item, (dict, list)):
_clean_schema_refs(item)


class OpenRouterModel(GetModel):
    """
    This class implements the GetModel interface for the OpenRouter API.

    Args:
        settings (OpenRouterSettings): The settings for the OpenRouter model.
        client (OpenAI | None): Optional pre-built OpenAI client. If not
            provided, a client is created from the settings' API key and base
            URL. Mainly useful for injecting a mock in tests.

    Example:
        ```python
        from askui import VisionAgent
        from askui.models import (
            OpenRouterModel,
            OpenRouterSettings,
            ModelRegistry,
        )


        # Register OpenRouter model in the registry
        custom_models: ModelRegistry = {
            "my-custom-model": OpenRouterModel(
                OpenRouterSettings(
                    model="anthropic/claude-opus-4",
                )
            ),
        }

        with VisionAgent(models=custom_models, model={"get":"my-custom-model"}) as agent:
            result = agent.get("What is the main heading on the screen?")
            print(result)
        ```
    """  # noqa: E501

    def __init__(
        self,
        settings: OpenRouterSettings | None = None,
        client: Optional[OpenAI] = None,
    ):
        self._settings = settings or OpenRouterSettings()

        # Allow dependency injection of a client (e.g. for testing); otherwise
        # build one from the configured API key and OpenRouter base URL.
        self._client = (
            client
            if client is not None
            else OpenAI(
                api_key=self._settings.open_router_api_key.get_secret_value(),
                base_url=str(self._settings.base_url),
            )
        )

    def _predict(
        self,
        image_url: str,
        instruction: str,
        prompt: str,
        response_schema: type[ResponseSchema] | None,
    ) -> str | None | ResponseSchema:
        """Send a single vision chat-completion request and parse the reply.

        Args:
            image_url (str): Data URL (or regular URL) of the screenshot.
            instruction (str): The user's query about the image.
            prompt (str): Prompt text prepended to the instruction in the
                user message.
            response_schema (type[ResponseSchema] | None): Optional schema the
                model's JSON answer is validated against.

        Returns:
            str | None | ResponseSchema: The validated response when a schema
            is given, otherwise the raw message content (may be `None` if the
            model returned no content).

        Raises:
            ValueError: If a schema was requested but the model did not return
                valid JSON.
        """
        extra_body: dict[str, object] = {}

        # OpenRouter-specific fallback list: when set, OpenRouter may route
        # the request to any of these models.
        if len(self._settings.models) > 0:
            extra_body["models"] = self._settings.models

        _response_schema = (
            to_response_schema(response_schema) if response_schema else None
        )

        response_format: openai.NotGiven | ResponseFormat = openai.NOT_GIVEN
        if _response_schema is not None:
            # Restrict routing to providers that support the requested
            # parameters (here: structured outputs).
            extra_body["provider"] = {"require_parameters": True}
            schema = _response_schema.model_json_schema()
            # OpenAI rejects "title" next to "$ref"; strip those in place.
            _clean_schema_refs(schema)

            # Wrap the user schema under a single required "response" property
            # so strict JSON-schema mode has a closed root object. "$defs"
            # must be re-attached at the root for "$ref" resolution to work.
            defs = schema.pop("$defs", None)
            schema_response_wrapper = {
                "type": "object",
                "properties": {"response": schema},
                "additionalProperties": False,
                "required": ["response"],
            }
            if defs:
                schema_response_wrapper["$defs"] = defs
            response_format = {
                "type": "json_schema",
                "json_schema": {
                    "name": "user_json_schema",
                    "schema": schema_response_wrapper,
                    "strict": True,
                },
            }

        chat_completion = self._client.chat.completions.create(
            model=self._settings.model,
            extra_body=extra_body,
            response_format=response_format,
            messages=[
                {
                    "role": "user",
                    "content": [
                        {
                            "type": "image_url",
                            "image_url": {
                                "url": image_url,
                            },
                        },
                        {"type": "text", "text": prompt + instruction},
                    ],
                }
            ],
            stream=False,
            top_p=self._settings.chat_completions_create_settings.top_p,
            temperature=self._settings.chat_completions_create_settings.temperature,
            max_tokens=self._settings.chat_completions_create_settings.max_tokens,
            seed=self._settings.chat_completions_create_settings.seed,
            stop=self._settings.chat_completions_create_settings.stop,
            frequency_penalty=self._settings.chat_completions_create_settings.frequency_penalty,
            presence_penalty=self._settings.chat_completions_create_settings.presence_penalty,
        )

        model_response = chat_completion.choices[0].message.content

        if _response_schema is not None and model_response is not None:
            try:
                response_json = json.loads(model_response)
            except json.JSONDecodeError:
                error_msg = f"Expected JSON, but model {self._settings.model} returned: {model_response}"  # noqa: E501
                logger.error(error_msg)
                raise ValueError(error_msg) from None

            # The answer is nested under "response" by the schema wrapper
            # built above; unwrap it before validation.
            validated_response = _response_schema.model_validate(
                response_json["response"]
            )
            return validated_response.root

        return model_response

    @override
    def get(
        self,
        query: str,
        image: ImageSource,
        response_schema: Type[ResponseSchema] | None,
        model_choice: str,
    ) -> ResponseSchema | str:
        """Answer `query` about `image` using the configured OpenRouter model.

        Args:
            query (str): The question to ask about the image.
            image (ImageSource): The screenshot to analyze.
            response_schema (Type[ResponseSchema] | None): Optional schema the
                answer must conform to.
            model_choice (str): Name of the model choice (used for error
                reporting only; the actual model comes from the settings).

        Returns:
            ResponseSchema | str: The (optionally schema-validated) answer.

        Raises:
            QueryNoResponseError: If the model returned no content.
        """
        response = self._predict(
            image_url=image.to_data_url(),
            instruction=query,
            prompt=SYSTEM_PROMPT_GET,
            response_schema=response_schema,
        )
        if response is None:
            error_msg = f'No response from model "{model_choice}" to query: "{query}"'
            raise QueryNoResponseError(error_msg, query)
        return response
1 change: 0 additions & 1 deletion src/askui/models/openrouter/prompts.py

This file was deleted.

Loading