Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
4 changes: 2 additions & 2 deletions .cursorrules
Original file line number Diff line number Diff line change
Expand Up @@ -40,15 +40,15 @@
def locate(
self,
locator: str | Locator,
screenshot: Img | None = None,
screenshot: InputSource | None = None,
model: ModelComposition | str | None = None,
) -> Point:
"""
Find the position of the UI element identified by the `locator` using the `model`.

Args:
locator (str | Locator): The identifier or description of the element to locate.
screenshot (Img | None, optional): The screenshot to use for locating the
screenshot (InputSource | None, optional): The screenshot to use for locating the
element. Can be a path to an image file, a PIL Image object or a data URL.
If `None`, takes a screenshot of the currently selected screen.
model (ModelComposition | str | None, optional): The composition or name of
Expand Down
11 changes: 11 additions & 0 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -756,6 +756,17 @@ with VisionAgent() as agent:
as we try different models under the hood with your schema to see which one works best.
- PDF processing is only supported for Gemini models hosted on AskUI and for PDFs up to 20MB.

### 📄 Document Processing with `markitdown`

When extracting data from office documents such as Word (`.docx`) or Excel (`.xls`, `.xlsx`) files, we use the `markitdown` library to convert them into Markdown. We chose `markitdown` over other tools for several reasons:

- **LLM-Friendly Output:** The markdown output is optimized for token usage, which is efficient for subsequent processing with large language models.
- **Includes Sheet Names:** When converting Excel files, the name of the sheet is included in the generated markdown, providing better context.
- **Enhanced Image Descriptions:** It can use an OpenAI client (`llm_client` and `llm_model`) to generate more descriptive captions for images within documents.
- **No Local Inference:** No model inference is performed on the client machine, which means no need to install and maintain heavy packages like `torch`.
- **Optional Dependencies:** It allows for optional imports, meaning you only need to install the dependencies for the file types you are working with. This reduces the number of packages to manage.
- **Microsoft Maintained:** Being maintained by Microsoft, it offers robust support for converting Office documents.

## What is AskUI Vision Agent?

**AskUI Vision Agent** is a versatile AI powered framework that enables you to automate computer tasks in Python.
Expand Down
509 changes: 507 additions & 2 deletions pdm.lock

Large diffs are not rendered by default.

3 changes: 2 additions & 1 deletion pyproject.toml
Original file line number Diff line number Diff line change
Expand Up @@ -26,6 +26,7 @@ dependencies = [
"protobuf>=6.31.1",
"google-genai>=1.20.0",
"filetype>=1.2.0",
"markitdown[xls,xlsx,docx]>=0.1.2",
]
requires-python = ">=3.10"
readme = "README.md"
Expand Down Expand Up @@ -224,4 +225,4 @@ pynput = [
]
web = [
"playwright>=1.41.0",
]
]
5 changes: 3 additions & 2 deletions src/askui/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -37,7 +37,8 @@
from .models.types.response_schemas import ResponseSchema, ResponseSchemaBase
from .retry import ConfigurableRetry, Retry
from .tools import ModifierKey, PcKey
from .utils.image_utils import ImageSource, Img
from .utils.image_utils import ImageSource
from .utils.source_utils import InputSource

try:
from .android_agent import AndroidVisionAgent
Expand Down Expand Up @@ -67,7 +68,7 @@
"GetModel",
"ImageBlockParam",
"ImageSource",
"Img",
"InputSource",
"LocateModel",
"Locator",
"MessageParam",
Expand Down
38 changes: 20 additions & 18 deletions src/askui/agent_base.py
Original file line number Diff line number Diff line change
Expand Up @@ -16,9 +16,8 @@
from askui.models.shared.tools import Tool
from askui.tools.agent_os import AgentOs
from askui.tools.android.agent_os import AndroidAgentOs
from askui.utils.image_utils import ImageSource, Img
from askui.utils.pdf_utils import Pdf
from askui.utils.source_utils import load_image_source, load_source
from askui.utils.image_utils import ImageSource
from askui.utils.source_utils import InputSource, load_image_source, load_source

from .logger import configure_logging, logger
from .models import ModelComposition
Expand Down Expand Up @@ -193,15 +192,15 @@ def get(
query: Annotated[str, Field(min_length=1)],
response_schema: None = None,
model: str | None = None,
source: Optional[Img | Pdf] = None,
source: Optional[InputSource] = None,
) -> str: ...
@overload
def get(
self,
query: Annotated[str, Field(min_length=1)],
response_schema: Type[ResponseSchema],
model: str | None = None,
source: Optional[Img | Pdf] = None,
source: Optional[InputSource] = None,
) -> ResponseSchema: ...

@telemetry.record_call(exclude={"query", "source", "response_schema"})
Expand All @@ -211,7 +210,7 @@ def get(
query: Annotated[str, Field(min_length=1)],
response_schema: Type[ResponseSchema] | None = None,
model: str | None = None,
source: Optional[Img | Pdf] = None,
source: Optional[InputSource] = None,
) -> ResponseSchema | str:
"""
Retrieves information from an image, PDF, or office document based on the provided `query`.
Expand All @@ -220,9 +219,10 @@ def get(

Args:
query (str): The query describing what information to retrieve.
source (Img | Pdf | None, optional): The source to extract information from.
Can be a path to a PDF file, a path to an image file, a PIL Image
object or a data URL. Defaults to a screenshot of the current screen.
source (InputSource | None, optional): The source to extract information
from. Can be a path to an image, PDF, or office document file,
a PIL Image object or a data URL. Defaults to a screenshot of the
current screen.
response_schema (Type[ResponseSchema] | None, optional): A Pydantic model
class that defines the response schema. If not provided, returns a
string.
Expand Down Expand Up @@ -357,7 +357,7 @@ class LinkedListNode(ResponseSchemaBase):
def _locate(
self,
locator: str | Locator,
screenshot: Optional[Img] = None,
screenshot: Optional[InputSource] = None,
model: ModelComposition | str | None = None,
) -> PointList:
def locate_with_screenshot() -> PointList:
Expand All @@ -380,7 +380,7 @@ def locate_with_screenshot() -> PointList:
def locate(
self,
locator: str | Locator,
screenshot: Optional[Img] = None,
screenshot: Optional[InputSource] = None,
model: ModelComposition | str | None = None,
) -> Point:
"""
Expand All @@ -389,9 +389,10 @@ def locate(
Args:
locator (str | Locator): The identifier or description of the element to
locate.
screenshot (Img | None, optional): The screenshot to use for locating the
element. Can be a path to an image file, a PIL Image object or a data
URL. If `None`, takes a screenshot of the currently selected display.
screenshot (InputSource | None, optional): The screenshot to use for
locating the element. Can be a path to an image file, a PIL Image object
or a data URL. If `None`, takes a screenshot of the currently
selected display.
model (ModelComposition | str | None, optional): The composition or name
of the model(s) to be used for locating the element using the `locator`.

Expand Down Expand Up @@ -419,7 +420,7 @@ def locate(
def locate_all(
self,
locator: str | Locator,
screenshot: Optional[Img] = None,
screenshot: Optional[InputSource] = None,
model: ModelComposition | str | None = None,
) -> PointList:
"""
Expand All @@ -431,9 +432,10 @@ def locate_all(
Args:
locator (str | Locator): The identifier or description of the element to
locate.
screenshot (Img | None, optional): The screenshot to use for locating the
element. Can be a path to an image file, a PIL Image object or a data
URL. If `None`, takes a screenshot of the currently selected display.
screenshot (InputSource | None, optional): The screenshot to use for
locating the element. Can be a path to an image file, a PIL Image object
or a data URL. If `None`, takes a screenshot of the currently
selected display.
model (ModelComposition | str | None, optional): The composition or name
of the model(s) to be used for locating the element using the `locator`.

Expand Down
8 changes: 6 additions & 2 deletions src/askui/models/anthropic/messages_api.py
Original file line number Diff line number Diff line change
Expand Up @@ -42,6 +42,7 @@
from askui.models.shared.tools import ToolCollection
from askui.models.types.response_schemas import ResponseSchema
from askui.utils.dict_utils import IdentityDefaultDict
from askui.utils.excel_utils import OfficeDocumentSource
from askui.utils.image_utils import (
ImageSource,
image_to_base64,
Expand Down Expand Up @@ -242,8 +243,11 @@ def get(
response_schema: Type[ResponseSchema] | None,
model_choice: str,
) -> ResponseSchema | str:
if isinstance(source, PdfSource):
err_msg = f"PDF processing is not supported for the model {model_choice}"
if isinstance(source, (PdfSource, OfficeDocumentSource)):
err_msg = (
f"PDF or Office Document processing is not supported for the model: "
f"{model_choice}"
)
raise NotImplementedError(err_msg)
try:
if response_schema is not None:
Expand Down
5 changes: 5 additions & 0 deletions src/askui/models/askui/google_genai_api.py
Original file line number Diff line number Diff line change
Expand Up @@ -21,6 +21,7 @@
from askui.models.models import GetModel, ModelName
from askui.models.shared.prompts import SYSTEM_PROMPT_GET
from askui.models.types.response_schemas import ResponseSchema, to_response_schema
from askui.utils.excel_utils import OfficeDocumentSource
from askui.utils.http_utils import parse_retry_after_header
from askui.utils.image_utils import ImageSource
from askui.utils.source_utils import Source
Expand Down Expand Up @@ -185,6 +186,10 @@ def _create_genai_part_from_source(self, source: Source) -> genai_types.Part:
data=data,
mime_type="image/png",
)
if isinstance(source, OfficeDocumentSource):
with source.reader as r:
data = r.read()
return genai_types.Part.from_text(text=data.decode())
with source.reader as r:
data = r.read()
if len(data) > MAX_FILE_SIZE_BYTES:
Expand Down
8 changes: 6 additions & 2 deletions src/askui/models/askui/inference_api.py
Original file line number Diff line number Diff line change
Expand Up @@ -26,6 +26,7 @@
from askui.models.shared.settings import MessageSettings
from askui.models.shared.tools import ToolCollection
from askui.models.types.response_schemas import ResponseSchema
from askui.utils.excel_utils import OfficeDocumentSource
from askui.utils.image_utils import ImageSource
from askui.utils.pdf_utils import PdfSource
from askui.utils.source_utils import Source
Expand Down Expand Up @@ -205,8 +206,11 @@ def get(
response_schema: Type[ResponseSchema] | None,
model_choice: str,
) -> ResponseSchema | str:
if isinstance(source, PdfSource):
err_msg = f"PDF processing is not supported for the model {model_choice}"
if isinstance(source, (PdfSource, OfficeDocumentSource)):
err_msg = (
f"PDF or Office Document processing is not supported for the model: "
f"{model_choice}"
)
raise NotImplementedError(err_msg)
json: dict[str, Any] = {
"image": source.to_data_url(),
Expand Down
8 changes: 6 additions & 2 deletions src/askui/models/openrouter/model.py
Original file line number Diff line number Diff line change
Expand Up @@ -10,6 +10,7 @@
from askui.models.models import GetModel
from askui.models.shared.prompts import SYSTEM_PROMPT_GET
from askui.models.types.response_schemas import ResponseSchema, to_response_schema
from askui.utils.excel_utils import OfficeDocumentSource
from askui.utils.pdf_utils import PdfSource
from askui.utils.source_utils import Source

Expand Down Expand Up @@ -174,8 +175,11 @@ def get(
response_schema: Type[ResponseSchema] | None,
model_choice: str,
) -> ResponseSchema | str:
if isinstance(source, PdfSource):
err_msg = f"PDF processing is not supported for the model {model_choice}"
if isinstance(source, (PdfSource, OfficeDocumentSource)):
err_msg = (
f"PDF or Office Document processing is not supported for the model: "
f"{model_choice}"
)
raise NotImplementedError(err_msg)
response = self._predict(
image_url=source.to_data_url(),
Expand Down
5 changes: 3 additions & 2 deletions src/askui/models/ui_tars_ep/ui_tars_api.py
Original file line number Diff line number Diff line change
Expand Up @@ -24,6 +24,7 @@
from askui.models.shared.tools import Tool
from askui.models.types.response_schemas import ResponseSchema
from askui.reporting import Reporter
from askui.utils.excel_utils import OfficeDocumentSource
from askui.utils.image_utils import ImageSource, image_to_base64
from askui.utils.pdf_utils import PdfSource
from askui.utils.source_utils import Source
Expand Down Expand Up @@ -188,8 +189,8 @@ def get(
response_schema: Type[ResponseSchema] | None,
model_choice: str,
) -> ResponseSchema | str:
if isinstance(source, PdfSource):
err_msg = f"PDF processing is not supported for the model {model_choice}"
if isinstance(source, (PdfSource, OfficeDocumentSource)):
err_msg = f"PDF and Excel processing is not supported for the model {model_choice}"
raise NotImplementedError(err_msg)
if response_schema is not None:
error_msg = f'Response schema is not supported for model "{model_choice}"'
Expand Down
33 changes: 33 additions & 0 deletions src/askui/utils/excel_utils.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,33 @@
from io import BytesIO
from pathlib import Path

from pydantic import ConfigDict, RootModel

from askui.utils.markdown_utils import convert_to_markdown


class OfficeDocumentSource(RootModel):
    """An office document (e.g. an Excel or Word file) readable as markdown.

    The wrapped value is either the raw bytes of the document or a
    `pathlib.Path` pointing at it. The document is converted to markdown via
    `convert_to_markdown` each time `reader` is accessed; nothing is cached.

    Attributes:
        root (bytes | Path): The underlying document bytes or file path.

    Args:
        root (bytes | Path): The document to load from.
    """

    model_config = ConfigDict(arbitrary_types_allowed=True)
    root: bytes | Path

    @property
    def reader(self) -> BytesIO:
        """Return an in-memory stream of the document's markdown, UTF-8 encoded."""
        # Conversion happens on every access, so callers should read once and
        # keep the result if they need it more than once.
        return BytesIO(convert_to_markdown(self.root).encode("utf-8"))


__all__ = [
"OfficeDocumentSource",
]
11 changes: 0 additions & 11 deletions src/askui/utils/image_utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -310,16 +310,6 @@ def scale_coordinates(
return result


Img = Union[str, Path, PILImage.Image]
"""Type of the input images for `askui.VisionAgent.get()`, `askui.VisionAgent.locate()`, etc.

Accepts:
- `PIL.Image.Image`
- Relative or absolute file path (`str` or `pathlib.Path`)
- Data URL (e.g., `"data:image/png;base64,..."`)
"""


class ImageSource(RootModel):
"""A class that represents an image source and provides methods to convert it to different formats.

Expand Down Expand Up @@ -375,5 +365,4 @@ def to_bytes(self) -> bytes:
"scale_coordinates",
"ScalingResults",
"ImageSource",
"Img",
]
24 changes: 24 additions & 0 deletions src/askui/utils/markdown_utils.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,24 @@
from io import BytesIO
from pathlib import Path
from typing import BinaryIO

from markitdown import MarkItDown

_MARKDOWN_CONVERTER = MarkItDown()


def convert_to_markdown(source: Path | bytes | BinaryIO) -> str:
    """Convert a document to markdown text with the shared module converter.

    Args:
        source (Path | bytes | BinaryIO): The document to convert, given as a
            file path, its raw bytes, or an open binary stream.

    Returns:
        str: The markdown text produced by the converter.
    """
    # Raw bytes are wrapped in an in-memory stream first; paths and streams
    # are handed to the converter unchanged.
    convertible = BytesIO(source) if isinstance(source, bytes) else source
    return _MARKDOWN_CONVERTER.convert(convertible).text_content
9 changes: 0 additions & 9 deletions src/askui/utils/pdf_utils.py
Original file line number Diff line number Diff line change
@@ -1,16 +1,8 @@
from io import BufferedReader, BytesIO
from pathlib import Path
from typing import Union

from pydantic import ConfigDict, RootModel

Pdf = Union[str, Path]
"""Type of the input PDFs for `askui.VisionAgent.get()`, etc.

Accepts:
- Relative or absolute file path (`str` or `pathlib.Path`)
"""


class PdfSource(RootModel):
"""A class that represents a PDF source.
Expand Down Expand Up @@ -38,5 +30,4 @@ def reader(self) -> BufferedReader | BytesIO:

__all__ = [
"PdfSource",
"Pdf",
]
Loading