askui · adi-wan-askui · Aug 20, 2025 · Aug 18, 2025 · Aug 18, 2025 · Aug 20, 2025
diff --git a/.cursorrules b/.cursorrules
@@ -40,15 +40,15 @@
   def locate(
       self,
       locator: str | Locator,
-      screenshot: Img | None = None,
+      screenshot: InputSource | None = None,
       model: ModelComposition | str | None = None,
   ) -> Point:
       """
       Find the position of the UI element identified by the `locator` using the `model`.
 
       Args:
           locator (str | Locator): The identifier or description of the element to locate.
-          screenshot (Img | None, optional): The screenshot to use for locating the
+          screenshot (InputSource | None, optional): The screenshot to use for locating the
               element. Can be a path to an image file, a PIL Image object or a data URL.
               If `None`, takes a screenshot of the currently selected screen.
           model (ModelComposition | str | None, optional): The composition or name of

diff --git a/README.md b/README.md
@@ -756,6 +756,17 @@ with VisionAgent() as agent:
   as we try different models under the hood with your schema to see which one works best.
 - PDF processing is only supported for Gemini models hosted on AskUI and for PDFs up to 20MB.
 
+### 📄 Document Processing with `markitdown`
+
+When extracting data from office documents such as Word (`.docx`) or Excel (`.xls`, `.xlsx`) files, we use the `markitdown` library to convert them into markdown. We chose `markitdown` over other tools for several reasons:
+
+- **LLM-Friendly Output:** The generated markdown is compact and token-efficient, which makes it well suited for subsequent processing with large language models.
+- **Includes Sheet Names:** When converting Excel files, the name of the sheet is included in the generated markdown, providing better context.
+- **Enhanced Image Descriptions:** It can use an OpenAI client (`llm_client` and `llm_model`) to generate more descriptive captions for images within documents.
+- **No Local Inference:** No model inference is performed on the client machine, which means no need to install and maintain heavy packages like `torch`.
+- **Optional Dependencies:** It allows for optional imports, meaning you only need to install the dependencies for the file types you are working with. This reduces the number of packages to manage.
+- **Microsoft Maintained:** Being maintained by Microsoft, it offers robust support for converting Office documents.
+
 ## What is AskUI Vision Agent?
 
 **AskUI Vision Agent** is a versatile AI powered framework that enables you to automate computer tasks in Python.

diff --git a/pdm.lock b/pdm.lock
diff --git a/pyproject.toml b/pyproject.toml
@@ -26,6 +26,7 @@ dependencies = [
     "protobuf>=6.31.1",
     "google-genai>=1.20.0",
     "filetype>=1.2.0",
+    "markitdown[xls,xlsx,docx]>=0.1.2",
 ]
 requires-python = ">=3.10"
 readme = "README.md"
@@ -224,4 +225,4 @@ pynput = [
 ]
 web = [
     "playwright>=1.41.0",
-]
+]
diff --git a/src/askui/__init__.py b/src/askui/__init__.py
@@ -37,7 +37,8 @@
 from .models.types.response_schemas import ResponseSchema, ResponseSchemaBase
 from .retry import ConfigurableRetry, Retry
 from .tools import ModifierKey, PcKey
-from .utils.image_utils import ImageSource, Img
+from .utils.image_utils import ImageSource
+from .utils.source_utils import InputSource
 
 try:
     from .android_agent import AndroidVisionAgent
@@ -67,7 +68,7 @@
     "GetModel",
     "ImageBlockParam",
     "ImageSource",
-    "Img",
+    "InputSource",
     "LocateModel",
     "Locator",
     "MessageParam",

diff --git a/src/askui/agent_base.py b/src/askui/agent_base.py
@@ -16,9 +16,8 @@
 from askui.models.shared.tools import Tool
 from askui.tools.agent_os import AgentOs
 from askui.tools.android.agent_os import AndroidAgentOs
-from askui.utils.image_utils import ImageSource, Img
-from askui.utils.pdf_utils import Pdf
-from askui.utils.source_utils import load_image_source, load_source
+from askui.utils.image_utils import ImageSource
+from askui.utils.source_utils import InputSource, load_image_source, load_source
 
 from .logger import configure_logging, logger
 from .models import ModelComposition
@@ -193,15 +192,15 @@ def get(
         query: Annotated[str, Field(min_length=1)],
         response_schema: None = None,
         model: str | None = None,
-        source: Optional[Img | Pdf] = None,
+        source: Optional[InputSource] = None,
     ) -> str: ...
     @overload
     def get(
         self,
         query: Annotated[str, Field(min_length=1)],
         response_schema: Type[ResponseSchema],
         model: str | None = None,
-        source: Optional[Img | Pdf] = None,
+        source: Optional[InputSource] = None,
     ) -> ResponseSchema: ...
 
     @telemetry.record_call(exclude={"query", "source", "response_schema"})
@@ -211,7 +210,7 @@ def get(
         query: Annotated[str, Field(min_length=1)],
         response_schema: Type[ResponseSchema] | None = None,
         model: str | None = None,
-        source: Optional[Img | Pdf] = None,
+        source: Optional[InputSource] = None,
     ) -> ResponseSchema | str:
         """
         Retrieves information from an image or PDF based on the provided `query`.
@@ -220,9 +219,10 @@ def get(
 
         Args:
             query (str): The query describing what information to retrieve.
-            source (Img | Pdf | None, optional): The source to extract information from.
-                Can be a path to a PDF file, a path to an image file, a PIL Image
-                object or a data URL. Defaults to a screenshot of the current screen.
+            source (InputSource | None, optional): The source to extract information
+                from. Can be a path to an image, PDF, or office document file,
+                a PIL Image object or a data URL. Defaults to a screenshot of the
+                current screen.
             response_schema (Type[ResponseSchema] | None, optional): A Pydantic model
                 class that defines the response schema. If not provided, returns a
                 string.
@@ -357,7 +357,7 @@ class LinkedListNode(ResponseSchemaBase):
     def _locate(
         self,
         locator: str | Locator,
-        screenshot: Optional[Img] = None,
+        screenshot: Optional[InputSource] = None,
         model: ModelComposition | str | None = None,
     ) -> PointList:
         def locate_with_screenshot() -> PointList:
@@ -380,7 +380,7 @@ def locate_with_screenshot() -> PointList:
     def locate(
         self,
         locator: str | Locator,
-        screenshot: Optional[Img] = None,
+        screenshot: Optional[InputSource] = None,
         model: ModelComposition | str | None = None,
     ) -> Point:
         """
@@ -389,9 +389,10 @@ def locate(
         Args:
             locator (str | Locator): The identifier or description of the element to
                 locate.
-            screenshot (Img | None, optional): The screenshot to use for locating the
-                element. Can be a path to an image file, a PIL Image object or a data
-                URL. If `None`, takes a screenshot of the currently selected display.
+            screenshot (InputSource | None, optional): The screenshot to use for
+                locating the element. Can be a path to an image file, a PIL Image object
+                or a data URL. If `None`, takes a screenshot of the currently
+                selected display.
             model (ModelComposition | str | None, optional): The composition or name
                 of the model(s) to be used for locating the element using the `locator`.
 
@@ -419,7 +420,7 @@ def locate(
     def locate_all(
         self,
         locator: str | Locator,
-        screenshot: Optional[Img] = None,
+        screenshot: Optional[InputSource] = None,
         model: ModelComposition | str | None = None,
     ) -> PointList:
         """
@@ -431,9 +432,10 @@ def locate_all(
         Args:
             locator (str | Locator): The identifier or description of the element to
                 locate.
-            screenshot (Img | None, optional): The screenshot to use for locating the
-                element. Can be a path to an image file, a PIL Image object or a data
-                URL. If `None`, takes a screenshot of the currently selected display.
+            screenshot (InputSource | None, optional): The screenshot to use for
+                locating the element. Can be a path to an image file, a PIL Image object
+                or a data URL. If `None`, takes a screenshot of the currently
+                selected display.
             model (ModelComposition | str | None, optional): The composition or name
                 of the model(s) to be used for locating the element using the `locator`.
 

diff --git a/src/askui/models/anthropic/messages_api.py b/src/askui/models/anthropic/messages_api.py
@@ -42,6 +42,7 @@
 from askui.models.shared.tools import ToolCollection
 from askui.models.types.response_schemas import ResponseSchema
 from askui.utils.dict_utils import IdentityDefaultDict
+from askui.utils.excel_utils import OfficeDocumentSource
 from askui.utils.image_utils import (
     ImageSource,
     image_to_base64,
@@ -242,8 +243,11 @@ def get(
         response_schema: Type[ResponseSchema] | None,
         model_choice: str,
     ) -> ResponseSchema | str:
-        if isinstance(source, PdfSource):
-            err_msg = f"PDF processing is not supported for the model {model_choice}"
+        if isinstance(source, (PdfSource, OfficeDocumentSource)):
+            err_msg = (
+                f"PDF or Office Document processing is not supported for the model: "
+                f"{model_choice}"
+            )
             raise NotImplementedError(err_msg)
         try:
             if response_schema is not None:

diff --git a/src/askui/models/askui/google_genai_api.py b/src/askui/models/askui/google_genai_api.py
@@ -21,6 +21,7 @@
 from askui.models.models import GetModel, ModelName
 from askui.models.shared.prompts import SYSTEM_PROMPT_GET
 from askui.models.types.response_schemas import ResponseSchema, to_response_schema
+from askui.utils.excel_utils import OfficeDocumentSource
 from askui.utils.http_utils import parse_retry_after_header
 from askui.utils.image_utils import ImageSource
 from askui.utils.source_utils import Source
@@ -185,6 +186,10 @@ def _create_genai_part_from_source(self, source: Source) -> genai_types.Part:
                 data=data,
                 mime_type="image/png",
             )
+        if isinstance(source, OfficeDocumentSource):
+            with source.reader as r:
+                data = r.read()
+                return genai_types.Part.from_text(text=data.decode())
         with source.reader as r:
             data = r.read()
             if len(data) > MAX_FILE_SIZE_BYTES:

diff --git a/src/askui/models/askui/inference_api.py b/src/askui/models/askui/inference_api.py
@@ -26,6 +26,7 @@
 from askui.models.shared.settings import MessageSettings
 from askui.models.shared.tools import ToolCollection
 from askui.models.types.response_schemas import ResponseSchema
+from askui.utils.excel_utils import OfficeDocumentSource
 from askui.utils.image_utils import ImageSource
 from askui.utils.pdf_utils import PdfSource
 from askui.utils.source_utils import Source
@@ -205,8 +206,11 @@ def get(
         response_schema: Type[ResponseSchema] | None,
         model_choice: str,
     ) -> ResponseSchema | str:
-        if isinstance(source, PdfSource):
-            err_msg = f"PDF processing is not supported for the model {model_choice}"
+        if isinstance(source, (PdfSource, OfficeDocumentSource)):
+            err_msg = (
+                f"PDF or Office Document processing is not supported for the model: "
+                f"{model_choice}"
+            )
             raise NotImplementedError(err_msg)
         json: dict[str, Any] = {
             "image": source.to_data_url(),

diff --git a/src/askui/models/openrouter/model.py b/src/askui/models/openrouter/model.py
@@ -10,6 +10,7 @@
 from askui.models.models import GetModel
 from askui.models.shared.prompts import SYSTEM_PROMPT_GET
 from askui.models.types.response_schemas import ResponseSchema, to_response_schema
+from askui.utils.excel_utils import OfficeDocumentSource
 from askui.utils.pdf_utils import PdfSource
 from askui.utils.source_utils import Source
 
@@ -174,8 +175,11 @@ def get(
         response_schema: Type[ResponseSchema] | None,
         model_choice: str,
     ) -> ResponseSchema | str:
-        if isinstance(source, PdfSource):
-            err_msg = f"PDF processing is not supported for the model {model_choice}"
+        if isinstance(source, (PdfSource, OfficeDocumentSource)):
+            err_msg = (
+                f"PDF or Office Document processing is not supported for the model: "
+                f"{model_choice}"
+            )
             raise NotImplementedError(err_msg)
         response = self._predict(
             image_url=source.to_data_url(),

diff --git a/src/askui/models/ui_tars_ep/ui_tars_api.py b/src/askui/models/ui_tars_ep/ui_tars_api.py
@@ -24,6 +24,7 @@
 from askui.models.shared.tools import Tool
 from askui.models.types.response_schemas import ResponseSchema
 from askui.reporting import Reporter
+from askui.utils.excel_utils import OfficeDocumentSource
 from askui.utils.image_utils import ImageSource, image_to_base64
 from askui.utils.pdf_utils import PdfSource
 from askui.utils.source_utils import Source
@@ -188,8 +189,8 @@ def get(
         response_schema: Type[ResponseSchema] | None,
         model_choice: str,
     ) -> ResponseSchema | str:
-        if isinstance(source, PdfSource):
-            err_msg = f"PDF processing is not supported for the model {model_choice}"
+        if isinstance(source, (PdfSource, OfficeDocumentSource)):
+            err_msg = f"PDF and Excel processing is not supported for the model {model_choice}"
             raise NotImplementedError(err_msg)
         if response_schema is not None:
             error_msg = f'Response schema is not supported for model "{model_choice}"'

diff --git a/src/askui/utils/excel_utils.py b/src/askui/utils/excel_utils.py
@@ -0,0 +1,33 @@
+from io import BytesIO
+from pathlib import Path
+
+from pydantic import ConfigDict, RootModel
+
+from askui.utils.markdown_utils import convert_to_markdown
+
+
+class OfficeDocumentSource(RootModel):
+    """Represents an office document (e.g. Word or Excel) readable as markdown.
+
+    The class can be initialized with:
+    - A file path (`pathlib.Path`) or the raw bytes of the document
+
+    Attributes:
+        root (bytes | Path): The underlying document bytes or file path.
+
+    Args:
+        root (bytes | Path): The office document source to load from.
+    """
+
+    model_config = ConfigDict(arbitrary_types_allowed=True)
+    root: bytes | Path
+
+    @property
+    def reader(self) -> BytesIO:
+        """Return the document's markdown, UTF-8 encoded, as an in-memory stream."""
+        return BytesIO(convert_to_markdown(self.root).encode())
+
+
+__all__ = [
+    "OfficeDocumentSource",
+]
diff --git a/src/askui/utils/image_utils.py b/src/askui/utils/image_utils.py
@@ -310,16 +310,6 @@ def scale_coordinates(
     return result
 
 
-Img = Union[str, Path, PILImage.Image]
-"""Type of the input images for `askui.VisionAgent.get()`, `askui.VisionAgent.locate()`, etc.
-
-Accepts:
-- `PIL.Image.Image`
-- Relative or absolute file path (`str` or `pathlib.Path`)
-- Data URL (e.g., `"data:image/png;base64,..."`)
-"""
-
-
 class ImageSource(RootModel):
     """A class that represents an image source and provides methods to convert it to different formats.
 
@@ -375,5 +365,4 @@ def to_bytes(self) -> bytes:
     "scale_coordinates",
     "ScalingResults",
     "ImageSource",
-    "Img",
 ]
diff --git a/src/askui/utils/markdown_utils.py b/src/askui/utils/markdown_utils.py
@@ -0,0 +1,24 @@
+from io import BytesIO
+from pathlib import Path
+from typing import BinaryIO
+
+from markitdown import MarkItDown
+
+_MARKDOWN_CONVERTER = MarkItDown()  # module-wide converter shared by all calls
+
+
+def convert_to_markdown(source: Path | bytes | BinaryIO) -> str:
+    """Convert `source` into markdown text using the shared converter.
+
+    Args:
+        source (Path | bytes | BinaryIO): The source to convert, given as a
+            file path, raw bytes or a binary stream.
+
+    Returns:
+        str: The markdown text produced for the source.
+    """
+    # Raw bytes are wrapped in an in-memory stream first; paths and binary
+    # streams are handed to the converter unchanged.
+    convertible = BytesIO(source) if isinstance(source, bytes) else source
+    conversion = _MARKDOWN_CONVERTER.convert(convertible)
+    return conversion.text_content
diff --git a/src/askui/utils/pdf_utils.py b/src/askui/utils/pdf_utils.py
@@ -1,16 +1,8 @@
 from io import BufferedReader, BytesIO
 from pathlib import Path
-from typing import Union
 
 from pydantic import ConfigDict, RootModel
 
-Pdf = Union[str, Path]
-"""Type of the input PDFs for `askui.VisionAgent.get()`, etc.
-
-Accepts:
-- Relative or absolute file path (`str` or `pathlib.Path`)
-"""
-
 
 class PdfSource(RootModel):
     """A class that represents a PDF source.
@@ -38,5 +30,4 @@ def reader(self) -> BufferedReader | BytesIO:
 
 __all__ = [
     "PdfSource",
-    "Pdf",
 ]