Skip to content

Commit aed0b15

Browse files
Merge pull request #115 from askui/CL-1574-lib-python-pdf-support-multi-page
feat(get): support PDF processing and refactor image handling
2 parents d6e0d8b + 252d0a4 commit aed0b15

File tree

26 files changed

+595
-231
lines changed

26 files changed

+595
-231
lines changed

README.md

Lines changed: 11 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -350,7 +350,7 @@ class MyGetAndLocateModel(GetModel, LocateModel):
350350
def get(
351351
self,
352352
query: str,
353-
image: ImageSource,
353+
source: Source,
354354
response_schema: Type[ResponseSchema] | None,
355355
model_choice: str,
356356
) -> ResponseSchema | str:
@@ -640,9 +640,9 @@ else:
640640
agent.click("Login")
641641
```
642642

643-
#### Using custom images
643+
#### Using custom images and PDFs
644644

645-
Instead of taking a screenshot, you can analyze specific images:
645+
Instead of taking a screenshot, you can analyze specific images or PDFs:
646646

647647
```python
648648
from PIL import Image
@@ -651,10 +651,13 @@ from askui import VisionAgent
651651
# From PIL Image
652652
with VisionAgent() as agent:
653653
image = Image.open("screenshot.png")
654-
result = agent.get("What's in this image?", image)
654+
result = agent.get("What's in this image?", source=image)
655655

656656
# From file path
657-
result = agent.get("What's in this image?", "screenshot.png")
657+
result = agent.get("What's in this image?", source="screenshot.png")
658+
659+
# From PDF
660+
result = agent.get("What is this PDF about?", source="document.pdf")
658661
```
659662

660663
#### Using response schemas
@@ -696,7 +699,7 @@ with VisionAgent() as agent:
696699
response = agent.get(
697700
"What is the current url shown in the url bar?",
698701
response_schema=UrlResponse,
699-
image="screenshot.png",
702+
source="screenshot.png",
700703
)
701704

702705
# Dump whole model
@@ -712,7 +715,7 @@ with VisionAgent() as agent:
712715
is_login_page = agent.get(
713716
"Is this a login page?",
714717
response_schema=bool,
715-
image=Image.open("screenshot.png"),
718+
source=Image.open("screenshot.png"),
716719
)
717720
print(is_login_page)
718721

@@ -751,6 +754,7 @@ with VisionAgent() as agent:
751754
**⚠️ Limitations:**
752755
- The support for response schemas varies among models. Currently, the `askui` model provides best support for response schemas
753756
as we try different models under the hood with your schema to see which one works best.
757+
- PDF processing is only supported for Gemini models hosted on AskUI and for PDFs up to 20MB.
754758

755759
## What is AskUI Vision Agent?
756760

pdm.lock

Lines changed: 11 additions & 1 deletion
Some generated files are not rendered by default. Learn more about customizing how changed files appear on GitHub.

pyproject.toml

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -25,6 +25,7 @@ dependencies = [
2525
"jsonref>=1.1.0",
2626
"protobuf>=6.31.1",
2727
"google-genai>=1.20.0",
28+
"filetype>=1.2.0",
2829
]
2930
requires-python = ">=3.10"
3031
readme = "README.md"

src/askui/agent_base.py

Lines changed: 46 additions & 15 deletions
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,7 @@
11
import time
22
import types
33
from abc import ABC
4+
from pathlib import Path
45
from typing import Annotated, Optional, Type, overload
56

67
from dotenv import load_dotenv
@@ -16,6 +17,8 @@
1617
from askui.tools.agent_os import AgentOs
1718
from askui.tools.android.agent_os import AndroidAgentOs
1819
from askui.utils.image_utils import ImageSource, Img
20+
from askui.utils.pdf_utils import Pdf
21+
from askui.utils.source_utils import load_image_source, load_source
1922

2023
from .logger import configure_logging, logger
2124
from .models import ModelComposition
@@ -189,46 +192,53 @@ def get(
189192
query: Annotated[str, Field(min_length=1)],
190193
response_schema: None = None,
191194
model: str | None = None,
192-
image: Optional[Img] = None,
195+
source: Optional[Img | Pdf] = None,
193196
) -> str: ...
194197
@overload
195198
def get(
196199
self,
197200
query: Annotated[str, Field(min_length=1)],
198201
response_schema: Type[ResponseSchema],
199202
model: str | None = None,
200-
image: Optional[Img] = None,
203+
source: Optional[Img | Pdf] = None,
201204
) -> ResponseSchema: ...
202205

203-
@telemetry.record_call(exclude={"query", "image", "response_schema"})
206+
@telemetry.record_call(exclude={"query", "source", "response_schema"})
204207
@validate_call(config=ConfigDict(arbitrary_types_allowed=True))
205208
def get(
206209
self,
207210
query: Annotated[str, Field(min_length=1)],
208211
response_schema: Type[ResponseSchema] | None = None,
209212
model: str | None = None,
210-
image: Optional[Img] = None,
213+
source: Optional[Img | Pdf] = None,
211214
) -> ResponseSchema | str:
212215
"""
213-
Retrieves information from an image (defaults to a screenshot of the current
214-
screen) based on the provided `query`.
216+
Retrieves information from an image or PDF based on the provided `query`.
217+
218+
If no `source` is provided, a screenshot of the current screen is taken.
215219
216220
Args:
217221
query (str): The query describing what information to retrieve.
218-
image (Img | None, optional): The image to extract information from.
219-
Defaults to a screenshot of the current screen. Can be a path to
220-
an image file, a PIL Image object or a data URL.
222+
source (Img | Pdf | None, optional): The source to extract information from.
223+
Can be a path to a PDF file, a path to an image file, a PIL Image
224+
object or a data URL. Defaults to a screenshot of the current screen.
221225
response_schema (Type[ResponseSchema] | None, optional): A Pydantic model
222226
class that defines the response schema. If not provided, returns a
223227
string.
224228
model (str | None, optional): The composition or name of the model(s) to
225229
be used for retrieving information from the screen or image using the
226230
`query`. Note: `response_schema` is not supported by all models.
231+
PDF processing is only supported for Gemini models hosted on AskUI.
227232
228233
Returns:
229234
ResponseSchema | str: The extracted information, `str` if no
230235
`response_schema` is provided.
231236
237+
Raises:
238+
NotImplementedError: If PDF processing is not supported for the selected
239+
model.
240+
ValueError: If the `source` is not a valid PDF or image.
241+
232242
Example:
233243
```python
234244
from askui import ResponseSchemaBase, VisionAgent
@@ -253,7 +263,7 @@ class LinkedListNode(ResponseSchemaBase):
253263
response = agent.get(
254264
"What is the current url shown in the url bar?",
255265
response_schema=UrlResponse,
256-
image="screenshot.png",
266+
source="screenshot.png",
257267
)
258268
# Dump whole model
259269
print(response.model_dump_json(indent=2))
@@ -268,7 +278,7 @@ class LinkedListNode(ResponseSchemaBase):
268278
is_login_page = agent.get(
269279
"Is this a login page?",
270280
response_schema=bool,
271-
image=Image.open("screenshot.png"),
281+
source=Image.open("screenshot.png"),
272282
)
273283
print(is_login_page)
274284
@@ -302,13 +312,34 @@ class LinkedListNode(ResponseSchemaBase):
302312
while current:
303313
print(current.value)
304314
current = current.next
315+
316+
# Get text from PDF
317+
text = agent.get(
318+
"Extract all text from the PDF",
319+
source="document.pdf",
320+
)
321+
print(text)
305322
```
306323
"""
307324
logger.debug("VisionAgent received instruction to get '%s'", query)
308-
_image = ImageSource(self._agent_os.screenshot() if image is None else image)
309-
self._reporter.add_message("User", f'get: "{query}"', image=_image.root)
325+
_source = (
326+
ImageSource(self._agent_os.screenshot())
327+
if source is None
328+
else load_source(source)
329+
)
330+
331+
# Prepare message content with file path if available
332+
user_message_content = f'get: "{query}"' + (
333+
f" from '{source}'" if isinstance(source, (str, Path)) else ""
334+
)
335+
336+
self._reporter.add_message(
337+
"User",
338+
user_message_content,
339+
image=_source.root if isinstance(_source, ImageSource) else None,
340+
)
310341
response = self._model_router.get(
311-
image=_image,
342+
source=_source,
312343
query=query,
313344
response_schema=response_schema,
314345
model_choice=model or self._model_choice["get"],
@@ -328,7 +359,7 @@ def _locate(
328359
model: ModelComposition | str | None = None,
329360
) -> Point:
330361
def locate_with_screenshot() -> Point:
331-
_screenshot = ImageSource(
362+
_screenshot = load_image_source(
332363
self._agent_os.screenshot() if screenshot is None else screenshot
333364
)
334365
return self._model_router.locate(

src/askui/locators/locators.py

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -7,7 +7,7 @@
77
from pydantic import ConfigDict, Field, validate_call
88

99
from askui.locators.relatable import Relatable
10-
from askui.utils.image_utils import ImageSource
10+
from askui.utils.source_utils import load_image_source
1111

1212
TextMatchType = Literal["similar", "exact", "contains", "regex"]
1313
"""The type of match to use.
@@ -303,7 +303,7 @@ def __init__(
303303
image_compare_format=image_compare_format,
304304
name=_generate_name() if name is None else name,
305305
)
306-
self._image = ImageSource(image)
306+
self._image = load_image_source(image)
307307

308308

309309
class AiElement(ImageBase):

src/askui/models/anthropic/messages_api.py

Lines changed: 7 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -48,6 +48,8 @@
4848
scale_coordinates,
4949
scale_image_to_fit,
5050
)
51+
from askui.utils.pdf_utils import PdfSource
52+
from askui.utils.source_utils import Source
5153

5254
from .utils import extract_click_coordinates
5355

@@ -234,16 +236,19 @@ def locate(
234236
def get(
235237
self,
236238
query: str,
237-
image: ImageSource,
239+
source: Source,
238240
response_schema: Type[ResponseSchema] | None,
239241
model_choice: str,
240242
) -> ResponseSchema | str:
243+
if isinstance(source, PdfSource):
244+
err_msg = f"PDF processing is not supported for the model {model_choice}"
245+
raise NotImplementedError(err_msg)
241246
try:
242247
if response_schema is not None:
243248
error_msg = "Response schema is not yet supported for Anthropic"
244249
raise NotImplementedError(error_msg)
245250
return self._inference(
246-
image=image,
251+
image=source,
247252
prompt=query,
248253
system=SYSTEM_PROMPT_GET,
249254
model_choice=model_choice,

src/askui/models/askui/get_model.py

Lines changed: 4 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -9,7 +9,7 @@
99
from askui.models.exceptions import QueryNoResponseError, QueryUnexpectedResponseError
1010
from askui.models.models import GetModel
1111
from askui.models.types.response_schemas import ResponseSchema
12-
from askui.utils.image_utils import ImageSource
12+
from askui.utils.source_utils import Source
1313

1414

1515
class AskUiGetModel(GetModel):
@@ -39,15 +39,15 @@ def __init__(
3939
def get(
4040
self,
4141
query: str,
42-
image: ImageSource,
42+
source: Source,
4343
response_schema: Type[ResponseSchema] | None,
4444
model_choice: str,
4545
) -> ResponseSchema | str:
4646
try:
4747
logger.debug("Attempting to use Google GenAI API")
4848
return self._google_genai_api.get(
4949
query=query,
50-
image=image,
50+
source=source,
5151
response_schema=response_schema,
5252
model_choice=model_choice,
5353
)
@@ -66,7 +66,7 @@ def get(
6666
)
6767
return self._inference_api.get(
6868
query=query,
69-
image=image,
69+
source=source,
7070
response_schema=response_schema,
7171
model_choice=model_choice,
7272
)

0 commit comments

Comments (0)