11import time
22import types
33from abc import ABC
4+ from pathlib import Path
45from typing import Annotated , Optional , Type , overload
56
67from dotenv import load_dotenv
1617from askui .tools .agent_os import AgentOs
1718from askui .tools .android .agent_os import AndroidAgentOs
1819from askui .utils .image_utils import ImageSource , Img
20+ from askui .utils .pdf_utils import Pdf
21+ from askui .utils .source_utils import load_image_source , load_source
1922
2023from .logger import configure_logging , logger
2124from .models import ModelComposition
@@ -189,46 +192,53 @@ def get(
189192 query : Annotated [str , Field (min_length = 1 )],
190193 response_schema : None = None ,
191194 model : str | None = None ,
192- image : Optional [Img ] = None ,
195+ source : Optional [Img | Pdf ] = None ,
193196 ) -> str : ...
194197 @overload
195198 def get (
196199 self ,
197200 query : Annotated [str , Field (min_length = 1 )],
198201 response_schema : Type [ResponseSchema ],
199202 model : str | None = None ,
200- image : Optional [Img ] = None ,
203+ source : Optional [Img | Pdf ] = None ,
201204 ) -> ResponseSchema : ...
202205
203- @telemetry .record_call (exclude = {"query" , "image " , "response_schema" })
206+ @telemetry .record_call (exclude = {"query" , "source " , "response_schema" })
204207 @validate_call (config = ConfigDict (arbitrary_types_allowed = True ))
205208 def get (
206209 self ,
207210 query : Annotated [str , Field (min_length = 1 )],
208211 response_schema : Type [ResponseSchema ] | None = None ,
209212 model : str | None = None ,
210- image : Optional [Img ] = None ,
213+ source : Optional [Img | Pdf ] = None ,
211214 ) -> ResponseSchema | str :
212215 """
213- Retrieves information from an image (defaults to a screenshot of the current
214- screen) based on the provided `query`.
216+ Retrieves information from an image or PDF based on the provided `query`.
217+
218+ If no `source` is provided, a screenshot of the current screen is taken.
215219
216220 Args:
217221 query (str): The query describing what information to retrieve.
218- image (Img | None, optional): The image to extract information from.
219- Defaults to a screenshot of the current screen. Can be a path to
220- an image file, a PIL Image object or a data URL .
222+ source (Img | Pdf | None, optional): The source to extract information from.
223+ Can be a path to a PDF file, a path to an image file, a PIL Image
224+ object or a data URL. Defaults to a screenshot of the current screen .
221225 response_schema (Type[ResponseSchema] | None, optional): A Pydantic model
222226 class that defines the response schema. If not provided, returns a
223227 string.
224228 model (str | None, optional): The composition or name of the model(s) to
225229 be used for retrieving information from the screen or image using the
226230 `query`. Note: `response_schema` is not supported by all models.
231+ PDF processing is only supported for Gemini models hosted on AskUI.
227232
228233 Returns:
229234 ResponseSchema | str: The extracted information, `str` if no
230235 `response_schema` is provided.
231236
237+ Raises:
238+ NotImplementedError: If PDF processing is not supported for the selected
239+ model.
240+ ValueError: If the `source` is not a valid PDF or image.
241+
232242 Example:
233243 ```python
234244 from askui import ResponseSchemaBase, VisionAgent
@@ -253,7 +263,7 @@ class LinkedListNode(ResponseSchemaBase):
253263 response = agent.get(
254264 "What is the current url shown in the url bar?",
255265 response_schema=UrlResponse,
256- image ="screenshot.png",
266+ source ="screenshot.png",
257267 )
258268 # Dump whole model
259269 print(response.model_dump_json(indent=2))
@@ -268,7 +278,7 @@ class LinkedListNode(ResponseSchemaBase):
268278 is_login_page = agent.get(
269279 "Is this a login page?",
270280 response_schema=bool,
271- image =Image.open("screenshot.png"),
281+ source =Image.open("screenshot.png"),
272282 )
273283 print(is_login_page)
274284
@@ -302,13 +312,34 @@ class LinkedListNode(ResponseSchemaBase):
302312 while current:
303313 print(current.value)
304314 current = current.next
315+
316+ # Get text from PDF
317+ text = agent.get(
318+ "Extract all text from the PDF",
319+ source="document.pdf",
320+ )
321+ print(text)
305322 ```
306323 """
307324 logger .debug ("VisionAgent received instruction to get '%s'" , query )
308- _image = ImageSource (self ._agent_os .screenshot () if image is None else image )
309- self ._reporter .add_message ("User" , f'get: "{ query } "' , image = _image .root )
325+ _source = (
326+ ImageSource (self ._agent_os .screenshot ())
327+ if source is None
328+ else load_source (source )
329+ )
330+
331+ # Prepare message content with file path if available
332+ user_message_content = f'get: "{ query } "' + (
333+ f" from '{ source } '" if isinstance (source , (str , Path )) else ""
334+ )
335+
336+ self ._reporter .add_message (
337+ "User" ,
338+ user_message_content ,
339+ image = _source .root if isinstance (_source , ImageSource ) else None ,
340+ )
310341 response = self ._model_router .get (
311- image = _image ,
342+ source = _source ,
312343 query = query ,
313344 response_schema = response_schema ,
314345 model_choice = model or self ._model_choice ["get" ],
@@ -328,7 +359,7 @@ def _locate(
328359 model : ModelComposition | str | None = None ,
329360 ) -> Point :
330361 def locate_with_screenshot () -> Point :
331- _screenshot = ImageSource (
362+ _screenshot = load_image_source (
332363 self ._agent_os .screenshot () if screenshot is None else screenshot
333364 )
334365 return self ._model_router .locate (
0 commit comments