Skip to content

Commit aed0b15

Browse files
Merge pull request #115 from askui/CL-1574-lib-python-pdf-support-multi-page
feat(get): support PDF processing and refactor image handling
2 parents d6e0d8b + 252d0a4 commit aed0b15

File tree

26 files changed

+595
-231
lines changed

26 files changed

+595
-231
lines changed

README.md

Lines changed: 11 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -350,7 +350,7 @@ class MyGetAndLocateModel(GetModel, LocateModel):
350350
def get(
351351
self,
352352
query: str,
353-
image: ImageSource,
353+
source: Source,
354354
response_schema: Type[ResponseSchema] | None,
355355
model_choice: str,
356356
) -> ResponseSchema | str:
@@ -640,9 +640,9 @@ else:
640640
agent.click("Login")
641641
```
642642

643-
#### Using custom images
643+
#### Using custom images and PDFs
644644

645-
Instead of taking a screenshot, you can analyze specific images:
645+
Instead of taking a screenshot, you can analyze specific images or PDFs:
646646

647647
```python
648648
from PIL import Image
@@ -651,10 +651,13 @@ from askui import VisionAgent
651651
# From PIL Image
652652
with VisionAgent() as agent:
653653
image = Image.open("screenshot.png")
654-
result = agent.get("What's in this image?", image)
654+
result = agent.get("What's in this image?", source=image)
655655

656656
# From file path
657-
result = agent.get("What's in this image?", "screenshot.png")
657+
result = agent.get("What's in this image?", source="screenshot.png")
658+
659+
# From PDF
660+
result = agent.get("What is this PDF about?", source="document.pdf")
658661
```
659662

660663
#### Using response schemas
@@ -696,7 +699,7 @@ with VisionAgent() as agent:
696699
response = agent.get(
697700
"What is the current url shown in the url bar?",
698701
response_schema=UrlResponse,
699-
image="screenshot.png",
702+
source="screenshot.png",
700703
)
701704

702705
# Dump whole model
@@ -712,7 +715,7 @@ with VisionAgent() as agent:
712715
is_login_page = agent.get(
713716
"Is this a login page?",
714717
response_schema=bool,
715-
image=Image.open("screenshot.png"),
718+
source=Image.open("screenshot.png"),
716719
)
717720
print(is_login_page)
718721

@@ -751,6 +754,7 @@ with VisionAgent() as agent:
751754
**⚠️ Limitations:**
752755
- The support for response schemas varies among models. Currently, the `askui` model provides best support for response schemas
753756
as we try different models under the hood with your schema to see which one works best.
757+
- PDF processing is only supported for Gemini models hosted on AskUI and for PDFs up to 20MB.
754758

755759
## What is AskUI Vision Agent?
756760

pdm.lock

Lines changed: 11 additions & 1 deletion
Some generated files are not rendered by default. Learn more about customizing how changed files appear on GitHub.

pyproject.toml

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -25,6 +25,7 @@ dependencies = [
2525
"jsonref>=1.1.0",
2626
"protobuf>=6.31.1",
2727
"google-genai>=1.20.0",
28+
"filetype>=1.2.0",
2829
]
2930
requires-python = ">=3.10"
3031
readme = "README.md"

src/askui/agent_base.py

Lines changed: 46 additions & 15 deletions
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,7 @@
11
import time
22
import types
33
from abc import ABC
4+
from pathlib import Path
45
from typing import Annotated, Optional, Type, overload
56

67
from dotenv import load_dotenv
@@ -16,6 +17,8 @@
1617
from askui.tools.agent_os import AgentOs
1718
from askui.tools.android.agent_os import AndroidAgentOs
1819
from askui.utils.image_utils import ImageSource, Img
20+
from askui.utils.pdf_utils import Pdf
21+
from askui.utils.source_utils import load_image_source, load_source
1922

2023
from .logger import configure_logging, logger
2124
from .models import ModelComposition
@@ -189,46 +192,53 @@ def get(
189192
query: Annotated[str, Field(min_length=1)],
190193
response_schema: None = None,
191194
model: str | None = None,
192-
image: Optional[Img] = None,
195+
source: Optional[Img | Pdf] = None,
193196
) -> str: ...
194197
@overload
195198
def get(
196199
self,
197200
query: Annotated[str, Field(min_length=1)],
198201
response_schema: Type[ResponseSchema],
199202
model: str | None = None,
200-
image: Optional[Img] = None,
203+
source: Optional[Img | Pdf] = None,
201204
) -> ResponseSchema: ...
202205

203-
@telemetry.record_call(exclude={"query", "image", "response_schema"})
206+
@telemetry.record_call(exclude={"query", "source", "response_schema"})
204207
@validate_call(config=ConfigDict(arbitrary_types_allowed=True))
205208
def get(
206209
self,
207210
query: Annotated[str, Field(min_length=1)],
208211
response_schema: Type[ResponseSchema] | None = None,
209212
model: str | None = None,
210-
image: Optional[Img] = None,
213+
source: Optional[Img | Pdf] = None,
211214
) -> ResponseSchema | str:
212215
"""
213-
Retrieves information from an image (defaults to a screenshot of the current
214-
screen) based on the provided `query`.
216+
Retrieves information from an image or PDF based on the provided `query`.
217+
218+
If no `source` is provided, a screenshot of the current screen is taken.
215219
216220
Args:
217221
query (str): The query describing what information to retrieve.
218-
image (Img | None, optional): The image to extract information from.
219-
Defaults to a screenshot of the current screen. Can be a path to
220-
an image file, a PIL Image object or a data URL.
222+
source (Img | Pdf | None, optional): The source to extract information from.
223+
Can be a path to a PDF file, a path to an image file, a PIL Image
224+
object or a data URL. Defaults to a screenshot of the current screen.
221225
response_schema (Type[ResponseSchema] | None, optional): A Pydantic model
222226
class that defines the response schema. If not provided, returns a
223227
string.
224228
model (str | None, optional): The composition or name of the model(s) to
225229
be used for retrieving information from the screen or image using the
226230
`query`. Note: `response_schema` is not supported by all models.
231+
PDF processing is only supported for Gemini models hosted on AskUI.
227232
228233
Returns:
229234
ResponseSchema | str: The extracted information, `str` if no
230235
`response_schema` is provided.
231236
237+
Raises:
238+
NotImplementedError: If PDF processing is not supported for the selected
239+
model.
240+
ValueError: If the `source` is not a valid PDF or image.
241+
232242
Example:
233243
```python
234244
from askui import ResponseSchemaBase, VisionAgent
@@ -253,7 +263,7 @@ class LinkedListNode(ResponseSchemaBase):
253263
response = agent.get(
254264
"What is the current url shown in the url bar?",
255265
response_schema=UrlResponse,
256-
image="screenshot.png",
266+
source="screenshot.png",
257267
)
258268
# Dump whole model
259269
print(response.model_dump_json(indent=2))
@@ -268,7 +278,7 @@ class LinkedListNode(ResponseSchemaBase):
268278
is_login_page = agent.get(
269279
"Is this a login page?",
270280
response_schema=bool,
271-
image=Image.open("screenshot.png"),
281+
source=Image.open("screenshot.png"),
272282
)
273283
print(is_login_page)
274284
@@ -302,13 +312,34 @@ class LinkedListNode(ResponseSchemaBase):
302312
while current:
303313
print(current.value)
304314
current = current.next
315+
316+
# Get text from PDF
317+
text = agent.get(
318+
"Extract all text from the PDF",
319+
source="document.pdf",
320+
)
321+
print(text)
305322
```
306323
"""
307324
logger.debug("VisionAgent received instruction to get '%s'", query)
308-
_image = ImageSource(self._agent_os.screenshot() if image is None else image)
309-
self._reporter.add_message("User", f'get: "{query}"', image=_image.root)
325+
_source = (
326+
ImageSource(self._agent_os.screenshot())
327+
if source is None
328+
else load_source(source)
329+
)
330+
331+
# Prepare message content with file path if available
332+
user_message_content = f'get: "{query}"' + (
333+
f" from '{source}'" if isinstance(source, (str, Path)) else ""
334+
)
335+
336+
self._reporter.add_message(
337+
"User",
338+
user_message_content,
339+
image=_source.root if isinstance(_source, ImageSource) else None,
340+
)
310341
response = self._model_router.get(
311-
image=_image,
342+
source=_source,
312343
query=query,
313344
response_schema=response_schema,
314345
model_choice=model or self._model_choice["get"],
@@ -328,7 +359,7 @@ def _locate(
328359
model: ModelComposition | str | None = None,
329360
) -> Point:
330361
def locate_with_screenshot() -> Point:
331-
_screenshot = ImageSource(
362+
_screenshot = load_image_source(
332363
self._agent_os.screenshot() if screenshot is None else screenshot
333364
)
334365
return self._model_router.locate(

src/askui/locators/locators.py

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -7,7 +7,7 @@
77
from pydantic import ConfigDict, Field, validate_call
88

99
from askui.locators.relatable import Relatable
10-
from askui.utils.image_utils import ImageSource
10+
from askui.utils.source_utils import load_image_source
1111

1212
TextMatchType = Literal["similar", "exact", "contains", "regex"]
1313
"""The type of match to use.
@@ -303,7 +303,7 @@ def __init__(
303303
image_compare_format=image_compare_format,
304304
name=_generate_name() if name is None else name,
305305
)
306-
self._image = ImageSource(image)
306+
self._image = load_image_source(image)
307307

308308

309309
class AiElement(ImageBase):

src/askui/models/anthropic/messages_api.py

Lines changed: 7 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -48,6 +48,8 @@
4848
scale_coordinates,
4949
scale_image_to_fit,
5050
)
51+
from askui.utils.pdf_utils import PdfSource
52+
from askui.utils.source_utils import Source
5153

5254
from .utils import extract_click_coordinates
5355

@@ -234,16 +236,19 @@ def locate(
234236
def get(
235237
self,
236238
query: str,
237-
image: ImageSource,
239+
source: Source,
238240
response_schema: Type[ResponseSchema] | None,
239241
model_choice: str,
240242
) -> ResponseSchema | str:
243+
if isinstance(source, PdfSource):
244+
err_msg = f"PDF processing is not supported for the model {model_choice}"
245+
raise NotImplementedError(err_msg)
241246
try:
242247
if response_schema is not None:
243248
error_msg = "Response schema is not yet supported for Anthropic"
244249
raise NotImplementedError(error_msg)
245250
return self._inference(
246-
image=image,
251+
image=source,
247252
prompt=query,
248253
system=SYSTEM_PROMPT_GET,
249254
model_choice=model_choice,

src/askui/models/askui/get_model.py

Lines changed: 4 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -9,7 +9,7 @@
99
from askui.models.exceptions import QueryNoResponseError, QueryUnexpectedResponseError
1010
from askui.models.models import GetModel
1111
from askui.models.types.response_schemas import ResponseSchema
12-
from askui.utils.image_utils import ImageSource
12+
from askui.utils.source_utils import Source
1313

1414

1515
class AskUiGetModel(GetModel):
@@ -39,15 +39,15 @@ def __init__(
3939
def get(
4040
self,
4141
query: str,
42-
image: ImageSource,
42+
source: Source,
4343
response_schema: Type[ResponseSchema] | None,
4444
model_choice: str,
4545
) -> ResponseSchema | str:
4646
try:
4747
logger.debug("Attempting to use Google GenAI API")
4848
return self._google_genai_api.get(
4949
query=query,
50-
image=image,
50+
source=source,
5151
response_schema=response_schema,
5252
model_choice=model_choice,
5353
)
@@ -66,7 +66,7 @@ def get(
6666
)
6767
return self._inference_api.get(
6868
query=query,
69-
image=image,
69+
source=source,
7070
response_schema=response_schema,
7171
model_choice=model_choice,
7272
)

0 commit comments

Comments (0)