diff --git a/README.md b/README.md index c698bb79..299a6ba8 100644 --- a/README.md +++ b/README.md @@ -349,7 +349,7 @@ class MyGetAndLocateModel(GetModel, LocateModel): def get( self, query: str, - image: ImageSource, + source: Source, response_schema: Type[ResponseSchema] | None, model_choice: str, ) -> ResponseSchema | str: @@ -639,9 +639,9 @@ else: agent.click("Login") ``` -#### Using custom images +#### Using custom images and PDFs -Instead of taking a screenshot, you can analyze specific images: +Instead of taking a screenshot, you can analyze specific images or PDFs: ```python from PIL import Image @@ -650,10 +650,13 @@ from askui import VisionAgent # From PIL Image with VisionAgent() as agent: image = Image.open("screenshot.png") - result = agent.get("What's in this image?", image) + result = agent.get("What's in this image?", source=image) # From file path - result = agent.get("What's in this image?", "screenshot.png") + result = agent.get("What's in this image?", source="screenshot.png") + + # From PDF + result = agent.get("What is this PDF about?", source="document.pdf") ``` #### Using response schemas @@ -695,7 +698,7 @@ with VisionAgent() as agent: response = agent.get( "What is the current url shown in the url bar?", response_schema=UrlResponse, - image="screenshot.png", + source="screenshot.png", ) # Dump whole model @@ -711,7 +714,7 @@ with VisionAgent() as agent: is_login_page = agent.get( "Is this a login page?", response_schema=bool, - image=Image.open("screenshot.png"), + source=Image.open("screenshot.png"), ) print(is_login_page) @@ -750,6 +753,7 @@ with VisionAgent() as agent: **⚠️ Limitations:** - The support for response schemas varies among models. Currently, the `askui` model provides best support for response schemas as we try different models under the hood with your schema to see which one works best. +- PDF processing is only supported for Gemini models hosted on AskUI and for PDFs up to 20MB. ## What is AskUI Vision Agent? 
diff --git a/pdm.lock b/pdm.lock index 333efa4e..5952bf23 100644 --- a/pdm.lock +++ b/pdm.lock @@ -5,7 +5,7 @@ groups = ["default", "all", "android", "chat", "dev", "mcp", "pynput", "test", "web"] strategy = ["inherit_metadata"] lock_version = "4.5.0" -content_hash = "sha256:5033ba9a7c1da164c790105efce8b762b5ccc39bf8b872bca8451916baab0a8e" +content_hash = "sha256:3fe75d92bfe97e6b257a5591a7d6ad8355209fe259fc580cd8262c982f3485e3" [[metadata.targets]] requires_python = ">=3.10" @@ -679,6 +679,16 @@ files = [ {file = "filelock-3.18.0.tar.gz", hash = "sha256:adbc88eabb99d2fec8c9c1b229b171f18afa655400173ddc653d5d01501fb9f2"}, ] +[[package]] +name = "filetype" +version = "1.2.0" +summary = "Infer file type and MIME type of any file/buffer. No external dependencies." +groups = ["default"] +files = [ + {file = "filetype-1.2.0-py2.py3-none-any.whl", hash = "sha256:7ce71b6880181241cf7ac8697a2f1eb6a8bd9b429f7ad6d27b8db9ba5f1c2d25"}, + {file = "filetype-1.2.0.tar.gz", hash = "sha256:66b56cd6474bf41d8c54660347d37afcc3f7d1970648de365c102ef77548aadb"}, +] + [[package]] name = "fsspec" version = "2025.3.2" diff --git a/pyproject.toml b/pyproject.toml index 7f3efe83..823e559a 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -25,6 +25,7 @@ dependencies = [ "jsonref>=1.1.0", "protobuf>=6.31.1", "google-genai>=1.20.0", + "filetype>=1.2.0", ] requires-python = ">=3.10" readme = "README.md" diff --git a/src/askui/agent_base.py b/src/askui/agent_base.py index a98a4679..88d8e517 100644 --- a/src/askui/agent_base.py +++ b/src/askui/agent_base.py @@ -1,6 +1,7 @@ import time import types from abc import ABC +from pathlib import Path from typing import Annotated, Optional, Type, overload from dotenv import load_dotenv @@ -16,6 +17,8 @@ from askui.tools.agent_os import AgentOs from askui.tools.android.agent_os import AndroidAgentOs from askui.utils.image_utils import ImageSource, Img +from askui.utils.pdf_utils import Pdf +from askui.utils.source_utils import load_image_source, load_source 
from .logger import configure_logging, logger from .models import ModelComposition @@ -189,7 +192,7 @@ def get( query: Annotated[str, Field(min_length=1)], response_schema: None = None, model: str | None = None, - image: Optional[Img] = None, + source: Optional[Img | Pdf] = None, ) -> str: ... @overload def get( @@ -197,38 +200,45 @@ def get( query: Annotated[str, Field(min_length=1)], response_schema: Type[ResponseSchema], model: str | None = None, - image: Optional[Img] = None, + source: Optional[Img | Pdf] = None, ) -> ResponseSchema: ... - @telemetry.record_call(exclude={"query", "image", "response_schema"}) + @telemetry.record_call(exclude={"query", "source", "response_schema"}) @validate_call(config=ConfigDict(arbitrary_types_allowed=True)) def get( self, query: Annotated[str, Field(min_length=1)], response_schema: Type[ResponseSchema] | None = None, model: str | None = None, - image: Optional[Img] = None, + source: Optional[Img | Pdf] = None, ) -> ResponseSchema | str: """ - Retrieves information from an image (defaults to a screenshot of the current - screen) based on the provided `query`. + Retrieves information from an image or PDF based on the provided `query`. + + If no `source` is provided, a screenshot of the current screen is taken. Args: query (str): The query describing what information to retrieve. - image (Img | None, optional): The image to extract information from. - Defaults to a screenshot of the current screen. Can be a path to - an image file, a PIL Image object or a data URL. + source (Img | Pdf | None, optional): The source to extract information from. + Can be a path to a PDF file, a path to an image file, a PIL Image + object or a data URL. Defaults to a screenshot of the current screen. response_schema (Type[ResponseSchema] | None, optional): A Pydantic model class that defines the response schema. If not provided, returns a string. 
model (str | None, optional): The composition or name of the model(s) to be used for retrieving information from the screen or image using the `query`. Note: `response_schema` is not supported by all models. + PDF processing is only supported for Gemini models hosted on AskUI. Returns: ResponseSchema | str: The extracted information, `str` if no `response_schema` is provided. + Raises: + NotImplementedError: If PDF processing is not supported for the selected + model. + ValueError: If the `source` is not a valid PDF or image. + Example: ```python from askui import ResponseSchemaBase, VisionAgent @@ -253,7 +263,7 @@ class LinkedListNode(ResponseSchemaBase): response = agent.get( "What is the current url shown in the url bar?", response_schema=UrlResponse, - image="screenshot.png", + source="screenshot.png", ) # Dump whole model print(response.model_dump_json(indent=2)) @@ -268,7 +278,7 @@ class LinkedListNode(ResponseSchemaBase): is_login_page = agent.get( "Is this a login page?", response_schema=bool, - image=Image.open("screenshot.png"), + source=Image.open("screenshot.png"), ) print(is_login_page) @@ -302,13 +312,34 @@ class LinkedListNode(ResponseSchemaBase): while current: print(current.value) current = current.next + + # Get text from PDF + text = agent.get( + "Extract all text from the PDF", + source="document.pdf", + ) + print(text) ``` """ logger.debug("VisionAgent received instruction to get '%s'", query) - _image = ImageSource(self._agent_os.screenshot() if image is None else image) - self._reporter.add_message("User", f'get: "{query}"', image=_image.root) + _source = ( + ImageSource(self._agent_os.screenshot()) + if source is None + else load_source(source) + ) + + # Prepare message content with file path if available + user_message_content = f'get: "{query}"' + ( + f" from '{source}'" if isinstance(source, (str, Path)) else "" + ) + + self._reporter.add_message( + "User", + user_message_content, + image=_source.root if isinstance(_source, ImageSource) 
else None, + ) response = self._model_router.get( - image=_image, + source=_source, query=query, response_schema=response_schema, model_choice=model or self._model_choice["get"], @@ -328,7 +359,7 @@ def _locate( model: ModelComposition | str | None = None, ) -> Point: def locate_with_screenshot() -> Point: - _screenshot = ImageSource( + _screenshot = load_image_source( self._agent_os.screenshot() if screenshot is None else screenshot ) return self._model_router.locate( diff --git a/src/askui/locators/locators.py b/src/askui/locators/locators.py index 652c08b0..bc552a1c 100644 --- a/src/askui/locators/locators.py +++ b/src/askui/locators/locators.py @@ -7,7 +7,7 @@ from pydantic import ConfigDict, Field, validate_call from askui.locators.relatable import Relatable -from askui.utils.image_utils import ImageSource +from askui.utils.source_utils import load_image_source TextMatchType = Literal["similar", "exact", "contains", "regex"] """The type of match to use. @@ -303,7 +303,7 @@ def __init__( image_compare_format=image_compare_format, name=_generate_name() if name is None else name, ) - self._image = ImageSource(image) + self._image = load_image_source(image) class AiElement(ImageBase): diff --git a/src/askui/models/anthropic/messages_api.py b/src/askui/models/anthropic/messages_api.py index bfb8704a..dfde5dfc 100644 --- a/src/askui/models/anthropic/messages_api.py +++ b/src/askui/models/anthropic/messages_api.py @@ -48,6 +48,8 @@ scale_coordinates, scale_image_to_fit, ) +from askui.utils.pdf_utils import PdfSource +from askui.utils.source_utils import Source from .utils import extract_click_coordinates @@ -238,16 +240,19 @@ def locate( def get( self, query: str, - image: ImageSource, + source: Source, response_schema: Type[ResponseSchema] | None, model_choice: str, ) -> ResponseSchema | str: + if isinstance(source, PdfSource): + err_msg = f"PDF processing is not supported for the model {model_choice}" + raise NotImplementedError(err_msg) try: if response_schema is 
not None: error_msg = "Response schema is not yet supported for Anthropic" raise NotImplementedError(error_msg) return self._inference( - image=image, + image=source, prompt=query, system=SYSTEM_PROMPT_GET, model_choice=model_choice, diff --git a/src/askui/models/askui/get_model.py b/src/askui/models/askui/get_model.py index ef753caf..9534cf1a 100644 --- a/src/askui/models/askui/get_model.py +++ b/src/askui/models/askui/get_model.py @@ -9,7 +9,7 @@ from askui.models.exceptions import QueryNoResponseError, QueryUnexpectedResponseError from askui.models.models import GetModel from askui.models.types.response_schemas import ResponseSchema -from askui.utils.image_utils import ImageSource +from askui.utils.source_utils import Source class AskUiGetModel(GetModel): @@ -39,7 +39,7 @@ def __init__( def get( self, query: str, - image: ImageSource, + source: Source, response_schema: Type[ResponseSchema] | None, model_choice: str, ) -> ResponseSchema | str: @@ -47,7 +47,7 @@ def get( logger.debug("Attempting to use Google GenAI API") return self._google_genai_api.get( query=query, - image=image, + source=source, response_schema=response_schema, model_choice=model_choice, ) @@ -66,7 +66,7 @@ def get( ) return self._inference_api.get( query=query, - image=image, + source=source, response_schema=response_schema, model_choice=model_choice, ) diff --git a/src/askui/models/askui/google_genai_api.py b/src/askui/models/askui/google_genai_api.py index 49b72d03..8d691023 100644 --- a/src/askui/models/askui/google_genai_api.py +++ b/src/askui/models/askui/google_genai_api.py @@ -23,9 +23,11 @@ from askui.models.types.response_schemas import ResponseSchema, to_response_schema from askui.utils.http_utils import parse_retry_after_header from askui.utils.image_utils import ImageSource +from askui.utils.source_utils import Source ASKUI_MODEL_CHOICE_PREFIX = "askui/" ASKUI_MODEL_CHOICE_PREFIX_LEN = len(ASKUI_MODEL_CHOICE_PREFIX) +MAX_FILE_SIZE_BYTES = 20 * 1024 * 1024 class 
_wait_for_retry_after_header(wait_base):
@@ -112,7 +114,7 @@ def __init__(self, settings: AskUiInferenceApiSettings | None = None) -> None:
     def get(
         self,
         query: str,
-        image: ImageSource,
+        source: Source,
         response_schema: Type[ResponseSchema] | None,
         model_choice: str,
     ) -> ResponseSchema | str:
@@ -120,12 +122,10 @@ def get(
             _response_schema = to_response_schema(response_schema)
             json_schema = _response_schema.model_json_schema()
             logger.debug(f"json_schema:\n{json_lib.dumps(json_schema)}")
+            part = self._create_genai_part_from_source(source)
             content = genai_types.Content(
                 parts=[
-                    genai_types.Part.from_bytes(
-                        data=image.to_bytes(),
-                        mime_type="image/png",
-                    ),
+                    part,
                     genai_types.Part.from_text(text=query),
                 ],
                 role="user",
@@ -158,3 +158,43 @@ def get(
                 "Recursive response schemas are not supported by AskUiGoogleGenAiApi"
             )
             raise NotImplementedError(error_message) from e
+
+    def _create_genai_part_from_source(self, source: Source) -> genai_types.Part:
+        """Create a genai Part from a Source object.
+
+        An `ImageSource` is sent with MIME type `image/png`; any other source is
+        read through its `reader` and sent with MIME type `application/pdf`.
+
+        Args:
+            source (Source): The source object to convert.
+
+        Returns:
+            genai_types.Part: The genai Part object.
+
+        Raises:
+            ValueError: If the source data exceeds the size limit.
+        """
+        if isinstance(source, ImageSource):
+            data = source.to_bytes()
+            if len(data) > MAX_FILE_SIZE_BYTES:
+                _err_msg = (
+                    f"Image file size exceeds the limit of {MAX_FILE_SIZE_BYTES} bytes."
+                )
+                raise ValueError(_err_msg)
+            return genai_types.Part.from_bytes(
+                data=data,
+                mime_type="image/png",
+            )
+        with source.reader as r:
+            # Read at most one byte past the limit: that is enough to detect an
+            # oversized PDF without loading the whole file into memory.
+            data = r.read(MAX_FILE_SIZE_BYTES + 1)
+            if len(data) > MAX_FILE_SIZE_BYTES:
+                _err_msg = (
+                    f"PDF file size exceeds the limit of {MAX_FILE_SIZE_BYTES} bytes."
+                )
+                raise ValueError(_err_msg)
+            return genai_types.Part.from_bytes(
+                data=data,
+                mime_type="application/pdf",
+            )
diff --git a/src/askui/models/askui/inference_api.py b/src/askui/models/askui/inference_api.py
index d40c5150..b30d40cb 100644
--- a/src/askui/models/askui/inference_api.py
+++ b/src/askui/models/askui/inference_api.py
@@ -27,6 +27,8 @@
 from askui.models.shared.tools import ToolCollection
 from askui.models.types.response_schemas import ResponseSchema
 from askui.utils.image_utils import ImageSource
+from askui.utils.pdf_utils import PdfSource
+from askui.utils.source_utils import Source
 
 from ..types.response_schemas import to_response_schema
 
@@ -196,12 +198,15 @@ def locate(
     def get(
         self,
         query: str,
-        image: ImageSource,
+        source: Source,
         response_schema: Type[ResponseSchema] | None,
         model_choice: str,
     ) -> ResponseSchema | str:
+        if isinstance(source, PdfSource):
+            err_msg = f"PDF processing is not supported for the model {model_choice}"
+            raise NotImplementedError(err_msg)
         json: dict[str, Any] = {
-            "image": image.to_data_url(),
+            "image": source.to_data_url(),
             "prompt": query,
         }
         _response_schema = to_response_schema(response_schema)
diff --git a/src/askui/models/model_router.py b/src/askui/models/model_router.py
index 457ec10d..394e7a3e 100644
--- a/src/askui/models/model_router.py
+++ b/src/askui/models/model_router.py
@@ -32,6 +32,7 @@
 from askui.models.types.response_schemas import ResponseSchema
 from askui.reporting import NULL_REPORTER, CompositeReporter, Reporter
 from askui.utils.image_utils import ImageSource
+from askui.utils.source_utils import Source
 
 from ..logger import logger
 from .askui.inference_api import AskUiInferenceApi
@@ -199,13 +200,13 @@ def act(
     def get(
         self,
         query: str,
-        image: ImageSource,
+        source: Source,
         model_choice: str,
         response_schema: Type[ResponseSchema] | None = None,
     ) -> ResponseSchema | str:
         m = self._get_model(model_choice, "get")
         logger.debug(f'Routing "get" to model "{model_choice}"')
-        return
m.get(query, image, response_schema, model_choice) + return m.get(query, source, response_schema, model_choice) def locate( self, diff --git a/src/askui/models/models.py b/src/askui/models/models.py index 5bd65ddc..22420da9 100644 --- a/src/askui/models/models.py +++ b/src/askui/models/models.py @@ -14,6 +14,7 @@ from askui.models.shared.tools import Tool from askui.models.types.response_schemas import ResponseSchema from askui.utils.image_utils import ImageSource +from askui.utils.source_utils import Source class ModelName(str, Enum): @@ -231,23 +232,22 @@ def act( class GetModel(abc.ABC): - """Abstract base class for models that can extract information from images. + """Abstract base class for models that can extract information from images and PDFs. Models implementing this interface can be used with the `get()` method of `VisionAgent` - to extract information from screenshots or other images. These models analyze visual - content and return structured or unstructured information based on queries. - + to extract information from screenshots, other images or PDFs. These models analyze + visual content and return structured or unstructured information based on queries. Example: ```python - from askui import GetModel, VisionAgent, ResponseSchema, ImageSource + from askui import GetModel, VisionAgent, ResponseSchema, Source from typing import Type class MyGetModel(GetModel): def get( self, query: str, - image: ImageSource, + source: Source, response_schema: Type[ResponseSchema] | None, model_choice: str, ) -> ResponseSchema | str: @@ -263,15 +263,14 @@ def get( def get( self, query: str, - image: ImageSource, + source: Source, response_schema: Type[ResponseSchema] | None, model_choice: str, ) -> ResponseSchema | str: - """Extract information from an image based on a query. - + """Extract information from a source based on a query. 
Args: query (str): A description of what information to extract - image (ImageSource): The image to analyze (screenshot or provided image) + source (Source): The source to analyze (screenshot, image or PDF) response_schema (Type[ResponseSchema] | None): Optional Pydantic model class defining the expected response structure model_choice (str): The name of the model being used (useful for models that diff --git a/src/askui/models/openrouter/model.py b/src/askui/models/openrouter/model.py index 3ece12f4..a5a6882c 100644 --- a/src/askui/models/openrouter/model.py +++ b/src/askui/models/openrouter/model.py @@ -10,7 +10,8 @@ from askui.models.models import GetModel from askui.models.shared.prompts import SYSTEM_PROMPT_GET from askui.models.types.response_schemas import ResponseSchema, to_response_schema -from askui.utils.image_utils import ImageSource +from askui.utils.pdf_utils import PdfSource +from askui.utils.source_utils import Source from .settings import OpenRouterSettings @@ -169,12 +170,15 @@ def _predict( def get( self, query: str, - image: ImageSource, + source: Source, response_schema: Type[ResponseSchema] | None, model_choice: str, ) -> ResponseSchema | str: + if isinstance(source, PdfSource): + err_msg = f"PDF processing is not supported for the model {model_choice}" + raise NotImplementedError(err_msg) response = self._predict( - image_url=image.to_data_url(), + image_url=source.to_data_url(), instruction=query, prompt=SYSTEM_PROMPT_GET, response_schema=response_schema, diff --git a/src/askui/models/shared/facade.py b/src/askui/models/shared/facade.py index 9789f3b1..a26c9cfd 100644 --- a/src/askui/models/shared/facade.py +++ b/src/askui/models/shared/facade.py @@ -10,6 +10,7 @@ from askui.models.shared.tools import Tool from askui.models.types.response_schemas import ResponseSchema from askui.utils.image_utils import ImageSource +from askui.utils.source_utils import Source class ModelFacade(ActModel, GetModel, LocateModel): @@ -44,11 +45,11 @@ def act( 
def get( self, query: str, - image: ImageSource, + source: Source, response_schema: Type[ResponseSchema] | None, model_choice: str, ) -> ResponseSchema | str: - return self._get_model.get(query, image, response_schema, model_choice) + return self._get_model.get(query, source, response_schema, model_choice) @override def locate( diff --git a/src/askui/models/ui_tars_ep/ui_tars_api.py b/src/askui/models/ui_tars_ep/ui_tars_api.py index 4c2c84ee..84cf46a1 100644 --- a/src/askui/models/ui_tars_ep/ui_tars_api.py +++ b/src/askui/models/ui_tars_ep/ui_tars_api.py @@ -19,6 +19,8 @@ from askui.models.types.response_schemas import ResponseSchema from askui.reporting import Reporter from askui.utils.image_utils import ImageSource, image_to_base64 +from askui.utils.pdf_utils import PdfSource +from askui.utils.source_utils import Source from .parser import UITarsEPMessage from .prompts import PROMPT, PROMPT_QA @@ -176,15 +178,18 @@ def locate( def get( self, query: str, - image: ImageSource, + source: Source, response_schema: Type[ResponseSchema] | None, model_choice: str, ) -> ResponseSchema | str: + if isinstance(source, PdfSource): + err_msg = f"PDF processing is not supported for the model {model_choice}" + raise NotImplementedError(err_msg) if response_schema is not None: error_msg = f'Response schema is not supported for model "{model_choice}"' raise NotImplementedError(error_msg) response = self._predict( - image_url=image.to_data_url(), + image_url=source.to_data_url(), instruction=query, prompt=PROMPT_QA, ) diff --git a/src/askui/utils/image_utils.py b/src/askui/utils/image_utils.py index bfefc4ac..4f166579 100644 --- a/src/askui/utils/image_utils.py +++ b/src/askui/utils/image_utils.py @@ -2,61 +2,13 @@ import binascii import io import pathlib -import re from dataclasses import dataclass from pathlib import Path -from typing import Any, Literal, Union +from typing import Literal, Union from PIL import Image, ImageDraw, UnidentifiedImageError from PIL import Image as 
PILImage -from pydantic import ConfigDict, RootModel, field_validator - -# Regex to capture any kind of valid base64 data url (with optional media type and ;base64) -# e.g., data:image/png;base64,... or data:;base64,... or data:,... or just ,... -_DATA_URL_GENERIC_RE = re.compile(r"^(?:data:)?[^,]*?,(.*)$", re.DOTALL) - - -def load_image(source: Union[str, Path, Image.Image]) -> Image.Image: - """Load and validate an image from a PIL Image, a path, or any form of base64 data URL. - - Args: - source (Union[str, Path, Image.Image]): The image source to load from. - Can be a PIL Image, file path (`str` or `pathlib.Path`), or data URL. - - Returns: - Image.Image: A valid PIL Image object. - - Raises: - ValueError: If the input is not a valid or recognizable image. - """ - if isinstance(source, Image.Image): - return source - - if isinstance(source, Path) or (not source.startswith(("data:", ","))): - try: - return Image.open(source) - except (OSError, FileNotFoundError, UnidentifiedImageError) as e: - error_msg = f"Could not open image from file path: {source}" - raise ValueError(error_msg) from e - - else: - match = _DATA_URL_GENERIC_RE.match(source) - if match: - try: - image_data = base64.b64decode(match.group(1)) - return Image.open(io.BytesIO(image_data)) - except (binascii.Error, UnidentifiedImageError): - try: - return Image.open(source) - except (FileNotFoundError, UnidentifiedImageError) as e: - error_msg = ( - f"Could not decode or identify image from input:" - f"{source[:100]}{'...' 
if len(source) > 100 else ''}" - ) - raise ValueError(error_msg) from e - - error_msg = f"Unsupported image input type: {type(source)}" - raise ValueError(error_msg) +from pydantic import ConfigDict, RootModel def image_to_data_url(image: PILImage.Image) -> str: @@ -386,14 +338,6 @@ class ImageSource(RootModel): model_config = ConfigDict(arbitrary_types_allowed=True) root: PILImage.Image - def __init__(self, root: Img, **kwargs: dict[str, Any]) -> None: - super().__init__(root=root, **kwargs) - - @field_validator("root", mode="before") - @classmethod - def validate_root(cls, v: Any) -> PILImage.Image: - return load_image(v) - def to_data_url(self) -> str: """Convert the image to a data URL. @@ -422,7 +366,6 @@ def to_bytes(self) -> bytes: __all__ = [ - "load_image", "image_to_data_url", "data_url_to_image", "draw_point_on_image", diff --git a/src/askui/utils/pdf_utils.py b/src/askui/utils/pdf_utils.py new file mode 100644 index 00000000..2df0246d --- /dev/null +++ b/src/askui/utils/pdf_utils.py @@ -0,0 +1,42 @@ +from io import BufferedReader, BytesIO +from pathlib import Path +from typing import Union + +from pydantic import ConfigDict, RootModel + +Pdf = Union[str, Path] +"""Type of the input PDFs for `askui.VisionAgent.get()`, etc. + +Accepts: +- Relative or absolute file path (`str` or `pathlib.Path`) +""" + + +class PdfSource(RootModel): + """A class that represents a PDF source. + It provides methods to convert it to different formats. + + The class can be initialized with: + - A file path (str or pathlib.Path) + + Attributes: + root (bytes): The underlying PDF bytes. + + Args: + root (Pdf): The PDF source to load from. 
+    """
+
+    model_config = ConfigDict(arbitrary_types_allowed=True)
+    root: bytes | Path
+
+    @property
+    def reader(self) -> BufferedReader | BytesIO:
+        if isinstance(self.root, Path):
+            return self.root.open("rb")
+        return BytesIO(self.root)
+
+
+__all__ = [
+    "PdfSource",
+    "Pdf",
+]
diff --git a/src/askui/utils/source_utils.py b/src/askui/utils/source_utils.py
new file mode 100644
index 00000000..619f134a
--- /dev/null
+++ b/src/askui/utils/source_utils.py
@@ -0,0 +1,187 @@
+import base64
+import re
+from dataclasses import dataclass
+from enum import Enum
+from io import BytesIO
+from pathlib import Path
+from typing import Literal, Union
+from urllib.parse import unquote_to_bytes
+
+from filetype import guess  # type: ignore[import-untyped]
+from PIL import Image as PILImage
+
+from askui.utils.image_utils import ImageSource
+from askui.utils.pdf_utils import PdfSource
+
+Source = Union[ImageSource, PdfSource]
+
+_DATA_URL_WITH_MIMETYPE_RE = re.compile(r"^data:([^;,]+)([^,]*)?,(.*)$", re.DOTALL)
+
+_SupportedImageMimeTypes = Literal["image/png", "image/jpeg", "image/gif", "image/webp"]
+_SupportedApplicationMimeTypes = Literal["application/pdf"]
+_SupportedMimeTypes = _SupportedImageMimeTypes | _SupportedApplicationMimeTypes
+
+_SUPPORTED_MIME_TYPES: list[_SupportedMimeTypes] = [
+    "image/png",
+    "image/jpeg",
+    "image/gif",
+    "image/webp",
+    "application/pdf",
+]
+
+
+class _SourceType(Enum):
+    DATA_URL = "data_url"
+    FILE = "file"
+    UNKNOWN = "unknown"
+
+
+@dataclass
+class _SourceAnalysis:
+    type: _SourceType = _SourceType.UNKNOWN
+    mime: str | None = None
+    content: Path | bytes | None = None
+
+    @property
+    def is_supported(self) -> bool:
+        return bool(self.mime) and self.mime in _SUPPORTED_MIME_TYPES
+
+    @property
+    def is_pdf(self) -> bool:
+        return self.mime == "application/pdf"
+
+    @property
+    def is_image(self) -> bool:
+        if self.mime:
+            return self.mime.startswith("image/")
+        return False
+
+
+def _analyze_data_url(source: str) -> _SourceAnalysis | None:
+    """Analyze a data URL that declares a MIME type.
+
+    Args:
+        source (str): The candidate data URL, e.g., `data:image/png;base64,...`.
+
+    Returns:
+        _SourceAnalysis | None: The analysis holding the decoded payload or `None`
+            if `source` is not a data URL with a MIME type and a non-empty payload.
+
+    Raises:
+        binascii.Error: If a base64 payload cannot be decoded.
+    """
+    if (
+        (match := _DATA_URL_WITH_MIMETYPE_RE.match(source))
+        and (mime := match.group(1))
+        and (data := match.group(3))
+    ):
+        # `;base64` is the last parameter of a data URL (RFC 2397), so other
+        # parameters, e.g., `;charset=utf-8`, may precede it. It is deliberately not
+        # part of the condition above: payloads without it are percent-encoded and
+        # must still be accepted.
+        is_base64 = (match.group(2) or "").endswith(";base64")
+        data_decoded = base64.b64decode(data) if is_base64 else unquote_to_bytes(data)
+        return _SourceAnalysis(
+            type=_SourceType.DATA_URL,
+            mime=mime,
+            content=data_decoded,
+        )
+    return None
+
+
+def _analyze_file(source: str | Path) -> _SourceAnalysis | None:
+    if (kind := guess(str(source))) and (mime := kind.mime):
+        return _SourceAnalysis(
+            type=_SourceType.FILE,
+            mime=mime,
+            content=Path(source),
+        )
+    return None
+
+
+def _analyze_source(source: Union[str, Path]) -> _SourceAnalysis:
+    """Analyze a source (data url (`str`) or file path (`str` or `Path`)).
+
+    Args:
+        source (Union[str, Path]): The source to analyze.
+
+    Returns:
+        SourceAnalysis: The analysis of the source.
+
+    Raises:
+        binascii.Error: If the data within data url cannot be decoded.
+        FileNotFoundError: If the source is regarded to be a file path and does not
+            exist.
+    """
+    if isinstance(source, str) and (result := _analyze_data_url(source)):
+        return result
+    if result := _analyze_file(source):
+        return result
+    return _SourceAnalysis(type=_SourceType.UNKNOWN)
+
+
+def load_source(source: Union[str, Path, PILImage.Image]) -> Source:
+    """Load a source and return it as an ImageSource or PdfSource.
+
+    Args:
+        source (Union[str, Path]): The source to load.
+
+    Returns:
+        Source: The loaded source as an ImageSource or PdfSource.
+
+    Raises:
+        ValueError: If the source is not a valid image or PDF file.
+        FileNotFoundError: If the source is regarded to be a file path and does not
+            exist.
+        binascii.Error: If the data within data url cannot be decoded.
+    """
+
+    if isinstance(source, PILImage.Image):
+        return ImageSource(source)
+    source_analysis = _analyze_source(source)
+    if not source_analysis.is_supported:
+        msg = (
+            f"Unsupported mime type: {source_analysis.mime} "
+            f"(supported: {_SUPPORTED_MIME_TYPES})"
+        )
+        raise ValueError(msg)
+    if not source_analysis.content:
+        msg = "No content to read from"
+        raise ValueError(msg)
+    if source_analysis.is_pdf:
+        return PdfSource(source_analysis.content)
+    if source_analysis.is_image:
+        return ImageSource(
+            PILImage.open(
+                BytesIO(source_analysis.content)
+                if isinstance(source_analysis.content, bytes)
+                else source_analysis.content
+            )
+        )
+    msg = "Unsupported source type"
+    raise ValueError(msg)
+
+
+def load_image_source(source: Union[str, Path, PILImage.Image]) -> ImageSource:
+    """Load a source and return it as an ImageSource.
+
+    Args:
+        source (Union[str, Path, PILImage.Image]): The source to load.
+
+    Returns:
+        ImageSource: The loaded source.
+
+    Raises:
+        ValueError: If the type of the source is not supported or it has no content.
+        TypeError: If the source was loaded successfully but is not an image (PDF).
+        FileNotFoundError: If the source is treated as a file path that does not exist.
+        binascii.Error: If the data within data url cannot be decoded.
+    """
+    result = load_source(source)
+    if not isinstance(result, ImageSource):
+        msg = "Source is not an image"
+        raise TypeError(msg)
+    return result
+
+
+__all__ = ["Source", "load_source", "load_image_source"]
diff --git a/tests/conftest.py b/tests/conftest.py
index c8dab188..72b7cba4 100644
--- a/tests/conftest.py
+++ b/tests/conftest.py
@@ -28,6 +28,18 @@ def path_fixtures_screenshots(path_fixtures: pathlib.Path) -> pathlib.Path:
     return path_fixtures / "screenshots"
 
 
+@pytest.fixture
+def path_fixtures_pdf(path_fixtures: pathlib.Path) -> pathlib.Path:
+    """Fixture providing the path to the pdf directory."""
+    return path_fixtures / "pdf"
+
+
+@pytest.fixture
+def path_fixtures_dummy_pdf(path_fixtures_pdf: pathlib.Path) -> pathlib.Path:
+    """Fixture providing the path to the dummy pdf."""
+    return path_fixtures_pdf / "dummy.pdf"
+
+
 @pytest.fixture
 def github_login_screenshot(path_fixtures_screenshots: pathlib.Path) -> Image.Image:
     """Fixture providing the GitHub login screenshot."""
diff --git a/tests/e2e/agent/test_get.py b/tests/e2e/agent/test_get.py
index 5dc1d43a..d3a54119 100644
--- a/tests/e2e/agent/test_get.py
+++ b/tests/e2e/agent/test_get.py
@@ -1,8 +1,10 @@
+import pathlib
 from typing import Literal
 
 import pytest
 from PIL import Image as PILImage
 from pydantic import BaseModel, RootModel
+from pytest_mock import MockerFixture
 
 from askui import ResponseSchemaBase, VisionAgent
 from askui.models import ModelName
@@ -43,12 +45,86 @@ def test_get(
 ) -> None:
     url = vision_agent.get(
         "What is the current url shown in the url bar?\nUrl: ",
-        image=github_login_screenshot,
+        source=github_login_screenshot,
         model=model,
     )
     assert url in ["github.com/login", "https://github.com/login"]
 
 
+def test_get_with_pdf_with_non_gemini_model_raises_not_implemented(
+    vision_agent: VisionAgent, path_fixtures_dummy_pdf: pathlib.Path
+) -> None:
+    with pytest.raises(NotImplementedError):
+        vision_agent.get(
+            "What is in the PDF?",
+            source=path_fixtures_dummy_pdf,
model=ModelName.ANTHROPIC__CLAUDE__3_5__SONNET__20241022, + ) + + +@pytest.mark.parametrize( + "model", + [ + ModelName.ASKUI__GEMINI__2_5__FLASH, + ModelName.ASKUI__GEMINI__2_5__PRO, + ], +) +def test_get_with_pdf_with_gemini_model( + vision_agent: VisionAgent, model: str, path_fixtures_dummy_pdf: pathlib.Path +) -> None: + response = vision_agent.get( + "What is in the PDF? explain in 1 sentence", + source=path_fixtures_dummy_pdf, + model=model, + ) + assert isinstance(response, str) + assert "is a test " in response.lower() + + +@pytest.mark.parametrize( + "model", + [ + ModelName.ASKUI__GEMINI__2_5__FLASH, + ModelName.ASKUI__GEMINI__2_5__PRO, + ], +) +def test_get_with_pdf_too_large( + vision_agent: VisionAgent, + model: str, + path_fixtures_dummy_pdf: pathlib.Path, + mocker: MockerFixture, +) -> None: + mocker.patch( + "askui.models.askui.google_genai_api.MAX_FILE_SIZE_BYTES", + 1, + ) + with pytest.raises(ValueError, match="PDF file size exceeds the limit"): + vision_agent.get( + "What is in the PDF?", + source=path_fixtures_dummy_pdf, + model=model, + ) + + +def test_get_with_pdf_too_large_with_default_model( + vision_agent: VisionAgent, + path_fixtures_dummy_pdf: pathlib.Path, + mocker: MockerFixture, +) -> None: + mocker.patch( + "askui.models.askui.google_genai_api.MAX_FILE_SIZE_BYTES", + 1, + ) + + # This should raise a ValueError because the default model is Gemini and it falls + # back to inference askui which does not support pdfs + with pytest.raises(ValueError, match="PDF file size exceeds the limit"): + vision_agent.get( + "What is in the PDF?", + source=path_fixtures_dummy_pdf, + ) + + def test_get_with_model_composition_should_use_default_model( agent_toolbox_mock: AgentToolbox, askui_facade: ModelFacade, @@ -76,7 +152,7 @@ def test_get_with_model_composition_should_use_default_model( ) as vision_agent: url = vision_agent.get( "What is the current url shown in the url bar?", - image=github_login_screenshot, + source=github_login_screenshot, ) 
assert url in ["github.com/login", "https://github.com/login"] @@ -92,7 +168,7 @@ def test_get_with_response_schema_without_additional_properties_with_askui_model with pytest.raises(Exception): # noqa: B017 vision_agent.get( "What is the current url shown in the url bar?", - image=github_login_screenshot, + source=github_login_screenshot, response_schema=UrlResponseBaseModel, # type: ignore[type-var] model=ModelName.ASKUI, ) @@ -108,7 +184,7 @@ def test_get_with_response_schema_with_default_value( ) -> None: response = vision_agent.get( "What is the current url shown in the url bar?", - image=github_login_screenshot, + source=github_login_screenshot, response_schema=OptionalUrlResponse, model=ModelName.ASKUI, ) @@ -124,7 +200,7 @@ def test_get_with_response_schema( ) -> None: response = vision_agent.get( "What is the current url shown in the url bar?", - image=github_login_screenshot, + source=github_login_screenshot, response_schema=UrlResponse, model=model, ) @@ -139,7 +215,7 @@ def test_get_with_response_schema_with_anthropic_model_raises_not_implemented( with pytest.raises(NotImplementedError): vision_agent.get( "What is the current url shown in the url bar?", - image=github_login_screenshot, + source=github_login_screenshot, response_schema=UrlResponse, model=ModelName.CLAUDE__SONNET__4__20250514, ) @@ -153,7 +229,7 @@ def test_get_with_nested_and_inherited_response_schema( ) -> None: response = vision_agent.get( "What is the current browser context?", - image=github_login_screenshot, + source=github_login_screenshot, response_schema=BrowserContextResponse, model=model, ) @@ -177,7 +253,7 @@ def test_get_with_recursive_response_schema( response = vision_agent.get( "Can you extract all segments (domain, path etc.) from the url as a linked list, " "e.g. 
'https://google.com/test' -> 'google.com->test->None'?", - image=github_login_screenshot, + source=github_login_screenshot, response_schema=LinkedListNode, model=model, ) @@ -200,7 +276,7 @@ def test_get_with_string_schema( ) -> None: response = vision_agent.get( "What is the current url shown in the url bar?", - image=github_login_screenshot, + source=github_login_screenshot, response_schema=str, model=model, ) @@ -215,7 +291,7 @@ def test_get_with_boolean_schema( ) -> None: response = vision_agent.get( "Is this a login page?", - image=github_login_screenshot, + source=github_login_screenshot, response_schema=bool, model=model, ) @@ -231,7 +307,7 @@ def test_get_with_integer_schema( ) -> None: response = vision_agent.get( "How many input fields are visible on this page?", - image=github_login_screenshot, + source=github_login_screenshot, response_schema=int, model=model, ) @@ -247,7 +323,7 @@ def test_get_with_float_schema( ) -> None: response = vision_agent.get( "Return a floating point number between 0 and 1 as a rating for how you well this page is designed (0 is the worst, 1 is the best)", - image=github_login_screenshot, + source=github_login_screenshot, response_schema=float, model=model, ) @@ -263,7 +339,7 @@ def test_get_returns_str_when_no_schema_specified( ) -> None: response = vision_agent.get( "What is the display showing?", - image=github_login_screenshot, + source=github_login_screenshot, model=model, ) assert isinstance(response, str) @@ -281,7 +357,7 @@ def test_get_with_basis_schema( ) -> None: response = vision_agent.get( "What is the display showing?", - image=github_login_screenshot, + source=github_login_screenshot, response_schema=Basis, model=model, ) @@ -305,7 +381,7 @@ def test_get_with_nested_root_model( ) -> None: response = vision_agent.get( "What is the display showing?", - image=github_login_screenshot, + source=github_login_screenshot, response_schema=BasisWithNestedRootModel, model=model, ) @@ -353,7 +429,7 @@ def 
test_get_with_deeply_nested_response_schema_with_model_that_does_not_support """ response = vision_agent.get( "Create a possible dom of the page that goes 4 levels deep", - image=github_login_screenshot, + source=github_login_screenshot, response_schema=PageDom, model=model, ) diff --git a/tests/fixtures/pdf/dummy.pdf b/tests/fixtures/pdf/dummy.pdf new file mode 100644 index 00000000..e0191a71 Binary files /dev/null and b/tests/fixtures/pdf/dummy.pdf differ diff --git a/tests/integration/models/openrouter/test_openrouter.py b/tests/integration/models/openrouter/test_openrouter.py index 25615610..51866a5b 100644 --- a/tests/integration/models/openrouter/test_openrouter.py +++ b/tests/integration/models/openrouter/test_openrouter.py @@ -37,7 +37,7 @@ def test_basic_query_returns_string( result = openrouter_model.get( query="What is in the image?", - image=image_source_github_login_screenshot, + source=image_source_github_login_screenshot, response_schema=None, model_choice="test-model", ) @@ -64,7 +64,7 @@ def test_query_with_response_schema_returns_validated_object( result = openrouter_model.get( query="What is in the image?", - image=image_source_github_login_screenshot, + source=image_source_github_login_screenshot, response_schema=TestResponse, model_choice="test-model", ) @@ -87,7 +87,7 @@ def test_no_response_from_model( with pytest.raises(QueryNoResponseError): openrouter_model.get( query="What is in the image?", - image=image_source_github_login_screenshot, + source=image_source_github_login_screenshot, response_schema=None, model_choice="test-model", ) @@ -106,7 +106,7 @@ def test_malformed_json_from_model( with pytest.raises(ValueError): openrouter_model.get( query="What is in the image?", - image=image_source_github_login_screenshot, + source=image_source_github_login_screenshot, response_schema=TestResponse, model_choice="test-model", ) diff --git a/tests/integration/test_custom_models.py b/tests/integration/test_custom_models.py index c1991b49..962def95 
100644 --- a/tests/integration/test_custom_models.py +++ b/tests/integration/test_custom_models.py @@ -1,5 +1,6 @@ """Integration tests for custom model registration and selection.""" +import pathlib from typing import Any, Optional, Type, Union import pytest @@ -23,6 +24,7 @@ from askui.models.shared.tools import Tool from askui.tools.toolbox import AgentToolbox from askui.utils.image_utils import ImageSource +from askui.utils.source_utils import Source class SimpleActModel(ActModel): @@ -50,7 +52,7 @@ class SimpleGetModel(GetModel): def __init__(self, response: str | ResponseSchemaBase = "test response") -> None: self.queries: list[str] = [] - self.images: list[ImageSource] = [] + self.sources: list[Source] = [] self.schemas: list[Any] = [] self.model_choices: list[str] = [] self.response = response @@ -58,12 +60,12 @@ def __init__(self, response: str | ResponseSchemaBase = "test response") -> None def get( self, query: str, - image: ImageSource, + source: Source, response_schema: Optional[Type[ResponseSchema]], model_choice: str, ) -> Union[ResponseSchema, str]: self.queries.append(query) - self.images.append(image) + self.sources.append(source) self.schemas.append(response_schema) self.model_choices.append(model_choice) if ( @@ -163,6 +165,23 @@ def test_register_and_use_custom_get_model( assert get_model.queries == ["test query"] assert get_model.model_choices == ["custom-get"] + def test_register_and_use_custom_get_model_with_pdf( + self, + model_registry: ModelRegistry, + get_model: SimpleGetModel, + agent_toolbox_mock: AgentToolbox, + path_fixtures_dummy_pdf: pathlib.Path, + ) -> None: + """Test registering and using a custom get model with a PDF.""" + with VisionAgent(models=model_registry, tools=agent_toolbox_mock) as agent: + result = agent.get( + "test query", model="custom-get", source=path_fixtures_dummy_pdf + ) + + assert result == "test response" + assert get_model.queries == ["test query"] + assert get_model.model_choices == ["custom-get"] + def 
test_register_and_use_custom_locate_model( self, model_registry: ModelRegistry, diff --git a/tests/integration/utils/test_pdf_utils.py b/tests/integration/utils/test_pdf_utils.py new file mode 100644 index 00000000..7c7e52b3 --- /dev/null +++ b/tests/integration/utils/test_pdf_utils.py @@ -0,0 +1,31 @@ +import base64 +import pathlib + +import pytest + +from askui.utils.source_utils import load_source + + +class TestLoadPdf: + def test_load_pdf_from_path(self, path_fixtures_dummy_pdf: pathlib.Path) -> None: + loaded = load_source(path_fixtures_dummy_pdf) + assert isinstance(loaded.root, bytes | pathlib.Path) + with loaded.reader as r: + assert len(r.read()) > 0 + + def test_load_pdf_nonexistent_file(self) -> None: + with pytest.raises(FileNotFoundError): + load_source("nonexistent_file.pdf") + + def test_pdf_source_from_data_url( + self, path_fixtures_dummy_pdf: pathlib.Path + ) -> None: + # Load test PDF and convert to base64 + with pathlib.Path.open(path_fixtures_dummy_pdf, "rb") as f: + pdf_bytes = f.read() + pdf_str = base64.b64encode(pdf_bytes).decode() + data_url = f"data:application/pdf;base64,{pdf_str}" + source = load_source(data_url) + assert isinstance(source.root, bytes | pathlib.Path) + with source.reader as r: + assert len(r.read()) > 0 diff --git a/tests/unit/locators/test_locators.py b/tests/unit/locators/test_locators.py index 7aee9024..29af0dff 100644 --- a/tests/unit/locators/test_locators.py +++ b/tests/unit/locators/test_locators.py @@ -158,7 +158,7 @@ def test_initialization_with_custom_params( ) def test_initialization_with_invalid_args(self, test_image: PILImage.Image) -> None: - with pytest.raises(ValueError): + with pytest.raises(FileNotFoundError): Image(image="not_an_image") with pytest.raises(ValueError): diff --git a/tests/unit/utils/test_image_utils.py b/tests/unit/utils/test_image_utils.py index f739655a..9cf23116 100644 --- a/tests/unit/utils/test_image_utils.py +++ b/tests/unit/utils/test_image_utils.py @@ -12,109 +12,20 @@ 
draw_point_on_image, image_to_base64, image_to_data_url, - load_image, scale_coordinates, scale_image_to_fit, ) -class TestLoadImage: - def test_load_image_from_pil( - self, path_fixtures_github_com__icon: pathlib.Path - ) -> None: - img = Image.open(path_fixtures_github_com__icon) - loaded = load_image(img) - assert loaded == img - - def test_load_image_from_path( - self, path_fixtures_github_com__icon: pathlib.Path - ) -> None: - # Test loading from Path - loaded = load_image(path_fixtures_github_com__icon) - assert isinstance(loaded, Image.Image) - assert loaded.size == (128, 125) # GitHub icon size - - # Test loading from str path - loaded = load_image(str(path_fixtures_github_com__icon)) - assert isinstance(loaded, Image.Image) - assert loaded.size == (128, 125) - - def test_load_image_from_base64( - self, path_fixtures_github_com__icon: pathlib.Path - ) -> None: - # Load test image and convert to base64 - with pathlib.Path.open(path_fixtures_github_com__icon, "rb") as f: - img_bytes = f.read() - img_str = base64.b64encode(img_bytes).decode() - - # Test different base64 formats - formats = [ - f"data:image/png;base64,{img_str}", - f"data:;base64,{img_str}", - f"data:,{img_str}", - f",{img_str}", - ] - - for fmt in formats: - loaded = load_image(fmt) - assert isinstance(loaded, Image.Image) - assert loaded.size == (128, 125) - - def test_load_image_invalid( - self, path_fixtures_github_com__icon: pathlib.Path - ) -> None: - with pytest.raises(ValueError): - load_image("invalid_path.png") - - with pytest.raises(ValueError): - load_image("invalid_base64") - - with pytest.raises(ValueError): - with pathlib.Path.open(path_fixtures_github_com__icon, "rb") as f: - img_bytes = f.read() - img_str = base64.b64encode(img_bytes).decode() - load_image(img_str) - - def test_load_image_nonexistent_file(self) -> None: - with pytest.raises(ValueError, match="Could not open image from file path"): - load_image("nonexistent_file.png") - - class TestImageSource: - def 
test_image_source(self, path_fixtures_github_com__icon: pathlib.Path) -> None: - # Test with PIL Image - img = Image.open(path_fixtures_github_com__icon) - source = ImageSource(root=img) - assert source.root == img - - # Test with path - source = ImageSource(root=path_fixtures_github_com__icon) - assert isinstance(source.root, Image.Image) - assert source.root.size == (128, 125) - - # Test with base64 - with pathlib.Path.open(path_fixtures_github_com__icon, "rb") as f: - img_bytes = f.read() - img_str = base64.b64encode(img_bytes).decode() - source = ImageSource(root=f"data:image/png;base64,{img_str}") - assert isinstance(source.root, Image.Image) - assert source.root.size == (128, 125) - - def test_image_source_invalid(self) -> None: - with pytest.raises(ValueError): - ImageSource(root="invalid_path.png") - - with pytest.raises(ValueError): - ImageSource(root="invalid_base64") - def test_to_data_url(self, path_fixtures_github_com__icon: pathlib.Path) -> None: - source = ImageSource(root=path_fixtures_github_com__icon) + source = ImageSource(Image.open(path_fixtures_github_com__icon)) data_url = source.to_data_url() assert data_url.startswith("data:image/png;base64,") assert len(data_url) > 100 # Should have some base64 content def test_to_base64(self, path_fixtures_github_com__icon: pathlib.Path) -> None: - source = ImageSource(root=path_fixtures_github_com__icon) + source = ImageSource(Image.open(path_fixtures_github_com__icon)) base64_str = source.to_base64() assert len(base64_str) > 100 # Should have some base64 content diff --git a/tests/unit/utils/test_source_utils.py b/tests/unit/utils/test_source_utils.py new file mode 100644 index 00000000..fb72925c --- /dev/null +++ b/tests/unit/utils/test_source_utils.py @@ -0,0 +1,56 @@ +import base64 +import pathlib + +import pytest +from PIL import Image + +from askui.utils.source_utils import load_image_source + + +class TestLoadImageSource: + def test_image_source_from_pil( + self, path_fixtures_github_com__icon: 
pathlib.Path + ) -> None: + source = load_image_source(path_fixtures_github_com__icon) + assert source.root == Image.open(path_fixtures_github_com__icon) + + def test_image_source_from_path( + self, path_fixtures_github_com__icon: pathlib.Path + ) -> None: + # Test loading from Path + source = load_image_source(path_fixtures_github_com__icon) + assert isinstance(source.root, Image.Image) + assert source.root.size == (128, 125) # GitHub icon size + + # Test loading from str path + source = load_image_source(str(path_fixtures_github_com__icon)) + assert isinstance(source.root, Image.Image) + assert source.root.size == (128, 125) + + def test_image_source_from_data_url( + self, path_fixtures_github_com__icon: pathlib.Path + ) -> None: + # Load test image and convert to base64 + with pathlib.Path.open(path_fixtures_github_com__icon, "rb") as f: + img_bytes = f.read() + img_str = base64.b64encode(img_bytes).decode() + data_url = f"data:image/png;base64,{img_str}" + + source = load_image_source(data_url) + assert isinstance(source.root, Image.Image) + assert source.root.size == (128, 125) + + def test_image_source_invalid( + self, path_fixtures_github_com__icon: pathlib.Path + ) -> None: + with pytest.raises(FileNotFoundError): + load_image_source("invalid_path.png") + + with pytest.raises(FileNotFoundError): + load_image_source("invalid_base64") + + with pytest.raises(OSError): + with pathlib.Path.open(path_fixtures_github_com__icon, "rb") as f: + img_bytes = f.read() + img_str = base64.b64encode(img_bytes).decode() + load_image_source(img_str)