diff --git a/src/pdfrest/client.py b/src/pdfrest/client.py index 8818899f..5ed0ed81 100644 --- a/src/pdfrest/client.py +++ b/src/pdfrest/client.py @@ -66,41 +66,78 @@ PdfRestFileBasedResponse, PdfRestFileID, PdfRestInfoResponse, + SummarizePdfTextResponse, + TranslatePdfTextFileResponse, + TranslatePdfTextResponse, UpResponse, ) - -__all__ = ("AsyncPdfRestClient", "PdfRestClient") - from .models._internal import ( BasePdfRestGraphicPayload, BmpPdfRestPayload, + ConvertToMarkdownPayload, DeletePayload, + ExtractImagesPayload, + ExtractTextPayload, GifPdfRestPayload, JpegPdfRestPayload, + OcrPdfPayload, + PdfBlankPayload, PdfCompressPayload, + PdfConvertColorsPayload, + PdfFlattenAnnotationsPayload, PdfFlattenFormsPayload, + PdfFlattenLayersPayload, + PdfFlattenTransparenciesPayload, PdfInfoPayload, + PdfLinearizePayload, PdfMergePayload, + PdfRasterizePayload, PdfRedactionApplyPayload, PdfRedactionPreviewPayload, PdfRestRawFileResponse, PdfSplitPayload, + PdfToExcelPayload, + PdfToPdfaPayload, PdfToPdfxPayload, + PdfToPowerpointPayload, PdfToWordPayload, + PdfXfaToAcroformsPayload, PngPdfRestPayload, + SummarizePdfTextPayload, TiffPdfRestPayload, + TranslatePdfTextPayload, UploadURLs, ) from .types import ( ALL_PDF_INFO_QUERIES, + BmpColorModel, + CompressionLevel, + ExtractTextGranularity, + FlattenQuality, + GifColorModel, + GraphicSmoothing, + JpegColorModel, + OcrLanguage, + PdfAType, + PdfColorProfile, PdfInfoQuery, PdfMergeInput, + PdfPageOrientation, PdfPageSelection, + PdfPageSize, PdfRedactionInstruction, PdfRGBColor, PdfXType, + PngColorModel, + SummaryFormat, + SummaryOutputFormat, + TiffColorModel, + TranslateOutputFormat, ) +__all__ = ("AsyncPdfRestClient", "PdfRestClient") +FileResponseModel = TypeVar("FileResponseModel", bound=PdfRestFileBasedResponse) + DEFAULT_BASE_URL = "https://api.pdfrest.com" API_KEY_ENV_VAR = "PDFREST_API_KEY" API_KEY_HEADER_NAME = "Api-Key" @@ -965,11 +1002,12 @@ def _post_file_operation( endpoint: str, payload: dict[str, 
Any], payload_model: type[BaseModel], + response_model: type[FileResponseModel] = PdfRestFileBasedResponse, extra_query: Query | None = None, extra_headers: AnyMapping | None = None, extra_body: Body | None = None, timeout: TimeoutTypes | None = None, - ) -> PdfRestFileBasedResponse: + ) -> FileResponseModel: job_options = payload_model.model_validate(payload) json_body = job_options.model_dump( mode="json", by_alias=True, exclude_none=True, exclude_unset=True @@ -997,15 +1035,18 @@ def _post_file_operation( for file_id in output_ids ] - return PdfRestFileBasedResponse.model_validate( - { - "input_id": [str(file_id) for file_id in raw_response.input_id], - "output_file": [ - file.model_dump(mode="json", by_alias=True) for file in output_files - ], - "warning": raw_response.warning, - } - ) + input_ids = raw_response.input_id or (raw_response.ids or []) + response_payload: dict[str, Any] = { + "input_id": [str(file_id) for file_id in input_ids], + "output_file": [ + file.model_dump(mode="json", by_alias=True) for file in output_files + ], + "warning": raw_response.warning, + } + if raw_response.model_extra: + response_payload.update(raw_response.model_extra) + + return response_model.model_validate(response_payload) def send_request(self, request: _RequestModel) -> Any: return self._send_request(request) @@ -1229,11 +1270,12 @@ async def _post_file_operation( endpoint: str, payload: dict[str, Any], payload_model: type[BaseModel], + response_model: type[FileResponseModel] = PdfRestFileBasedResponse, extra_query: Query | None = None, extra_headers: AnyMapping | None = None, extra_body: Body | None = None, timeout: TimeoutTypes | None = None, - ) -> PdfRestFileBasedResponse: + ) -> FileResponseModel: job_options = payload_model.model_validate(payload) request = self.prepare_request( "POST", @@ -1269,15 +1311,18 @@ async def throttled_fetch_file_info(file_id: str) -> PdfRestFile: ) ) - return PdfRestFileBasedResponse.model_validate( - { - "input_id": [str(file_id) for 
file_id in raw_response.input_id], - "output_file": [ - file.model_dump(mode="json", by_alias=True) for file in output_files - ], - "warning": raw_response.warning, - } - ) + input_ids = raw_response.input_id or (raw_response.ids or []) + response_payload: dict[str, Any] = { + "input_id": [str(file_id) for file_id in input_ids], + "output_file": [ + file.model_dump(mode="json", by_alias=True) for file in output_files + ], + "warning": raw_response.warning, + } + if raw_response.model_extra: + response_payload.update(raw_response.model_extra) + + return response_model.model_validate(response_payload) async def send_request(self, request: _RequestModel) -> Any: return await self._send_request(request) @@ -2105,442 +2150,1041 @@ def query_pdf_info( raw_payload = self._send_request(request) return PdfRestInfoResponse.model_validate(raw_payload) - def preview_redactions( + def summarize_text( self, file: PdfRestFile | Sequence[PdfRestFile], *, - redactions: PdfRedactionInstruction | Sequence[PdfRedactionInstruction], + target_word_count: int = 400, + summary_format: SummaryFormat = "overview", + pages: PdfPageSelection | None = None, + output_format: SummaryOutputFormat = "markdown", output: str | None = None, extra_query: Query | None = None, extra_headers: AnyMapping | None = None, extra_body: Body | None = None, timeout: TimeoutTypes | None = None, - ) -> PdfRestFileBasedResponse: - """Generate a PDF redaction preview with annotated redaction rectangles.""" + ) -> SummarizePdfTextResponse: + """Summarize the textual content of a PDF, Markdown, or text document. + + Always requests JSON output and returns the inline summary response defined in + the pdfRest API reference. 
+ """ payload: dict[str, Any] = { "files": file, - "redactions": redactions, + "target_word_count": target_word_count, + "summary_format": summary_format, + "output_format": output_format, + "output_type": "json", } + if pages is not None: + payload["pages"] = pages if output is not None: payload["output"] = output - return self._post_file_operation( - endpoint="/pdf-with-redacted-text-preview", - payload=payload, - payload_model=PdfRedactionPreviewPayload, + validated_payload = SummarizePdfTextPayload.model_validate(payload) + request = self.prepare_request( + "POST", + "/summarized-pdf-text", + json_body=validated_payload.model_dump( + mode="json", by_alias=True, exclude_none=True, exclude_unset=True + ), extra_query=extra_query, extra_headers=extra_headers, extra_body=extra_body, timeout=timeout, ) + raw_payload = self._send_request(request) + return SummarizePdfTextResponse.model_validate(raw_payload) - def apply_redactions( + def summarize_text_to_file( self, file: PdfRestFile | Sequence[PdfRestFile], *, - rgb_color: PdfRGBColor | Sequence[int] | None = None, + target_word_count: int = 400, + summary_format: SummaryFormat = "overview", + pages: PdfPageSelection | None = None, + output_format: SummaryOutputFormat = "markdown", output: str | None = None, extra_query: Query | None = None, extra_headers: AnyMapping | None = None, extra_body: Body | None = None, timeout: TimeoutTypes | None = None, ) -> PdfRestFileBasedResponse: - """Apply previously previewed redactions and return the final redacted PDF.""" + """Summarize a document and return the result as a downloadable file.""" payload: dict[str, Any] = { "files": file, + "target_word_count": target_word_count, + "summary_format": summary_format, + "output_format": output_format, + "output_type": "file", } - if rgb_color is not None: - payload["rgb_color"] = rgb_color + if pages is not None: + payload["pages"] = pages if output is not None: payload["output"] = output return self._post_file_operation( - 
endpoint="/pdf-with-redacted-text-applied", + endpoint="/summarized-pdf-text", payload=payload, - payload_model=PdfRedactionApplyPayload, + payload_model=SummarizePdfTextPayload, extra_query=extra_query, extra_headers=extra_headers, extra_body=extra_body, timeout=timeout, ) - def split_pdf( + def convert_to_markdown( self, file: PdfRestFile | Sequence[PdfRestFile], *, - page_groups: Sequence[PdfPageSelection] | PdfPageSelection | None = None, - output_prefix: str | None = None, + pages: PdfPageSelection | None = None, + page_break_comments: bool = False, + output: str | None = None, extra_query: Query | None = None, extra_headers: AnyMapping | None = None, extra_body: Body | None = None, timeout: TimeoutTypes | None = None, ) -> PdfRestFileBasedResponse: - """Split a PDF into one or more PDF files based on the provided page groups.""" + """Convert a PDF to Markdown and return a file-based response.""" - payload: dict[str, Any] = {"files": file} - if page_groups is not None: - payload["page_groups"] = page_groups - if output_prefix is not None: - payload["output_prefix"] = output_prefix + payload: dict[str, Any] = { + "files": file, + "output_type": "file", + "page_break_comments": page_break_comments, + } + if pages is not None: + payload["pages"] = pages + if output is not None: + payload["output"] = output return self._post_file_operation( - endpoint="/split-pdf", + endpoint="/markdown", payload=payload, - payload_model=PdfSplitPayload, + payload_model=ConvertToMarkdownPayload, extra_query=extra_query, extra_headers=extra_headers, extra_body=extra_body, timeout=timeout, ) - def merge_pdfs( + def ocr_pdf( self, - sources: Sequence[PdfMergeInput], + file: PdfRestFile | Sequence[PdfRestFile], *, - output_prefix: str | None = None, + languages: OcrLanguage | Sequence[OcrLanguage] = "English", + pages: PdfPageSelection | None = None, + output: str | None = None, extra_query: Query | None = None, extra_headers: AnyMapping | None = None, extra_body: Body | None = None, 
timeout: TimeoutTypes | None = None, ) -> PdfRestFileBasedResponse: - """Merge multiple PDFs (or page subsets) into a single PDF file.""" + """Perform OCR on a PDF to make text searchable and extractable.""" - payload: dict[str, Any] = {"sources": sources} - if output_prefix is not None: - payload["output_prefix"] = output_prefix + payload: dict[str, Any] = {"files": file, "languages": languages} + if pages is not None: + payload["pages"] = pages + if output is not None: + payload["output"] = output return self._post_file_operation( - endpoint="/merged-pdf", + endpoint="/pdf-with-ocr-text", payload=payload, - payload_model=PdfMergePayload, + payload_model=OcrPdfPayload, extra_query=extra_query, extra_headers=extra_headers, extra_body=extra_body, timeout=timeout, ) - def convert_to_word( + def translate_pdf_text( self, file: PdfRestFile | Sequence[PdfRestFile], *, + output_language: str, + pages: PdfPageSelection | None = None, + output_format: TranslateOutputFormat = "markdown", output: str | None = None, extra_query: Query | None = None, extra_headers: AnyMapping | None = None, extra_body: Body | None = None, timeout: TimeoutTypes | None = None, - ) -> PdfRestFileBasedResponse: - """Convert a PDF to a Word document.""" + ) -> TranslatePdfTextResponse: + """Translate the textual content of a PDF, Markdown, or text document (JSON).""" - payload: dict[str, Any] = {"files": file} + payload: dict[str, Any] = { + "files": file, + "output_language": output_language, + "output_format": output_format, + "output_type": "json", + } + if pages is not None: + payload["pages"] = pages if output is not None: payload["output"] = output - return self._post_file_operation( - endpoint="/word", - payload=payload, - payload_model=PdfToWordPayload, + validated_payload = TranslatePdfTextPayload.model_validate(payload) + request = self.prepare_request( + "POST", + "/translated-pdf-text", + json_body=validated_payload.model_dump( + mode="json", by_alias=True, exclude_none=True, 
exclude_unset=True + ), extra_query=extra_query, extra_headers=extra_headers, extra_body=extra_body, timeout=timeout, ) + raw_payload = self._send_request(request) + return TranslatePdfTextResponse.model_validate(raw_payload) - def flatten_pdf_forms( + def translate_pdf_text_to_file( self, file: PdfRestFile | Sequence[PdfRestFile], *, + output_language: str, + pages: PdfPageSelection | None = None, + output_format: TranslateOutputFormat = "markdown", output: str | None = None, extra_query: Query | None = None, extra_headers: AnyMapping | None = None, extra_body: Body | None = None, timeout: TimeoutTypes | None = None, - ) -> PdfRestFileBasedResponse: - """Flatten form fields in a PDF so they are no longer editable.""" + ) -> TranslatePdfTextFileResponse: + """Translate textual content and receive a file-based response.""" - payload: dict[str, Any] = {"files": file} + payload: dict[str, Any] = { + "files": file, + "output_language": output_language, + "output_format": output_format, + "output_type": "file", + } + if pages is not None: + payload["pages"] = pages if output is not None: payload["output"] = output return self._post_file_operation( - endpoint="/flattened-forms-pdf", + endpoint="/translated-pdf-text", payload=payload, - payload_model=PdfFlattenFormsPayload, + payload_model=TranslatePdfTextPayload, extra_query=extra_query, extra_headers=extra_headers, extra_body=extra_body, timeout=timeout, + response_model=TranslatePdfTextFileResponse, ) - def compress_pdf( + def extract_images( self, file: PdfRestFile | Sequence[PdfRestFile], *, - compression_level: Literal["low", "medium", "high", "custom"], - profile: PdfRestFile | Sequence[PdfRestFile] | None = None, + pages: PdfPageSelection | None = None, output: str | None = None, extra_query: Query | None = None, extra_headers: AnyMapping | None = None, extra_body: Body | None = None, timeout: TimeoutTypes | None = None, ) -> PdfRestFileBasedResponse: - """Compress a PDF using preset or custom compression 
profiles.""" + """Extract embedded images from a PDF.""" - payload: dict[str, Any] = { - "files": file, - "compression_level": compression_level, - } - if profile is not None: - payload["profile"] = profile + payload: dict[str, Any] = {"files": file} + if pages is not None: + payload["pages"] = pages if output is not None: payload["output"] = output return self._post_file_operation( - endpoint="/compressed-pdf", + endpoint="/extracted-images", payload=payload, - payload_model=PdfCompressPayload, + payload_model=ExtractImagesPayload, extra_query=extra_query, extra_headers=extra_headers, extra_body=extra_body, timeout=timeout, ) - def convert_to_pdfx( + def extract_pdf_text_to_file( self, file: PdfRestFile | Sequence[PdfRestFile], *, - output_type: PdfXType, + pages: PdfPageSelection | None = None, + full_text: ExtractTextGranularity = "document", + preserve_line_breaks: bool = False, + word_style: bool = False, + word_coordinates: bool = False, output: str | None = None, extra_query: Query | None = None, extra_headers: AnyMapping | None = None, extra_body: Body | None = None, timeout: TimeoutTypes | None = None, ) -> PdfRestFileBasedResponse: - """Convert a PDF to a specified PDF/X version.""" + """Extract text content from a PDF and return a file-based response.""" - payload: dict[str, Any] = {"files": file, "output_type": output_type} + payload: dict[str, Any] = { + "files": file, + "full_text": full_text, + "preserve_line_breaks": preserve_line_breaks, + "word_style": word_style, + "word_coordinates": word_coordinates, + "output_type": "file", + } + if pages is not None: + payload["pages"] = pages if output is not None: payload["output"] = output return self._post_file_operation( - endpoint="/pdfx", + endpoint="/extracted-text", payload=payload, - payload_model=PdfToPdfxPayload, + payload_model=ExtractTextPayload, extra_query=extra_query, extra_headers=extra_headers, extra_body=extra_body, timeout=timeout, ) - def convert_to_png( + def preview_redactions( self, - 
files: PdfRestFile | Sequence[PdfRestFile], + file: PdfRestFile | Sequence[PdfRestFile], *, - output_prefix: str | None = None, - page_range: str | Sequence[str] | None = None, - resolution: int = 300, - color_model: Literal["rgb", "rgba", "gray"] = "rgb", - smoothing: Literal["none", "all", "text", "line", "image"] - | Sequence[Literal["none", "all", "text", "line", "image"]] - | None = None, + redactions: PdfRedactionInstruction | Sequence[PdfRedactionInstruction], + output: str | None = None, extra_query: Query | None = None, extra_headers: AnyMapping | None = None, extra_body: Body | None = None, timeout: TimeoutTypes | None = None, ) -> PdfRestFileBasedResponse: - """Convert one or more pdfRest files to PNG images.""" + """Generate a PDF redaction preview with annotated redaction rectangles.""" payload: dict[str, Any] = { - "files": files, - "resolution": resolution, - "color_model": color_model, + "files": file, + "redactions": redactions, } - if output_prefix is not None: - payload["output_prefix"] = output_prefix - if page_range is not None: - payload["page_range"] = page_range - if smoothing is not None: - payload["smoothing"] = smoothing + if output is not None: + payload["output"] = output - return self._convert_to_graphic( - endpoint="/png", + return self._post_file_operation( + endpoint="/pdf-with-redacted-text-preview", payload=payload, - payload_model=PngPdfRestPayload, + payload_model=PdfRedactionPreviewPayload, extra_query=extra_query, extra_headers=extra_headers, extra_body=extra_body, timeout=timeout, ) - def convert_to_bmp( + def apply_redactions( self, - files: PdfRestFile | Sequence[PdfRestFile], + file: PdfRestFile | Sequence[PdfRestFile], *, - output_prefix: str | None = None, - page_range: str | Sequence[str] | None = None, - resolution: int = 300, - color_model: Literal["rgb", "gray"] = "rgb", - smoothing: Literal["none", "all", "text", "line", "image"] - | Sequence[Literal["none", "all", "text", "line", "image"]] - | None = None, + 
rgb_color: PdfRGBColor | Sequence[int] | None = None, + output: str | None = None, extra_query: Query | None = None, extra_headers: AnyMapping | None = None, extra_body: Body | None = None, timeout: TimeoutTypes | None = None, ) -> PdfRestFileBasedResponse: - """Convert one or more pdfRest files to BMP images.""" + """Apply previously previewed redactions and return the final redacted PDF.""" payload: dict[str, Any] = { - "files": files, - "resolution": resolution, - "color_model": color_model, + "files": file, } - if output_prefix is not None: - payload["output_prefix"] = output_prefix - if page_range is not None: - payload["page_range"] = page_range - if smoothing is not None: - payload["smoothing"] = smoothing + if rgb_color is not None: + payload["rgb_color"] = rgb_color + if output is not None: + payload["output"] = output - return self._convert_to_graphic( - endpoint="/bmp", + return self._post_file_operation( + endpoint="/pdf-with-redacted-text-applied", payload=payload, - payload_model=BmpPdfRestPayload, + payload_model=PdfRedactionApplyPayload, extra_query=extra_query, extra_headers=extra_headers, extra_body=extra_body, timeout=timeout, ) - def convert_to_gif( + def split_pdf( self, - files: PdfRestFile | Sequence[PdfRestFile], + file: PdfRestFile | Sequence[PdfRestFile], *, + page_groups: Sequence[PdfPageSelection] | PdfPageSelection | None = None, output_prefix: str | None = None, - page_range: str | Sequence[str] | None = None, - resolution: int = 300, - color_model: Literal["rgb", "gray"] = "rgb", - smoothing: Literal["none", "all", "text", "line", "image"] - | Sequence[Literal["none", "all", "text", "line", "image"]] - | None = None, extra_query: Query | None = None, extra_headers: AnyMapping | None = None, extra_body: Body | None = None, timeout: TimeoutTypes | None = None, ) -> PdfRestFileBasedResponse: - """Convert one or more pdfRest files to GIF images.""" + """Split a PDF into one or more PDF files based on the provided page groups.""" - 
payload: dict[str, Any] = { - "files": files, - "resolution": resolution, - "color_model": color_model, - } + payload: dict[str, Any] = {"files": file} + if page_groups is not None: + payload["page_groups"] = page_groups if output_prefix is not None: payload["output_prefix"] = output_prefix - if page_range is not None: - payload["page_range"] = page_range - if smoothing is not None: - payload["smoothing"] = smoothing - return self._convert_to_graphic( - endpoint="/gif", + return self._post_file_operation( + endpoint="/split-pdf", payload=payload, - payload_model=GifPdfRestPayload, + payload_model=PdfSplitPayload, extra_query=extra_query, extra_headers=extra_headers, extra_body=extra_body, timeout=timeout, ) - def convert_to_jpeg( + def merge_pdfs( self, - files: PdfRestFile | Sequence[PdfRestFile], + sources: Sequence[PdfMergeInput], *, output_prefix: str | None = None, - page_range: str | Sequence[str] | None = None, - resolution: int = 300, - color_model: Literal["rgb", "cmyk", "gray"] = "rgb", - smoothing: Literal["none", "all", "text", "line", "image"] - | Sequence[Literal["none", "all", "text", "line", "image"]] - | None = None, - jpeg_quality: int | None = None, extra_query: Query | None = None, extra_headers: AnyMapping | None = None, extra_body: Body | None = None, timeout: TimeoutTypes | None = None, ) -> PdfRestFileBasedResponse: - """Convert one or more pdfRest files to JPEG images.""" + """Merge multiple PDFs (or page subsets) into a single PDF file.""" - payload: dict[str, Any] = { - "files": files, - "resolution": resolution, - "color_model": color_model, - } + payload: dict[str, Any] = {"sources": sources} if output_prefix is not None: payload["output_prefix"] = output_prefix - if page_range is not None: - payload["page_range"] = page_range - if smoothing is not None: - payload["smoothing"] = smoothing - if jpeg_quality is not None: - payload["jpeg_quality"] = jpeg_quality - return self._convert_to_graphic( - endpoint="/jpg", + return 
self._post_file_operation( + endpoint="/merged-pdf", payload=payload, - payload_model=JpegPdfRestPayload, + payload_model=PdfMergePayload, extra_query=extra_query, extra_headers=extra_headers, extra_body=extra_body, timeout=timeout, ) - def convert_to_tiff( + def convert_to_excel( self, - files: PdfRestFile | Sequence[PdfRestFile], + file: PdfRestFile | Sequence[PdfRestFile], *, - output_prefix: str | None = None, - page_range: str | Sequence[str] | None = None, - resolution: int = 300, - color_model: Literal["rgb", "rgba", "cmyk", "lab", "gray"] = "rgb", - smoothing: Literal["none", "all", "text", "line", "image"] - | Sequence[Literal["none", "all", "text", "line", "image"]] - | None = None, + output: str | None = None, extra_query: Query | None = None, extra_headers: AnyMapping | None = None, extra_body: Body | None = None, timeout: TimeoutTypes | None = None, ) -> PdfRestFileBasedResponse: - """Convert one or more pdfRest files to TIFF images.""" + """Convert a PDF to an Excel spreadsheet.""" - payload: dict[str, Any] = { - "files": files, - "resolution": resolution, - "color_model": color_model, - } - if output_prefix is not None: - payload["output_prefix"] = output_prefix - if page_range is not None: - payload["page_range"] = page_range - if smoothing is not None: - payload["smoothing"] = smoothing + payload: dict[str, Any] = {"files": file} + if output is not None: + payload["output"] = output - return self._convert_to_graphic( - endpoint="/tif", + return self._post_file_operation( + endpoint="/excel", payload=payload, - payload_model=TiffPdfRestPayload, + payload_model=PdfToExcelPayload, extra_query=extra_query, extra_headers=extra_headers, extra_body=extra_body, timeout=timeout, ) - -class AsyncPdfRestClient(_AsyncApiClient): - """Asynchronous client for interacting with the pdfrest API.""" - - def __init__( + def convert_to_powerpoint( + self, + file: PdfRestFile | Sequence[PdfRestFile], + *, + output: str | None = None, + extra_query: Query | None = None, 
+ extra_headers: AnyMapping | None = None, + extra_body: Body | None = None, + timeout: TimeoutTypes | None = None, + ) -> PdfRestFileBasedResponse: + """Convert a PDF to a PowerPoint presentation.""" + + payload: dict[str, Any] = {"files": file} + if output is not None: + payload["output"] = output + + return self._post_file_operation( + endpoint="/powerpoint", + payload=payload, + payload_model=PdfToPowerpointPayload, + extra_query=extra_query, + extra_headers=extra_headers, + extra_body=extra_body, + timeout=timeout, + ) + + def convert_xfa_to_acroforms( + self, + file: PdfRestFile | Sequence[PdfRestFile], + *, + output: str | None = None, + extra_query: Query | None = None, + extra_headers: AnyMapping | None = None, + extra_body: Body | None = None, + timeout: TimeoutTypes | None = None, + ) -> PdfRestFileBasedResponse: + """Convert an XFA PDF to an AcroForm-enabled PDF.""" + + payload: dict[str, Any] = {"files": file} + if output is not None: + payload["output"] = output + + return self._post_file_operation( + endpoint="/pdf-with-acroforms", + payload=payload, + payload_model=PdfXfaToAcroformsPayload, + extra_query=extra_query, + extra_headers=extra_headers, + extra_body=extra_body, + timeout=timeout, + ) + + def convert_to_word( + self, + file: PdfRestFile | Sequence[PdfRestFile], + *, + output: str | None = None, + extra_query: Query | None = None, + extra_headers: AnyMapping | None = None, + extra_body: Body | None = None, + timeout: TimeoutTypes | None = None, + ) -> PdfRestFileBasedResponse: + """Convert a PDF to a Word document.""" + + payload: dict[str, Any] = {"files": file} + if output is not None: + payload["output"] = output + + return self._post_file_operation( + endpoint="/word", + payload=payload, + payload_model=PdfToWordPayload, + extra_query=extra_query, + extra_headers=extra_headers, + extra_body=extra_body, + timeout=timeout, + ) + + def flatten_pdf_forms( + self, + file: PdfRestFile | Sequence[PdfRestFile], + *, + output: str | None = None, 
+ extra_query: Query | None = None, + extra_headers: AnyMapping | None = None, + extra_body: Body | None = None, + timeout: TimeoutTypes | None = None, + ) -> PdfRestFileBasedResponse: + """Flatten form fields in a PDF so they are no longer editable.""" + + payload: dict[str, Any] = {"files": file} + if output is not None: + payload["output"] = output + + return self._post_file_operation( + endpoint="/flattened-forms-pdf", + payload=payload, + payload_model=PdfFlattenFormsPayload, + extra_query=extra_query, + extra_headers=extra_headers, + extra_body=extra_body, + timeout=timeout, + ) + + def compress_pdf( + self, + file: PdfRestFile | Sequence[PdfRestFile], + *, + compression_level: CompressionLevel, + profile: PdfRestFile | Sequence[PdfRestFile] | None = None, + output: str | None = None, + extra_query: Query | None = None, + extra_headers: AnyMapping | None = None, + extra_body: Body | None = None, + timeout: TimeoutTypes | None = None, + ) -> PdfRestFileBasedResponse: + """Compress a PDF using preset or custom compression profiles.""" + + payload: dict[str, Any] = { + "files": file, + "compression_level": compression_level, + } + if profile is not None: + payload["profile"] = profile + if output is not None: + payload["output"] = output + + return self._post_file_operation( + endpoint="/compressed-pdf", + payload=payload, + payload_model=PdfCompressPayload, + extra_query=extra_query, + extra_headers=extra_headers, + extra_body=extra_body, + timeout=timeout, + ) + + def convert_colors( + self, + file: PdfRestFile | Sequence[PdfRestFile], + *, + color_profile: PdfColorProfile, + profile: PdfRestFile | Sequence[PdfRestFile] | None = None, + preserve_black: bool = False, + output: str | None = None, + extra_query: Query | None = None, + extra_headers: AnyMapping | None = None, + extra_body: Body | None = None, + timeout: TimeoutTypes | None = None, + ) -> PdfRestFileBasedResponse: + """Convert PDF colors using preset or custom ICC profiles.""" + + payload: 
dict[str, Any] = { + "files": file, + "color_profile": color_profile, + "preserve_black": preserve_black, + } + if profile is not None: + payload["profile"] = profile + if output is not None: + payload["output"] = output + + return self._post_file_operation( + endpoint="/pdf-with-converted-colors", + payload=payload, + payload_model=PdfConvertColorsPayload, + extra_query=extra_query, + extra_headers=extra_headers, + extra_body=extra_body, + timeout=timeout, + ) + + def blank_pdf( + self, + *, + page_size: PdfPageSize, + page_count: int, + page_orientation: PdfPageOrientation | None = None, + custom_height: float | None = None, + custom_width: float | None = None, + output: str | None = None, + extra_query: Query | None = None, + extra_headers: AnyMapping | None = None, + extra_body: Body | None = None, + timeout: TimeoutTypes | None = None, + ) -> PdfRestFileBasedResponse: + """Create a blank PDF with the specified size, count, and orientation.""" + + payload: dict[str, Any] = { + "page_size": page_size, + "page_count": page_count, + } + if page_orientation is not None: + payload["page_orientation"] = page_orientation + if custom_height is not None: + payload["custom_height"] = custom_height + if custom_width is not None: + payload["custom_width"] = custom_width + if output is not None: + payload["output"] = output + + return self._post_file_operation( + endpoint="/blank-pdf", + payload=payload, + payload_model=PdfBlankPayload, + extra_query=extra_query, + extra_headers=extra_headers, + extra_body=extra_body, + timeout=timeout, + ) + + def flatten_transparencies( + self, + file: PdfRestFile | Sequence[PdfRestFile], + *, + output: str | None = None, + quality: FlattenQuality = "medium", + extra_query: Query | None = None, + extra_headers: AnyMapping | None = None, + extra_body: Body | None = None, + timeout: TimeoutTypes | None = None, + ) -> PdfRestFileBasedResponse: + """Flatten transparent objects in a PDF.""" + + payload: dict[str, Any] = {"files": file, 
"quality": quality} + if output is not None: + payload["output"] = output + + return self._post_file_operation( + endpoint="/flattened-transparencies-pdf", + payload=payload, + payload_model=PdfFlattenTransparenciesPayload, + extra_query=extra_query, + extra_headers=extra_headers, + extra_body=extra_body, + timeout=timeout, + ) + + def linearize_pdf( + self, + file: PdfRestFile | Sequence[PdfRestFile], + *, + output: str | None = None, + extra_query: Query | None = None, + extra_headers: AnyMapping | None = None, + extra_body: Body | None = None, + timeout: TimeoutTypes | None = None, + ) -> PdfRestFileBasedResponse: + """Linearize a PDF for optimized fast web view.""" + + payload: dict[str, Any] = {"files": file} + if output is not None: + payload["output"] = output + + return self._post_file_operation( + endpoint="/linearized-pdf", + payload=payload, + payload_model=PdfLinearizePayload, + extra_query=extra_query, + extra_headers=extra_headers, + extra_body=extra_body, + timeout=timeout, + ) + + def flatten_annotations( + self, + file: PdfRestFile | Sequence[PdfRestFile], + *, + output: str | None = None, + extra_query: Query | None = None, + extra_headers: AnyMapping | None = None, + extra_body: Body | None = None, + timeout: TimeoutTypes | None = None, + ) -> PdfRestFileBasedResponse: + """Flatten annotations into the PDF content.""" + + payload: dict[str, Any] = {"files": file} + if output is not None: + payload["output"] = output + + return self._post_file_operation( + endpoint="/flattened-annotations-pdf", + payload=payload, + payload_model=PdfFlattenAnnotationsPayload, + extra_query=extra_query, + extra_headers=extra_headers, + extra_body=extra_body, + timeout=timeout, + ) + + def flatten_layers( + self, + file: PdfRestFile | Sequence[PdfRestFile], + *, + output: str | None = None, + extra_query: Query | None = None, + extra_headers: AnyMapping | None = None, + extra_body: Body | None = None, + timeout: TimeoutTypes | None = None, + ) -> 
PdfRestFileBasedResponse: + """Flatten all layers in a PDF into a single layer.""" + + payload: dict[str, Any] = {"files": file} + if output is not None: + payload["output"] = output + + return self._post_file_operation( + endpoint="/flattened-layers-pdf", + payload=payload, + payload_model=PdfFlattenLayersPayload, + extra_query=extra_query, + extra_headers=extra_headers, + extra_body=extra_body, + timeout=timeout, + ) + + def rasterize_pdf( + self, + file: PdfRestFile | Sequence[PdfRestFile], + *, + output: str | None = None, + extra_query: Query | None = None, + extra_headers: AnyMapping | None = None, + extra_body: Body | None = None, + timeout: TimeoutTypes | None = None, + ) -> PdfRestFileBasedResponse: + """Rasterize a PDF into a flattened bitmap-based PDF.""" + + payload: dict[str, Any] = {"files": file} + if output is not None: + payload["output"] = output + + return self._post_file_operation( + endpoint="/rasterized-pdf", + payload=payload, + payload_model=PdfRasterizePayload, + extra_query=extra_query, + extra_headers=extra_headers, + extra_body=extra_body, + timeout=timeout, + ) + + def convert_to_pdfa( + self, + file: PdfRestFile | Sequence[PdfRestFile], + *, + output_type: PdfAType, + output: str | None = None, + rasterize_if_errors_encountered: bool = False, + extra_query: Query | None = None, + extra_headers: AnyMapping | None = None, + extra_body: Body | None = None, + timeout: TimeoutTypes | None = None, + ) -> PdfRestFileBasedResponse: + """Convert a PDF to a specified PDF/A version.""" + + payload: dict[str, Any] = { + "files": file, + "output_type": output_type, + "rasterize_if_errors_encountered": rasterize_if_errors_encountered, + } + if output is not None: + payload["output"] = output + return self._post_file_operation( + endpoint="/pdfa", + payload=payload, + payload_model=PdfToPdfaPayload, + extra_query=extra_query, + extra_headers=extra_headers, + extra_body=extra_body, + timeout=timeout, + ) + + def convert_to_pdfx( + self, + file: 
PdfRestFile | Sequence[PdfRestFile], + *, + output_type: PdfXType, + output: str | None = None, + extra_query: Query | None = None, + extra_headers: AnyMapping | None = None, + extra_body: Body | None = None, + timeout: TimeoutTypes | None = None, + ) -> PdfRestFileBasedResponse: + """Convert a PDF to a specified PDF/X version.""" + + payload: dict[str, Any] = {"files": file, "output_type": output_type} + if output is not None: + payload["output"] = output + + return self._post_file_operation( + endpoint="/pdfx", + payload=payload, + payload_model=PdfToPdfxPayload, + extra_query=extra_query, + extra_headers=extra_headers, + extra_body=extra_body, + timeout=timeout, + ) + + def convert_to_png( + self, + files: PdfRestFile | Sequence[PdfRestFile], + *, + output_prefix: str | None = None, + page_range: str | Sequence[str] | None = None, + resolution: int = 300, + color_model: PngColorModel = "rgb", + smoothing: GraphicSmoothing | Sequence[GraphicSmoothing] | None = None, + extra_query: Query | None = None, + extra_headers: AnyMapping | None = None, + extra_body: Body | None = None, + timeout: TimeoutTypes | None = None, + ) -> PdfRestFileBasedResponse: + """Convert one or more pdfRest files to PNG images.""" + + payload: dict[str, Any] = { + "files": files, + "resolution": resolution, + "color_model": color_model, + } + if output_prefix is not None: + payload["output_prefix"] = output_prefix + if page_range is not None: + payload["page_range"] = page_range + if smoothing is not None: + payload["smoothing"] = smoothing + + return self._convert_to_graphic( + endpoint="/png", + payload=payload, + payload_model=PngPdfRestPayload, + extra_query=extra_query, + extra_headers=extra_headers, + extra_body=extra_body, + timeout=timeout, + ) + + def convert_to_bmp( + self, + files: PdfRestFile | Sequence[PdfRestFile], + *, + output_prefix: str | None = None, + page_range: str | Sequence[str] | None = None, + resolution: int = 300, + color_model: BmpColorModel = "rgb", + 
smoothing: GraphicSmoothing | Sequence[GraphicSmoothing] | None = None, + extra_query: Query | None = None, + extra_headers: AnyMapping | None = None, + extra_body: Body | None = None, + timeout: TimeoutTypes | None = None, + ) -> PdfRestFileBasedResponse: + """Convert one or more pdfRest files to BMP images.""" + + payload: dict[str, Any] = { + "files": files, + "resolution": resolution, + "color_model": color_model, + } + if output_prefix is not None: + payload["output_prefix"] = output_prefix + if page_range is not None: + payload["page_range"] = page_range + if smoothing is not None: + payload["smoothing"] = smoothing + + return self._convert_to_graphic( + endpoint="/bmp", + payload=payload, + payload_model=BmpPdfRestPayload, + extra_query=extra_query, + extra_headers=extra_headers, + extra_body=extra_body, + timeout=timeout, + ) + + def convert_to_gif( + self, + files: PdfRestFile | Sequence[PdfRestFile], + *, + output_prefix: str | None = None, + page_range: str | Sequence[str] | None = None, + resolution: int = 300, + color_model: GifColorModel = "rgb", + smoothing: GraphicSmoothing | Sequence[GraphicSmoothing] | None = None, + extra_query: Query | None = None, + extra_headers: AnyMapping | None = None, + extra_body: Body | None = None, + timeout: TimeoutTypes | None = None, + ) -> PdfRestFileBasedResponse: + """Convert one or more pdfRest files to GIF images.""" + + payload: dict[str, Any] = { + "files": files, + "resolution": resolution, + "color_model": color_model, + } + if output_prefix is not None: + payload["output_prefix"] = output_prefix + if page_range is not None: + payload["page_range"] = page_range + if smoothing is not None: + payload["smoothing"] = smoothing + + return self._convert_to_graphic( + endpoint="/gif", + payload=payload, + payload_model=GifPdfRestPayload, + extra_query=extra_query, + extra_headers=extra_headers, + extra_body=extra_body, + timeout=timeout, + ) + + def convert_to_jpeg( + self, + files: PdfRestFile | 
Sequence[PdfRestFile], + *, + output_prefix: str | None = None, + page_range: str | Sequence[str] | None = None, + resolution: int = 300, + color_model: JpegColorModel = "rgb", + smoothing: GraphicSmoothing | Sequence[GraphicSmoothing] | None = None, + jpeg_quality: int = 75, + extra_query: Query | None = None, + extra_headers: AnyMapping | None = None, + extra_body: Body | None = None, + timeout: TimeoutTypes | None = None, + ) -> PdfRestFileBasedResponse: + """Convert one or more pdfRest files to JPEG images.""" + + payload: dict[str, Any] = { + "files": files, + "resolution": resolution, + "color_model": color_model, + "jpeg_quality": jpeg_quality, + } + if output_prefix is not None: + payload["output_prefix"] = output_prefix + if page_range is not None: + payload["page_range"] = page_range + if smoothing is not None: + payload["smoothing"] = smoothing + + return self._convert_to_graphic( + endpoint="/jpg", + payload=payload, + payload_model=JpegPdfRestPayload, + extra_query=extra_query, + extra_headers=extra_headers, + extra_body=extra_body, + timeout=timeout, + ) + + def convert_to_tiff( + self, + files: PdfRestFile | Sequence[PdfRestFile], + *, + output_prefix: str | None = None, + page_range: str | Sequence[str] | None = None, + resolution: int = 300, + color_model: TiffColorModel = "rgb", + smoothing: GraphicSmoothing | Sequence[GraphicSmoothing] | None = None, + extra_query: Query | None = None, + extra_headers: AnyMapping | None = None, + extra_body: Body | None = None, + timeout: TimeoutTypes | None = None, + ) -> PdfRestFileBasedResponse: + """Convert one or more pdfRest files to TIFF images.""" + + payload: dict[str, Any] = { + "files": files, + "resolution": resolution, + "color_model": color_model, + } + if output_prefix is not None: + payload["output_prefix"] = output_prefix + if page_range is not None: + payload["page_range"] = page_range + if smoothing is not None: + payload["smoothing"] = smoothing + + return self._convert_to_graphic( + 
endpoint="/tif", + payload=payload, + payload_model=TiffPdfRestPayload, + extra_query=extra_query, + extra_headers=extra_headers, + extra_body=extra_body, + timeout=timeout, + ) + + +class AsyncPdfRestClient(_AsyncApiClient): + """Asynchronous client for interacting with the pdfrest API.""" + + def __init__( self, *, api_key: str | None = None, @@ -2603,8 +3247,304 @@ async def query_pdf_info( extra_body=extra_body, timeout=timeout, ) - raw_payload = await self._send_request(request) - return PdfRestInfoResponse.model_validate(raw_payload) + raw_payload = await self._send_request(request) + return PdfRestInfoResponse.model_validate(raw_payload) + + async def summarize_text( + self, + file: PdfRestFile | Sequence[PdfRestFile], + *, + target_word_count: int = 400, + summary_format: SummaryFormat = "overview", + pages: PdfPageSelection | None = None, + output_format: SummaryOutputFormat = "markdown", + output: str | None = None, + extra_query: Query | None = None, + extra_headers: AnyMapping | None = None, + extra_body: Body | None = None, + timeout: TimeoutTypes | None = None, + ) -> SummarizePdfTextResponse: + """Summarize the textual content of a PDF, Markdown, or text document. + + Always requests JSON output and returns the inline summary response defined in + the pdfRest API reference. 
+ """ + + payload: dict[str, Any] = { + "files": file, + "target_word_count": target_word_count, + "summary_format": summary_format, + "output_format": output_format, + "output_type": "json", + } + if pages is not None: + payload["pages"] = pages + if output is not None: + payload["output"] = output + + validated_payload = SummarizePdfTextPayload.model_validate(payload) + request = self.prepare_request( + "POST", + "/summarized-pdf-text", + json_body=validated_payload.model_dump( + mode="json", by_alias=True, exclude_none=True, exclude_unset=True + ), + extra_query=extra_query, + extra_headers=extra_headers, + extra_body=extra_body, + timeout=timeout, + ) + raw_payload = await self._send_request(request) + return SummarizePdfTextResponse.model_validate(raw_payload) + + async def summarize_text_to_file( + self, + file: PdfRestFile | Sequence[PdfRestFile], + *, + target_word_count: int = 400, + summary_format: SummaryFormat = "overview", + pages: PdfPageSelection | None = None, + output_format: SummaryOutputFormat = "markdown", + output: str | None = None, + extra_query: Query | None = None, + extra_headers: AnyMapping | None = None, + extra_body: Body | None = None, + timeout: TimeoutTypes | None = None, + ) -> PdfRestFileBasedResponse: + """Summarize a document and return the result as a downloadable file.""" + + payload: dict[str, Any] = { + "files": file, + "target_word_count": target_word_count, + "summary_format": summary_format, + "output_format": output_format, + "output_type": "file", + } + if pages is not None: + payload["pages"] = pages + if output is not None: + payload["output"] = output + + return await self._post_file_operation( + endpoint="/summarized-pdf-text", + payload=payload, + payload_model=SummarizePdfTextPayload, + extra_query=extra_query, + extra_headers=extra_headers, + extra_body=extra_body, + timeout=timeout, + ) + + async def convert_to_markdown( + self, + file: PdfRestFile | Sequence[PdfRestFile], + *, + pages: PdfPageSelection | None = 
None, + page_break_comments: bool = False, + output: str | None = None, + extra_query: Query | None = None, + extra_headers: AnyMapping | None = None, + extra_body: Body | None = None, + timeout: TimeoutTypes | None = None, + ) -> PdfRestFileBasedResponse: + """Convert a PDF to Markdown and return a file-based response.""" + + payload: dict[str, Any] = { + "files": file, + "output_type": "file", + "page_break_comments": page_break_comments, + } + if pages is not None: + payload["pages"] = pages + if output is not None: + payload["output"] = output + + return await self._post_file_operation( + endpoint="/markdown", + payload=payload, + payload_model=ConvertToMarkdownPayload, + extra_query=extra_query, + extra_headers=extra_headers, + extra_body=extra_body, + timeout=timeout, + ) + + async def ocr_pdf( + self, + file: PdfRestFile | Sequence[PdfRestFile], + *, + languages: OcrLanguage | Sequence[OcrLanguage] = "English", + pages: PdfPageSelection | None = None, + output: str | None = None, + extra_query: Query | None = None, + extra_headers: AnyMapping | None = None, + extra_body: Body | None = None, + timeout: TimeoutTypes | None = None, + ) -> PdfRestFileBasedResponse: + """Perform OCR on a PDF to make text searchable and extractable.""" + + payload: dict[str, Any] = {"files": file, "languages": languages} + if pages is not None: + payload["pages"] = pages + if output is not None: + payload["output"] = output + + return await self._post_file_operation( + endpoint="/pdf-with-ocr-text", + payload=payload, + payload_model=OcrPdfPayload, + extra_query=extra_query, + extra_headers=extra_headers, + extra_body=extra_body, + timeout=timeout, + ) + + async def translate_pdf_text( + self, + file: PdfRestFile | Sequence[PdfRestFile], + *, + output_language: str, + pages: PdfPageSelection | None = None, + output_format: TranslateOutputFormat = "markdown", + output: str | None = None, + extra_query: Query | None = None, + extra_headers: AnyMapping | None = None, + extra_body: 
Body | None = None, + timeout: TimeoutTypes | None = None, + ) -> TranslatePdfTextResponse: + """Translate the textual content of a PDF, Markdown, or text document (JSON).""" + + payload: dict[str, Any] = { + "files": file, + "output_language": output_language, + "output_format": output_format, + "output_type": "json", + } + if pages is not None: + payload["pages"] = pages + if output is not None: + payload["output"] = output + + validated_payload = TranslatePdfTextPayload.model_validate(payload) + request = self.prepare_request( + "POST", + "/translated-pdf-text", + json_body=validated_payload.model_dump( + mode="json", by_alias=True, exclude_none=True, exclude_unset=True + ), + extra_query=extra_query, + extra_headers=extra_headers, + extra_body=extra_body, + timeout=timeout, + ) + raw_payload = await self._send_request(request) + return TranslatePdfTextResponse.model_validate(raw_payload) + + async def translate_pdf_text_to_file( + self, + file: PdfRestFile | Sequence[PdfRestFile], + *, + output_language: str, + pages: PdfPageSelection | None = None, + output_format: TranslateOutputFormat = "markdown", + output: str | None = None, + extra_query: Query | None = None, + extra_headers: AnyMapping | None = None, + extra_body: Body | None = None, + timeout: TimeoutTypes | None = None, + ) -> TranslatePdfTextFileResponse: + """Translate textual content and receive a file-based response.""" + + payload: dict[str, Any] = { + "files": file, + "output_language": output_language, + "output_format": output_format, + "output_type": "file", + } + if pages is not None: + payload["pages"] = pages + if output is not None: + payload["output"] = output + + return await self._post_file_operation( + endpoint="/translated-pdf-text", + payload=payload, + payload_model=TranslatePdfTextPayload, + extra_query=extra_query, + extra_headers=extra_headers, + extra_body=extra_body, + timeout=timeout, + response_model=TranslatePdfTextFileResponse, + ) + + async def extract_images( + self, + 
file: PdfRestFile | Sequence[PdfRestFile], + *, + pages: PdfPageSelection | None = None, + output: str | None = None, + extra_query: Query | None = None, + extra_headers: AnyMapping | None = None, + extra_body: Body | None = None, + timeout: TimeoutTypes | None = None, + ) -> PdfRestFileBasedResponse: + """Extract embedded images from a PDF.""" + + payload: dict[str, Any] = {"files": file} + if pages is not None: + payload["pages"] = pages + if output is not None: + payload["output"] = output + + return await self._post_file_operation( + endpoint="/extracted-images", + payload=payload, + payload_model=ExtractImagesPayload, + extra_query=extra_query, + extra_headers=extra_headers, + extra_body=extra_body, + timeout=timeout, + ) + + async def extract_pdf_text_to_file( + self, + file: PdfRestFile | Sequence[PdfRestFile], + *, + pages: PdfPageSelection | None = None, + full_text: ExtractTextGranularity = "document", + preserve_line_breaks: bool = False, + word_style: bool = False, + word_coordinates: bool = False, + output: str | None = None, + extra_query: Query | None = None, + extra_headers: AnyMapping | None = None, + extra_body: Body | None = None, + timeout: TimeoutTypes | None = None, + ) -> PdfRestFileBasedResponse: + """Extract text content from a PDF and return a file-based response.""" + + payload: dict[str, Any] = { + "files": file, + "full_text": full_text, + "preserve_line_breaks": preserve_line_breaks, + "word_style": word_style, + "word_coordinates": word_coordinates, + "output_type": "file", + } + if pages is not None: + payload["pages"] = pages + if output is not None: + payload["output"] = output + + return await self._post_file_operation( + endpoint="/extracted-text", + payload=payload, + payload_model=ExtractTextPayload, + extra_query=extra_query, + extra_headers=extra_headers, + extra_body=extra_body, + timeout=timeout, + ) async def preview_redactions( self, @@ -2764,6 +3704,84 @@ async def merge_pdfs( timeout=timeout, ) + async def 
convert_to_excel( + self, + file: PdfRestFile | Sequence[PdfRestFile], + *, + output: str | None = None, + extra_query: Query | None = None, + extra_headers: AnyMapping | None = None, + extra_body: Body | None = None, + timeout: TimeoutTypes | None = None, + ) -> PdfRestFileBasedResponse: + """Asynchronously convert a PDF to an Excel spreadsheet.""" + + payload: dict[str, Any] = {"files": file} + if output is not None: + payload["output"] = output + + return await self._post_file_operation( + endpoint="/excel", + payload=payload, + payload_model=PdfToExcelPayload, + extra_query=extra_query, + extra_headers=extra_headers, + extra_body=extra_body, + timeout=timeout, + ) + + async def convert_to_powerpoint( + self, + file: PdfRestFile | Sequence[PdfRestFile], + *, + output: str | None = None, + extra_query: Query | None = None, + extra_headers: AnyMapping | None = None, + extra_body: Body | None = None, + timeout: TimeoutTypes | None = None, + ) -> PdfRestFileBasedResponse: + """Asynchronously convert a PDF to a PowerPoint presentation.""" + + payload: dict[str, Any] = {"files": file} + if output is not None: + payload["output"] = output + + return await self._post_file_operation( + endpoint="/powerpoint", + payload=payload, + payload_model=PdfToPowerpointPayload, + extra_query=extra_query, + extra_headers=extra_headers, + extra_body=extra_body, + timeout=timeout, + ) + + async def convert_xfa_to_acroforms( + self, + file: PdfRestFile | Sequence[PdfRestFile], + *, + output: str | None = None, + extra_query: Query | None = None, + extra_headers: AnyMapping | None = None, + extra_body: Body | None = None, + timeout: TimeoutTypes | None = None, + ) -> PdfRestFileBasedResponse: + """Asynchronously convert an XFA PDF to an AcroForm-enabled PDF.""" + + payload: dict[str, Any] = {"files": file} + if output is not None: + payload["output"] = output + + return await self._post_file_operation( + endpoint="/pdf-with-acroforms", + payload=payload, + 
payload_model=PdfXfaToAcroformsPayload, + extra_query=extra_query, + extra_headers=extra_headers, + extra_body=extra_body, + timeout=timeout, + ) + async def convert_to_word( self, file: PdfRestFile | Sequence[PdfRestFile], @@ -2820,7 +3838,7 @@ async def compress_pdf( self, file: PdfRestFile | Sequence[PdfRestFile], *, - compression_level: Literal["low", "medium", "high", "custom"], + compression_level: CompressionLevel, profile: PdfRestFile | Sequence[PdfRestFile] | None = None, output: str | None = None, extra_query: Query | None = None, @@ -2849,6 +3867,243 @@ async def compress_pdf( timeout=timeout, ) + async def convert_colors( + self, + file: PdfRestFile | Sequence[PdfRestFile], + *, + color_profile: PdfColorProfile, + profile: PdfRestFile | Sequence[PdfRestFile] | None = None, + preserve_black: bool = False, + output: str | None = None, + extra_query: Query | None = None, + extra_headers: AnyMapping | None = None, + extra_body: Body | None = None, + timeout: TimeoutTypes | None = None, + ) -> PdfRestFileBasedResponse: + """Asynchronously convert PDF colors using preset or custom ICC profiles.""" + + payload: dict[str, Any] = { + "files": file, + "color_profile": color_profile, + "preserve_black": preserve_black, + } + if profile is not None: + payload["profile"] = profile + if output is not None: + payload["output"] = output + + return await self._post_file_operation( + endpoint="/pdf-with-converted-colors", + payload=payload, + payload_model=PdfConvertColorsPayload, + extra_query=extra_query, + extra_headers=extra_headers, + extra_body=extra_body, + timeout=timeout, + ) + + async def blank_pdf( + self, + *, + page_size: PdfPageSize, + page_count: int, + page_orientation: PdfPageOrientation | None = None, + custom_height: float | None = None, + custom_width: float | None = None, + output: str | None = None, + extra_query: Query | None = None, + extra_headers: AnyMapping | None = None, + extra_body: Body | None = None, + timeout: TimeoutTypes | None = None, 
+ ) -> PdfRestFileBasedResponse: + """Asynchronously create a blank PDF with the specified size.""" + + payload: dict[str, Any] = { + "page_size": page_size, + "page_count": page_count, + } + if page_orientation is not None: + payload["page_orientation"] = page_orientation + if custom_height is not None: + payload["custom_height"] = custom_height + if custom_width is not None: + payload["custom_width"] = custom_width + if output is not None: + payload["output"] = output + + return await self._post_file_operation( + endpoint="/blank-pdf", + payload=payload, + payload_model=PdfBlankPayload, + extra_query=extra_query, + extra_headers=extra_headers, + extra_body=extra_body, + timeout=timeout, + ) + + async def flatten_transparencies( + self, + file: PdfRestFile | Sequence[PdfRestFile], + *, + output: str | None = None, + quality: FlattenQuality = "medium", + extra_query: Query | None = None, + extra_headers: AnyMapping | None = None, + extra_body: Body | None = None, + timeout: TimeoutTypes | None = None, + ) -> PdfRestFileBasedResponse: + """Asynchronously flatten transparent objects in a PDF.""" + + payload: dict[str, Any] = {"files": file, "quality": quality} + if output is not None: + payload["output"] = output + + return await self._post_file_operation( + endpoint="/flattened-transparencies-pdf", + payload=payload, + payload_model=PdfFlattenTransparenciesPayload, + extra_query=extra_query, + extra_headers=extra_headers, + extra_body=extra_body, + timeout=timeout, + ) + + async def linearize_pdf( + self, + file: PdfRestFile | Sequence[PdfRestFile], + *, + output: str | None = None, + extra_query: Query | None = None, + extra_headers: AnyMapping | None = None, + extra_body: Body | None = None, + timeout: TimeoutTypes | None = None, + ) -> PdfRestFileBasedResponse: + """Asynchronously linearize a PDF for optimized fast web view.""" + + payload: dict[str, Any] = {"files": file} + if output is not None: + payload["output"] = output + + return await 
self._post_file_operation( + endpoint="/linearized-pdf", + payload=payload, + payload_model=PdfLinearizePayload, + extra_query=extra_query, + extra_headers=extra_headers, + extra_body=extra_body, + timeout=timeout, + ) + + async def flatten_annotations( + self, + file: PdfRestFile | Sequence[PdfRestFile], + *, + output: str | None = None, + extra_query: Query | None = None, + extra_headers: AnyMapping | None = None, + extra_body: Body | None = None, + timeout: TimeoutTypes | None = None, + ) -> PdfRestFileBasedResponse: + """Asynchronously flatten annotations into the PDF content.""" + + payload: dict[str, Any] = {"files": file} + if output is not None: + payload["output"] = output + + return await self._post_file_operation( + endpoint="/flattened-annotations-pdf", + payload=payload, + payload_model=PdfFlattenAnnotationsPayload, + extra_query=extra_query, + extra_headers=extra_headers, + extra_body=extra_body, + timeout=timeout, + ) + + async def flatten_layers( + self, + file: PdfRestFile | Sequence[PdfRestFile], + *, + output: str | None = None, + extra_query: Query | None = None, + extra_headers: AnyMapping | None = None, + extra_body: Body | None = None, + timeout: TimeoutTypes | None = None, + ) -> PdfRestFileBasedResponse: + """Asynchronously flatten all layers in a PDF.""" + + payload: dict[str, Any] = {"files": file} + if output is not None: + payload["output"] = output + + return await self._post_file_operation( + endpoint="/flattened-layers-pdf", + payload=payload, + payload_model=PdfFlattenLayersPayload, + extra_query=extra_query, + extra_headers=extra_headers, + extra_body=extra_body, + timeout=timeout, + ) + + async def rasterize_pdf( + self, + file: PdfRestFile | Sequence[PdfRestFile], + *, + output: str | None = None, + extra_query: Query | None = None, + extra_headers: AnyMapping | None = None, + extra_body: Body | None = None, + timeout: TimeoutTypes | None = None, + ) -> PdfRestFileBasedResponse: + """Asynchronously rasterize a PDF into a 
flattened bitmap-based PDF.""" + + payload: dict[str, Any] = {"files": file} + if output is not None: + payload["output"] = output + + return await self._post_file_operation( + endpoint="/rasterized-pdf", + payload=payload, + payload_model=PdfRasterizePayload, + extra_query=extra_query, + extra_headers=extra_headers, + extra_body=extra_body, + timeout=timeout, + ) + + async def convert_to_pdfa( + self, + file: PdfRestFile | Sequence[PdfRestFile], + *, + output_type: PdfAType, + output: str | None = None, + rasterize_if_errors_encountered: bool = False, + extra_query: Query | None = None, + extra_headers: AnyMapping | None = None, + extra_body: Body | None = None, + timeout: TimeoutTypes | None = None, + ) -> PdfRestFileBasedResponse: + """Asynchronously convert a PDF to a specified PDF/A version.""" + + payload: dict[str, Any] = { + "files": file, + "output_type": output_type, + "rasterize_if_errors_encountered": rasterize_if_errors_encountered, + } + if output is not None: + payload["output"] = output + + return await self._post_file_operation( + endpoint="/pdfa", + payload=payload, + payload_model=PdfToPdfaPayload, + extra_query=extra_query, + extra_headers=extra_headers, + extra_body=extra_body, + timeout=timeout, + ) + async def convert_to_pdfx( self, file: PdfRestFile | Sequence[PdfRestFile], @@ -2883,10 +4138,8 @@ async def convert_to_png( output_prefix: str | None = None, page_range: str | Sequence[str] | None = None, resolution: int = 300, - color_model: Literal["rgb", "rgba", "gray"] = "rgb", - smoothing: Literal["none", "all", "text", "line", "image"] - | Sequence[Literal["none", "all", "text", "line", "image"]] - | None = None, + color_model: PngColorModel = "rgb", + smoothing: GraphicSmoothing | Sequence[GraphicSmoothing] | None = None, extra_query: Query | None = None, extra_headers: AnyMapping | None = None, extra_body: Body | None = None, @@ -2923,10 +4176,8 @@ async def convert_to_bmp( output_prefix: str | None = None, page_range: str | 
Sequence[str] | None = None, resolution: int = 300, - color_model: Literal["rgb", "gray"] = "rgb", - smoothing: Literal["none", "all", "text", "line", "image"] - | Sequence[Literal["none", "all", "text", "line", "image"]] - | None = None, + color_model: BmpColorModel = "rgb", + smoothing: GraphicSmoothing | Sequence[GraphicSmoothing] | None = None, extra_query: Query | None = None, extra_headers: AnyMapping | None = None, extra_body: Body | None = None, @@ -2963,10 +4214,8 @@ async def convert_to_gif( output_prefix: str | None = None, page_range: str | Sequence[str] | None = None, resolution: int = 300, - color_model: Literal["rgb", "gray"] = "rgb", - smoothing: Literal["none", "all", "text", "line", "image"] - | Sequence[Literal["none", "all", "text", "line", "image"]] - | None = None, + color_model: GifColorModel = "rgb", + smoothing: GraphicSmoothing | Sequence[GraphicSmoothing] | None = None, extra_query: Query | None = None, extra_headers: AnyMapping | None = None, extra_body: Body | None = None, @@ -3003,11 +4252,9 @@ async def convert_to_jpeg( output_prefix: str | None = None, page_range: str | Sequence[str] | None = None, resolution: int = 300, - color_model: Literal["rgb", "cmyk", "gray"] = "rgb", - smoothing: Literal["none", "all", "text", "line", "image"] - | Sequence[Literal["none", "all", "text", "line", "image"]] - | None = None, - jpeg_quality: int | None = None, + color_model: JpegColorModel = "rgb", + smoothing: GraphicSmoothing | Sequence[GraphicSmoothing] | None = None, + jpeg_quality: int = 75, extra_query: Query | None = None, extra_headers: AnyMapping | None = None, extra_body: Body | None = None, @@ -3019,6 +4266,7 @@ async def convert_to_jpeg( "files": files, "resolution": resolution, "color_model": color_model, + "jpeg_quality": jpeg_quality, } if output_prefix is not None: payload["output_prefix"] = output_prefix @@ -3026,8 +4274,6 @@ async def convert_to_jpeg( payload["page_range"] = page_range if smoothing is not None: 
payload["smoothing"] = smoothing - if jpeg_quality is not None: - payload["jpeg_quality"] = jpeg_quality return await self._convert_to_graphic( endpoint="/jpg", @@ -3046,10 +4292,8 @@ async def convert_to_tiff( output_prefix: str | None = None, page_range: str | Sequence[str] | None = None, resolution: int = 300, - color_model: Literal["rgb", "rgba", "cmyk", "lab", "gray"] = "rgb", - smoothing: Literal["none", "all", "text", "line", "image"] - | Sequence[Literal["none", "all", "text", "line", "image"]] - | None = None, + color_model: TiffColorModel = "rgb", + smoothing: GraphicSmoothing | Sequence[GraphicSmoothing] | None = None, extra_query: Query | None = None, extra_headers: AnyMapping | None = None, extra_body: Body | None = None, diff --git a/src/pdfrest/models/__init__.py b/src/pdfrest/models/__init__.py index 54c9aeb4..ef10e565 100644 --- a/src/pdfrest/models/__init__.py +++ b/src/pdfrest/models/__init__.py @@ -5,6 +5,9 @@ PdfRestFileBasedResponse, PdfRestFileID, PdfRestInfoResponse, + SummarizePdfTextResponse, + TranslatePdfTextFileResponse, + TranslatePdfTextResponse, UpResponse, ) @@ -15,5 +18,8 @@ "PdfRestFileBasedResponse", "PdfRestFileID", "PdfRestInfoResponse", + "SummarizePdfTextResponse", + "TranslatePdfTextFileResponse", + "TranslatePdfTextResponse", "UpResponse", ] diff --git a/src/pdfrest/models/_internal.py b/src/pdfrest/models/_internal.py index 33cb8747..6fe476ad 100644 --- a/src/pdfrest/models/_internal.py +++ b/src/pdfrest/models/_internal.py @@ -21,7 +21,19 @@ from pdfrest.types.public import PdfRedactionPreset -from ..types import PdfInfoQuery, PdfXType +from ..types import ( + OcrLanguage, + PdfAType, + PdfColorProfile, + PdfInfoQuery, + PdfPageOrientation, + PdfPageSize, + PdfXType, + SummaryFormat, + SummaryOutputFormat, + SummaryOutputType, + TranslateOutputFormat, +) from . 
import PdfRestFile from .public import PdfRestFileID @@ -112,6 +124,12 @@ def _serialize_file_ids(value: list[PdfRestFile]) -> str: return ",".join(str(file.id) for file in value) +def _bool_to_on_off(value: Any) -> Any: + if isinstance(value, bool): + return "on" if value else "off" + return value + + def _serialize_page_ranges(value: list[str | int | tuple[str | int, ...]]) -> str: def join_tuple(value: str | int | tuple[str | int, ...]) -> str: if isinstance(value, tuple): @@ -248,6 +266,264 @@ class PdfInfoPayload(BaseModel): ] +class SummarizePdfTextPayload(BaseModel): + """Adapt caller options into a pdfRest-ready summarize request payload.""" + + files: Annotated[ + list[PdfRestFile], + Field( + min_length=1, + max_length=1, + validation_alias=AliasChoices("file", "files"), + serialization_alias="id", + ), + BeforeValidator(_ensure_list), + AfterValidator( + _allowed_mime_types( + "application/pdf", + "text/markdown", + "text/plain", + error_msg="Must be a PDF, Markdown, or plain text file", + ) + ), + PlainSerializer(_serialize_as_first_file_id), + ] + target_word_count: Annotated[ + int | None, Field(serialization_alias="target_word_count", ge=1, default=400) + ] = 400 + summary_format: Annotated[ + SummaryFormat, Field(serialization_alias="summary_format", default="overview") + ] = "overview" + pages: Annotated[ + list[AscendingPageRange] | None, + Field(serialization_alias="pages", min_length=1, default=None), + BeforeValidator(_ensure_list), + BeforeValidator(_split_comma_list), + BeforeValidator(_int_to_string), + PlainSerializer(_serialize_page_ranges), + ] = None + output_format: Annotated[ + SummaryOutputFormat, + Field(serialization_alias="output_format", default="markdown"), + ] = "markdown" + output_type: Annotated[ + SummaryOutputType, Field(serialization_alias="output_type", default="json") + ] = "json" + output: Annotated[ + str | None, + Field(serialization_alias="output", min_length=1, default=None), + 
AfterValidator(_validate_output_prefix), + ] = None + + +class OcrPdfPayload(BaseModel): + """Adapt caller options into a pdfRest-ready OCR request payload.""" + + files: Annotated[ + list[PdfRestFile], + Field( + min_length=1, + max_length=1, + validation_alias=AliasChoices("file", "files"), + serialization_alias="id", + ), + BeforeValidator(_ensure_list), + AfterValidator( + _allowed_mime_types("application/pdf", error_msg="Must be a PDF file") + ), + PlainSerializer(_serialize_as_first_file_id), + ] + languages: Annotated[ + list[OcrLanguage], + Field( + serialization_alias="languages", + validation_alias=AliasChoices("languages", "language"), + min_length=1, + default_factory=lambda: ["English"], + ), + BeforeValidator(_ensure_list), + BeforeValidator(_split_comma_list), + PlainSerializer(_serialize_as_comma_separated_string), + ] = ["English"] + pages: Annotated[ + list[AscendingPageRange] | None, + Field(serialization_alias="pages", min_length=1, default=None), + BeforeValidator(_ensure_list), + BeforeValidator(_split_comma_list), + BeforeValidator(_int_to_string), + PlainSerializer(_serialize_page_ranges), + ] = None + output: Annotated[ + str | None, + Field(serialization_alias="output", min_length=1, default=None), + AfterValidator(_validate_output_prefix), + ] = None + + +class ExtractTextPayload(BaseModel): + """Adapt caller options into a pdfRest-ready extract text request payload.""" + + files: Annotated[ + list[PdfRestFile], + Field( + min_length=1, + max_length=1, + validation_alias=AliasChoices("file", "files"), + serialization_alias="id", + ), + BeforeValidator(_ensure_list), + AfterValidator( + _allowed_mime_types("application/pdf", error_msg="Must be a PDF file") + ), + PlainSerializer(_serialize_as_first_file_id), + ] + pages: Annotated[ + list[AscendingPageRange] | None, + Field(serialization_alias="pages", min_length=1, default=None), + BeforeValidator(_ensure_list), + BeforeValidator(_split_comma_list), + BeforeValidator(_int_to_string), + 
PlainSerializer(_serialize_page_ranges), + ] = None + full_text: Literal["off", "by_page", "document"] = "document" + preserve_line_breaks: Annotated[ + Literal["off", "on"], BeforeValidator(_bool_to_on_off) + ] = "off" + word_style: Annotated[Literal["off", "on"], BeforeValidator(_bool_to_on_off)] = ( + "off" + ) + word_coordinates: Annotated[ + Literal["off", "on"], BeforeValidator(_bool_to_on_off) + ] = "off" + output_type: Literal["json", "file"] = "json" + output: Annotated[ + str | None, + Field(serialization_alias="output", min_length=1, default=None), + AfterValidator(_validate_output_prefix), + ] = None + + +class ConvertToMarkdownPayload(BaseModel): + """Adapt caller options into a pdfRest-ready markdown conversion payload.""" + + files: Annotated[ + list[PdfRestFile], + Field( + min_length=1, + max_length=1, + validation_alias=AliasChoices("file", "files"), + serialization_alias="id", + ), + BeforeValidator(_ensure_list), + AfterValidator( + _allowed_mime_types("application/pdf", error_msg="Must be a PDF file") + ), + PlainSerializer(_serialize_as_first_file_id), + ] + pages: Annotated[ + list[AscendingPageRange] | None, + Field(serialization_alias="pages", min_length=1, default=None), + BeforeValidator(_ensure_list), + BeforeValidator(_split_comma_list), + BeforeValidator(_int_to_string), + PlainSerializer(_serialize_page_ranges), + ] = None + output_type: Annotated[ + SummaryOutputType, Field(serialization_alias="output_type", default="json") + ] = "json" + page_break_comments: Annotated[ + Literal["on", "off"] | None, + Field(serialization_alias="page_break_comments", default=None), + BeforeValidator(_bool_to_on_off), + ] = None + output: Annotated[ + str | None, + Field(serialization_alias="output", min_length=1, default=None), + AfterValidator(_validate_output_prefix), + ] = None + + +class TranslatePdfTextPayload(BaseModel): + """Adapt caller options into a pdfRest-ready translate request payload.""" + + files: Annotated[ + list[PdfRestFile], + 
Field( + min_length=1, + max_length=1, + validation_alias=AliasChoices("file", "files"), + serialization_alias="id", + ), + BeforeValidator(_ensure_list), + AfterValidator( + _allowed_mime_types( + "application/pdf", + "text/markdown", + "text/plain", + error_msg="Must be a PDF, Markdown, or plain text file", + ) + ), + PlainSerializer(_serialize_as_first_file_id), + ] + output_language: Annotated[ + str, + Field(serialization_alias="output_language", min_length=1), + ] + pages: Annotated[ + list[AscendingPageRange] | None, + Field(serialization_alias="pages", min_length=1, default=None), + BeforeValidator(_ensure_list), + BeforeValidator(_split_comma_list), + BeforeValidator(_int_to_string), + PlainSerializer(_serialize_page_ranges), + ] = None + output_format: Annotated[ + TranslateOutputFormat, + Field(serialization_alias="output_format", default="markdown"), + ] = "markdown" + output_type: Annotated[ + Literal["json", "file"], + Field(serialization_alias="output_type", default="json"), + ] = "json" + output: Annotated[ + str | None, + Field(serialization_alias="output", min_length=1, default=None), + AfterValidator(_validate_output_prefix), + ] = None + + +class ExtractImagesPayload(BaseModel): + """Adapt caller options into a pdfRest-ready extract images request payload.""" + + files: Annotated[ + list[PdfRestFile], + Field( + min_length=1, + max_length=1, + validation_alias=AliasChoices("file", "files"), + serialization_alias="id", + ), + BeforeValidator(_ensure_list), + AfterValidator( + _allowed_mime_types("application/pdf", error_msg="Must be a PDF file") + ), + PlainSerializer(_serialize_as_first_file_id), + ] + pages: Annotated[ + list[AscendingPageRange] | None, + Field(serialization_alias="pages", min_length=1, default=None), + BeforeValidator(_ensure_list), + BeforeValidator(_split_comma_list), + BeforeValidator(_int_to_string), + PlainSerializer(_serialize_page_ranges), + ] = None + output: Annotated[ + str | None, + 
Field(serialization_alias="output", min_length=1, default=None), + AfterValidator(_validate_output_prefix), + ] = None + + RgbChannel = Annotated[int, Field(ge=0, le=255)] @@ -519,6 +795,87 @@ class PdfToWordPayload(BaseModel): ] = None +class PdfToExcelPayload(BaseModel): + """Adapt caller options into a pdfRest-ready Excel request payload.""" + + files: Annotated[ + list[PdfRestFile], + Field( + min_length=1, + max_length=1, + validation_alias=AliasChoices("file", "files"), + serialization_alias="id", + ), + BeforeValidator(_ensure_list), + AfterValidator( + _allowed_mime_types("application/pdf", error_msg="Must be a PDF file") + ), + PlainSerializer(_serialize_as_first_file_id), + ] + output: Annotated[ + str | None, + Field(serialization_alias="output", min_length=1, default=None), + AfterValidator(_validate_output_prefix), + ] = None + + +class PdfToPowerpointPayload(BaseModel): + """Adapt caller options into a pdfRest-ready PowerPoint request payload.""" + + files: Annotated[ + list[PdfRestFile], + Field( + min_length=1, + max_length=1, + validation_alias=AliasChoices("file", "files"), + serialization_alias="id", + ), + BeforeValidator(_ensure_list), + AfterValidator( + _allowed_mime_types("application/pdf", error_msg="Must be a PDF file") + ), + PlainSerializer(_serialize_as_first_file_id), + ] + output: Annotated[ + str | None, + Field(serialization_alias="output", min_length=1, default=None), + AfterValidator(_validate_output_prefix), + ] = None + + +class PdfToPdfaPayload(BaseModel): + """Adapt caller options into a pdfRest-ready PDF/A request payload.""" + + files: Annotated[ + list[PdfRestFile], + Field( + min_length=1, + max_length=1, + validation_alias=AliasChoices("file", "files"), + serialization_alias="id", + ), + BeforeValidator(_ensure_list), + AfterValidator( + _allowed_mime_types("application/pdf", error_msg="Must be a PDF file") + ), + PlainSerializer(_serialize_as_first_file_id), + ] + output_type: Annotated[PdfAType, 
Field(serialization_alias="output_type")] + output: Annotated[ + str | None, + Field(serialization_alias="output", min_length=1, default=None), + AfterValidator(_validate_output_prefix), + ] = None + rasterize_if_errors_encountered: Annotated[ + Literal["on", "off"] | None, + Field( + serialization_alias="rasterize_if_errors_encountered", + default=None, + ), + BeforeValidator(_bool_to_on_off), + ] = None + + class PdfToPdfxPayload(BaseModel): """Adapt caller options into a pdfRest-ready PDF/X request payload.""" @@ -626,6 +983,264 @@ def _validate_profile_dependency(self) -> PdfCompressPayload: return self +class PdfXfaToAcroformsPayload(BaseModel): + """Adapt caller options into a pdfRest-ready XFA-to-AcroForms request payload.""" + + files: Annotated[ + list[PdfRestFile], + Field( + min_length=1, + max_length=1, + validation_alias=AliasChoices("file", "files"), + serialization_alias="id", + ), + BeforeValidator(_ensure_list), + AfterValidator( + _allowed_mime_types("application/pdf", error_msg="Must be a PDF file") + ), + PlainSerializer(_serialize_as_first_file_id), + ] + output: Annotated[ + str | None, + Field(serialization_alias="output", min_length=1, default=None), + AfterValidator(_validate_output_prefix), + ] = None + + +class PdfLinearizePayload(BaseModel): + """Adapt caller options into a pdfRest-ready linearize PDF request payload.""" + + files: Annotated[ + list[PdfRestFile], + Field( + min_length=1, + max_length=1, + validation_alias=AliasChoices("file", "files"), + serialization_alias="id", + ), + BeforeValidator(_ensure_list), + AfterValidator( + _allowed_mime_types("application/pdf", error_msg="Must be a PDF file") + ), + PlainSerializer(_serialize_as_first_file_id), + ] + output: Annotated[ + str | None, + Field(serialization_alias="output", min_length=1, default=None), + AfterValidator(_validate_output_prefix), + ] = None + + +class PdfRasterizePayload(BaseModel): + """Adapt caller options into a pdfRest-ready rasterize PDF request payload.""" 
+ + files: Annotated[ + list[PdfRestFile], + Field( + min_length=1, + max_length=1, + validation_alias=AliasChoices("file", "files"), + serialization_alias="id", + ), + BeforeValidator(_ensure_list), + AfterValidator( + _allowed_mime_types("application/pdf", error_msg="Must be a PDF file") + ), + PlainSerializer(_serialize_as_first_file_id), + ] + output: Annotated[ + str | None, + Field(serialization_alias="output", min_length=1, default=None), + AfterValidator(_validate_output_prefix), + ] = None + + +class PdfFlattenTransparenciesPayload(BaseModel): + """Adapt caller options into a pdfRest-ready flatten-transparencies request payload.""" + + files: Annotated[ + list[PdfRestFile], + Field( + min_length=1, + max_length=1, + validation_alias=AliasChoices("file", "files"), + serialization_alias="id", + ), + BeforeValidator(_ensure_list), + AfterValidator( + _allowed_mime_types("application/pdf", error_msg="Must be a PDF file") + ), + PlainSerializer(_serialize_as_first_file_id), + ] + output: Annotated[ + str | None, + Field(serialization_alias="output", min_length=1, default=None), + AfterValidator(_validate_output_prefix), + ] = None + quality: Literal["low", "medium", "high"] = "medium" + + +class PdfFlattenAnnotationsPayload(BaseModel): + """Adapt caller options into a pdfRest-ready flatten-annotations request payload.""" + + files: Annotated[ + list[PdfRestFile], + Field( + min_length=1, + max_length=1, + validation_alias=AliasChoices("file", "files"), + serialization_alias="id", + ), + BeforeValidator(_ensure_list), + AfterValidator( + _allowed_mime_types("application/pdf", error_msg="Must be a PDF file") + ), + PlainSerializer(_serialize_as_first_file_id), + ] + output: Annotated[ + str | None, + Field(serialization_alias="output", min_length=1, default=None), + AfterValidator(_validate_output_prefix), + ] = None + + +class PdfFlattenLayersPayload(BaseModel): + """Adapt caller options into a pdfRest-ready flatten-layers request payload.""" + + files: 
Annotated[ + list[PdfRestFile], + Field( + min_length=1, + max_length=1, + validation_alias=AliasChoices("file", "files"), + serialization_alias="id", + ), + BeforeValidator(_ensure_list), + AfterValidator( + _allowed_mime_types("application/pdf", error_msg="Must be a PDF file") + ), + PlainSerializer(_serialize_as_first_file_id), + ] + output: Annotated[ + str | None, + Field(serialization_alias="output", min_length=1, default=None), + AfterValidator(_validate_output_prefix), + ] = None + + +class PdfBlankPayload(BaseModel): + """Adapt caller options into a pdfRest-ready blank PDF request payload.""" + + page_size: Annotated[ + PdfPageSize, + Field(serialization_alias="page_size"), + ] + page_count: Annotated[ + int, + Field(serialization_alias="page_count", ge=1, le=1000), + ] + page_orientation: Annotated[ + PdfPageOrientation | None, + Field(serialization_alias="page_orientation", default=None), + ] = None + custom_height: Annotated[ + float | None, + Field(serialization_alias="custom_height", gt=0, default=None), + ] = None + custom_width: Annotated[ + float | None, + Field(serialization_alias="custom_width", gt=0, default=None), + ] = None + output: Annotated[ + str | None, + Field(serialization_alias="output", min_length=1, default=None), + AfterValidator(_validate_output_prefix), + ] = None + + @model_validator(mode="after") + def _validate_page_configuration(self) -> PdfBlankPayload: + is_custom = self.page_size == "custom" + has_custom_height = self.custom_height is not None + has_custom_width = self.custom_width is not None + if is_custom: + if not (has_custom_height and has_custom_width): + msg = "custom_height and custom_width are required when page_size is 'custom'." + raise ValueError(msg) + if self.page_orientation is not None: + msg = "page_orientation must be omitted when page_size is 'custom'." + raise ValueError(msg) + else: + if self.page_orientation is None: + msg = "page_orientation is required when page_size is not 'custom'." 
+ raise ValueError(msg) + if has_custom_height or has_custom_width: + msg = "custom_height and custom_width can only be provided when page_size is 'custom'." + raise ValueError(msg) + return self + + +class PdfConvertColorsPayload(BaseModel): + """Adapt caller options into a pdfRest-ready convert-colors request payload.""" + + files: Annotated[ + list[PdfRestFile], + Field( + min_length=1, + max_length=1, + validation_alias=AliasChoices("file", "files"), + serialization_alias="id", + ), + BeforeValidator(_ensure_list), + AfterValidator( + _allowed_mime_types("application/pdf", error_msg="Must be a PDF file") + ), + PlainSerializer(_serialize_as_first_file_id), + ] + color_profile: Annotated[ + PdfColorProfile, + Field(serialization_alias="color_profile"), + ] + profile: Annotated[ + list[PdfRestFile] | None, + Field( + default=None, + min_length=1, + max_length=1, + validation_alias=AliasChoices("profile", "profiles"), + serialization_alias="profile_id", + ), + BeforeValidator(_ensure_list), + AfterValidator( + _allowed_mime_types( + "application/vnd.iccprofile", + "application/octet-stream", + error_msg="Profile must be an ICC file", + ) + ), + PlainSerializer(_serialize_as_first_file_id), + ] = None + preserve_black: Annotated[ + bool | None, + Field(serialization_alias="preserve_black", default=None), + ] = None + output: Annotated[ + str | None, + Field(serialization_alias="output", min_length=1, default=None), + AfterValidator(_validate_output_prefix), + ] = None + + @model_validator(mode="after") + def _validate_profile_dependency(self) -> PdfConvertColorsPayload: + if self.color_profile == "custom": + if not self.profile: + msg = "color_profile 'custom' requires a profile to be provided." + raise ValueError(msg) + elif self.profile: + msg = "A profile can only be provided when color_profile is 'custom'." 
+ raise ValueError(msg) + return self + + class BmpPdfRestPayload(BasePdfRestGraphicPayload[Literal["rgb", "gray"]]): """Adapt caller options into a pdfRest-ready BMP request payload.""" @@ -677,7 +1292,11 @@ class PdfRestRawFileResponse(BaseModel): input_id: Annotated[ list[PdfRestFileID], - Field(alias="inputId", description="The id of the input file"), + Field( + alias="inputId", + description="The id of the input file", + default_factory=list, + ), BeforeValidator(_ensure_list), ] output_urls: Annotated[ diff --git a/src/pdfrest/models/public.py b/src/pdfrest/models/public.py index 3de11476..e4dc8a3a 100644 --- a/src/pdfrest/models/public.py +++ b/src/pdfrest/models/public.py @@ -26,6 +26,9 @@ "PdfRestFileBasedResponse", "PdfRestFileID", "PdfRestInfoResponse", + "SummarizePdfTextResponse", + "TranslatePdfTextFileResponse", + "TranslatePdfTextResponse", "UpResponse", ) @@ -312,6 +315,93 @@ class PdfRestDeletionResponse(BaseModel): ] +class SummarizePdfTextResponse(BaseModel): + """Response returned by the summarize-pdf-text tool.""" + + model_config = ConfigDict(extra="allow") + + summary: Annotated[ + str | None, + Field( + description="Summary content", + default=None, + ), + ] = None + input_id: Annotated[ + PdfRestFileID, + Field( + validation_alias=AliasChoices("input_id", "inputId"), + description="The id of the input file.", + ), + ] + + +class TranslatePdfTextResponse(BaseModel): + """Response returned by the translated-pdf-text tool.""" + + model_config = ConfigDict(extra="allow") + + source_languages: Annotated[ + list[str] | None, + Field( + alias="source_languages", + validation_alias=AliasChoices("source_languages", "sourceLanguages"), + description="Languages detected in the source content.", + default=None, + ), + ] = None + output_language: Annotated[ + str | None, + Field( + alias="output_language", + validation_alias=AliasChoices("output_language", "outputLanguage"), + description="Target language used for the translation.", + default=None, + 
), + ] = None + translated_text: Annotated[ + str | None, + Field( + alias="translated_text", + validation_alias=AliasChoices("translated_text", "translatedText"), + description="Inline translation content when output_type is json.", + default=None, + ), + ] = None + input_id: Annotated[ + PdfRestFileID, + Field( + validation_alias=AliasChoices("input_id", "inputId"), + description="The id of the input file.", + ), + ] + + +class TranslatePdfTextFileResponse(PdfRestFileBasedResponse): + """File-based response returned by the translated-pdf-text tool.""" + + model_config = ConfigDict(extra="allow") + + source_languages: Annotated[ + list[str] | None, + Field( + alias="source_languages", + validation_alias=AliasChoices("source_languages", "sourceLanguages"), + description="Languages detected in the source content.", + default=None, + ), + ] = None + output_language: Annotated[ + str | None, + Field( + alias="output_language", + validation_alias=AliasChoices("output_language", "outputLanguage"), + description="Target language used for the translation.", + default=None, + ), + ] = None + + class PdfRestInfoResponse(BaseModel): """A response containing the output from the /info route.""" diff --git a/src/pdfrest/types/__init__.py b/src/pdfrest/types/__init__.py index 9bc36a87..070cac75 100644 --- a/src/pdfrest/types/__init__.py +++ b/src/pdfrest/types/__init__.py @@ -1,27 +1,65 @@ """Public import surface for shared pdfrest types.""" from .public import ( + ALL_OCR_LANGUAGES, ALL_PDF_INFO_QUERIES, + BmpColorModel, + CompressionLevel, + ExtractTextGranularity, + FlattenQuality, + GifColorModel, + GraphicSmoothing, + JpegColorModel, + OcrLanguage, + PdfAType, + PdfColorProfile, PdfInfoQuery, PdfMergeInput, PdfMergeSource, + PdfPageOrientation, PdfPageSelection, + PdfPageSize, PdfRedactionInstruction, PdfRedactionPreset, PdfRedactionType, PdfRGBColor, PdfXType, + PngColorModel, + SummaryFormat, + SummaryOutputFormat, + SummaryOutputType, + TiffColorModel, + 
TranslateOutputFormat, ) __all__ = [ + "ALL_OCR_LANGUAGES", "ALL_PDF_INFO_QUERIES", + "BmpColorModel", + "CompressionLevel", + "ExtractTextGranularity", + "FlattenQuality", + "GifColorModel", + "GraphicSmoothing", + "JpegColorModel", + "OcrLanguage", + "PdfAType", + "PdfColorProfile", "PdfInfoQuery", "PdfMergeInput", "PdfMergeSource", + "PdfPageOrientation", "PdfPageSelection", + "PdfPageSize", "PdfRGBColor", "PdfRedactionInstruction", "PdfRedactionPreset", "PdfRedactionType", "PdfXType", + "PngColorModel", + "SummaryFormat", + "SummaryOutputFormat", + "SummaryOutputType", + "TiffColorModel", + "TranslateOutputFormat", ] diff --git a/src/pdfrest/types/public.py b/src/pdfrest/types/public.py index 1df53284..c9ddc8a1 100644 --- a/src/pdfrest/types/public.py +++ b/src/pdfrest/types/public.py @@ -13,16 +13,35 @@ PdfRestFile = Any __all__ = ( + "ALL_OCR_LANGUAGES", "ALL_PDF_INFO_QUERIES", + "BmpColorModel", + "CompressionLevel", + "ExtractTextGranularity", + "FlattenQuality", + "GifColorModel", + "GraphicSmoothing", + "JpegColorModel", + "OcrLanguage", + "PdfAType", + "PdfColorProfile", "PdfInfoQuery", "PdfMergeInput", "PdfMergeSource", + "PdfPageOrientation", "PdfPageSelection", + "PdfPageSize", "PdfRGBColor", "PdfRedactionInstruction", "PdfRedactionPreset", "PdfRedactionType", "PdfXType", + "PngColorModel", + "SummaryFormat", + "SummaryOutputFormat", + "SummaryOutputType", + "TiffColorModel", + "TranslateOutputFormat", ) PdfInfoQuery = Literal[ @@ -98,4 +117,69 @@ class PdfMergeSource(TypedDict, total=False): PdfMergeInput = PdfRestFile | PdfMergeSource | tuple[PdfRestFile, PdfPageSelection] +PdfAType = Literal["PDF/A-1b", "PDF/A-2b", "PDF/A-2u", "PDF/A-3b", "PDF/A-3u"] PdfXType = Literal["PDF/X-1a", "PDF/X-3", "PDF/X-4", "PDF/X-6"] +ExtractTextGranularity = Literal["off", "by_page", "document"] +CompressionLevel = Literal["low", "medium", "high", "custom"] +FlattenQuality = Literal["low", "medium", "high"] +PngColorModel = Literal["rgb", "rgba", "gray"] 
+BmpColorModel = Literal["rgb", "gray"] +GifColorModel = Literal["rgb", "gray"] +JpegColorModel = Literal["rgb", "cmyk", "gray"] +TiffColorModel = Literal["rgb", "rgba", "cmyk", "lab", "gray"] +GraphicSmoothing = Literal["none", "all", "text", "line", "image"] + +SummaryFormat = Literal[ + "overview", + "highlight", + "abstract", + "bullet_points", + "numbered_list", + "table_of_contents", + "outline", + "question_answer", + "action_items", +] + +SummaryOutputFormat = Literal["plaintext", "markdown"] +SummaryOutputType = Literal["json", "file"] + +TranslateOutputFormat = Literal["plaintext", "markdown"] + +OcrLanguage = Literal[ + "ChineseSimplified", + "ChineseTraditional", + "Dutch", + "English", + "French", + "German", + "Italian", + "Japanese", + "Korean", + "Portuguese", + "Spanish", +] + +ALL_OCR_LANGUAGES: tuple[OcrLanguage, ...] = cast( + tuple[OcrLanguage, ...], get_args(OcrLanguage) +) +PdfColorProfile = Literal[ + "lab-d50", + "srgb", + "apple-rgb", + "color-match-rgb", + "gamma-18", + "gamma-22", + "dot-gain-10", + "dot-gain-15", + "dot-gain-20", + "dot-gain-25", + "dot-gain-30", + "monitor-rgb", + "acrobat5-cmyk", + "acrobat9-cmyk", + "custom", +] + +PdfPageSize = Literal["letter", "legal", "ledger", "A3", "A4", "A5", "custom"] +PdfPageOrientation = Literal["portrait", "landscape"] diff --git a/tests/live/test_live_blank_pdf.py b/tests/live/test_live_blank_pdf.py new file mode 100644 index 00000000..07f281e1 --- /dev/null +++ b/tests/live/test_live_blank_pdf.py @@ -0,0 +1,103 @@ +from __future__ import annotations + +import pytest + +from pdfrest import AsyncPdfRestClient, PdfRestApiError, PdfRestClient + + +@pytest.mark.parametrize( + "output_name", + [ + pytest.param(None, id="default-output"), + pytest.param("blank-doc", id="custom-output"), + ], +) +def test_live_blank_pdf_success( + pdfrest_api_key: str, + pdfrest_live_base_url: str, + output_name: str | None, +) -> None: + kwargs: dict[str, str | int] = { + "page_size": "letter", + "page_count": 
1, + "page_orientation": "portrait", + } + if output_name is not None: + kwargs["output"] = output_name + + with PdfRestClient( + api_key=pdfrest_api_key, + base_url=pdfrest_live_base_url, + ) as client: + response = client.blank_pdf(**kwargs) + + assert response.output_files + output_file = response.output_file + assert output_file.type == "application/pdf" + assert output_file.size > 0 + assert response.warning is None + if output_name is not None: + assert output_file.name.startswith(output_name) + else: + assert output_file.name.endswith(".pdf") + + +@pytest.mark.asyncio +async def test_live_async_blank_pdf_success( + pdfrest_api_key: str, + pdfrest_live_base_url: str, +) -> None: + async with AsyncPdfRestClient( + api_key=pdfrest_api_key, + base_url=pdfrest_live_base_url, + ) as client: + response = await client.blank_pdf( + page_size="A4", + page_count=2, + page_orientation="landscape", + output="async-blank", + ) + + assert response.output_files + output_file = response.output_file + assert output_file.name.startswith("async-blank") + assert output_file.type == "application/pdf" + assert output_file.size > 0 + assert response.warning is None + + +def test_live_blank_pdf_invalid_request( + pdfrest_api_key: str, + pdfrest_live_base_url: str, +) -> None: + with ( + PdfRestClient( + api_key=pdfrest_api_key, + base_url=pdfrest_live_base_url, + ) as client, + pytest.raises(PdfRestApiError, match=r"(?i)(page|size)"), + ): + client.blank_pdf( + page_size="letter", + page_count=1, + page_orientation="portrait", + extra_body={"page_size": "not-a-size"}, + ) + + +@pytest.mark.asyncio +async def test_live_async_blank_pdf_invalid_request( + pdfrest_api_key: str, + pdfrest_live_base_url: str, +) -> None: + async with AsyncPdfRestClient( + api_key=pdfrest_api_key, + base_url=pdfrest_live_base_url, + ) as client: + with pytest.raises(PdfRestApiError, match=r"(?i)(page|size)"): + await client.blank_pdf( + page_size="letter", + page_count=1, + page_orientation="portrait", + 
extra_body={"page_size": "bad-size"}, + ) diff --git a/tests/live/test_live_compress_pdf.py b/tests/live/test_live_compress_pdf.py index 6ee8b365..0b3cdf66 100644 --- a/tests/live/test_live_compress_pdf.py +++ b/tests/live/test_live_compress_pdf.py @@ -158,7 +158,7 @@ def test_live_compress_pdf_invalid_level( api_key=pdfrest_api_key, base_url=pdfrest_live_base_url, ) as client, - pytest.raises(PdfRestApiError), + pytest.raises(PdfRestApiError, match=r"(?i)compression"), ): client.compress_pdf( uploaded_pdf_for_compression, @@ -177,7 +177,7 @@ async def test_live_async_compress_pdf_invalid_level( api_key=pdfrest_api_key, base_url=pdfrest_live_base_url, ) as client: - with pytest.raises(PdfRestApiError): + with pytest.raises(PdfRestApiError, match=r"(?i)compression"): await client.compress_pdf( uploaded_pdf_for_compression, compression_level="low", diff --git a/tests/live/test_live_convert_colors.py b/tests/live/test_live_convert_colors.py new file mode 100644 index 00000000..20a1f480 --- /dev/null +++ b/tests/live/test_live_convert_colors.py @@ -0,0 +1,118 @@ +from __future__ import annotations + +import pytest + +from pdfrest import AsyncPdfRestClient, PdfRestApiError, PdfRestClient +from pdfrest.models import PdfRestFile + +from ..resources import get_test_resource_path + + +@pytest.fixture(scope="module") +def uploaded_pdf_for_color_conversion( + pdfrest_api_key: str, + pdfrest_live_base_url: str, +) -> PdfRestFile: + resource = get_test_resource_path("report.pdf") + with PdfRestClient( + api_key=pdfrest_api_key, + base_url=pdfrest_live_base_url, + ) as client: + return client.files.create_from_paths([resource])[0] + + +@pytest.mark.parametrize( + "output_name", + [ + pytest.param(None, id="default-output"), + pytest.param("converted-colors", id="custom-output"), + ], +) +def test_live_convert_colors_success( + pdfrest_api_key: str, + pdfrest_live_base_url: str, + uploaded_pdf_for_color_conversion: PdfRestFile, + output_name: str | None, +) -> None: + kwargs: 
dict[str, str | bool] = {"color_profile": "srgb"} + if output_name is not None: + kwargs["output"] = output_name + + with PdfRestClient( + api_key=pdfrest_api_key, + base_url=pdfrest_live_base_url, + ) as client: + response = client.convert_colors(uploaded_pdf_for_color_conversion, **kwargs) + + assert response.output_files + output_file = response.output_file + assert output_file.type == "application/pdf" + assert output_file.size > 0 + assert response.warning is None + assert str(response.input_id) == str(uploaded_pdf_for_color_conversion.id) + if output_name is not None: + assert output_file.name.startswith(output_name) + else: + assert output_file.name.endswith(".pdf") + + +@pytest.mark.asyncio +async def test_live_async_convert_colors_success( + pdfrest_api_key: str, + pdfrest_live_base_url: str, + uploaded_pdf_for_color_conversion: PdfRestFile, +) -> None: + async with AsyncPdfRestClient( + api_key=pdfrest_api_key, + base_url=pdfrest_live_base_url, + ) as client: + response = await client.convert_colors( + uploaded_pdf_for_color_conversion, + color_profile="srgb", + output="async", + ) + + assert response.output_files + output_file = response.output_file + assert output_file.name.startswith("async") + assert output_file.type == "application/pdf" + assert output_file.size > 0 + assert response.warning is None + assert str(response.input_id) == str(uploaded_pdf_for_color_conversion.id) + + +def test_live_convert_colors_invalid_file_id( + pdfrest_api_key: str, + pdfrest_live_base_url: str, + uploaded_pdf_for_color_conversion: PdfRestFile, +) -> None: + with ( + PdfRestClient( + api_key=pdfrest_api_key, + base_url=pdfrest_live_base_url, + ) as client, + pytest.raises(PdfRestApiError, match=r"(?i)(id|file)"), + ): + client.convert_colors( + uploaded_pdf_for_color_conversion, + color_profile="srgb", + extra_body={"id": "00000000-0000-0000-0000-000000000000"}, + ) + + +@pytest.mark.asyncio +async def test_live_async_convert_colors_invalid_file_id( + pdfrest_api_key: 
str, + pdfrest_live_base_url: str, + uploaded_pdf_for_color_conversion: PdfRestFile, +) -> None: + async with AsyncPdfRestClient( + api_key=pdfrest_api_key, + base_url=pdfrest_live_base_url, + ) as client: + with pytest.raises(PdfRestApiError, match=r"(?i)(id|file)"): + await client.convert_colors( + uploaded_pdf_for_color_conversion, + color_profile="srgb", + extra_body={"id": "ffffffff-ffff-ffff-ffff-ffffffffffff"}, + ) diff --git a/tests/live/test_live_convert_to_excel.py b/tests/live/test_live_convert_to_excel.py new file mode 100644 index 00000000..f592aa40 --- /dev/null +++ b/tests/live/test_live_convert_to_excel.py @@ -0,0 +1,118 @@ +from __future__ import annotations + +import pytest + +from pdfrest import AsyncPdfRestClient, PdfRestApiError, PdfRestClient +from pdfrest.models import PdfRestFile + +from ..resources import get_test_resource_path + + +@pytest.fixture(scope="module") +def uploaded_pdf_for_excel( + pdfrest_api_key: str, + pdfrest_live_base_url: str, +) -> PdfRestFile: + resource = get_test_resource_path("report.pdf") + with PdfRestClient( + api_key=pdfrest_api_key, + base_url=pdfrest_live_base_url, + ) as client: + return client.files.create_from_paths([resource])[0] + + +@pytest.mark.parametrize( + "output_name", + [ + pytest.param(None, id="default-output"), + pytest.param("live-excel", id="custom-output"), + ], +) +def test_live_convert_to_excel_success( + pdfrest_api_key: str, + pdfrest_live_base_url: str, + uploaded_pdf_for_excel: PdfRestFile, + output_name: str | None, +) -> None: + kwargs: dict[str, str] = {} + if output_name is not None: + kwargs["output"] = output_name + + with PdfRestClient( + api_key=pdfrest_api_key, + base_url=pdfrest_live_base_url, + ) as client: + response = client.convert_to_excel(uploaded_pdf_for_excel, **kwargs) + + assert response.output_files + output_file = response.output_file + assert ( + output_file.type + == "application/vnd.openxmlformats-officedocument.spreadsheetml.sheet" + ) + assert output_file.size 
> 0 + assert response.warning is None + assert str(response.input_id) == str(uploaded_pdf_for_excel.id) + if output_name is not None: + assert output_file.name.startswith(output_name) + else: + assert output_file.name.endswith(".xlsx") + + +@pytest.mark.asyncio +async def test_live_async_convert_to_excel_success( + pdfrest_api_key: str, + pdfrest_live_base_url: str, + uploaded_pdf_for_excel: PdfRestFile, +) -> None: + async with AsyncPdfRestClient( + api_key=pdfrest_api_key, + base_url=pdfrest_live_base_url, + ) as client: + response = await client.convert_to_excel(uploaded_pdf_for_excel, output="async") + + assert response.output_files + output_file = response.output_file + assert output_file.name.startswith("async") + assert ( + output_file.type + == "application/vnd.openxmlformats-officedocument.spreadsheetml.sheet" + ) + assert output_file.size > 0 + assert response.warning is None + assert str(response.input_id) == str(uploaded_pdf_for_excel.id) + + +def test_live_convert_to_excel_invalid_file_id( + pdfrest_api_key: str, + pdfrest_live_base_url: str, + uploaded_pdf_for_excel: PdfRestFile, +) -> None: + with ( + PdfRestClient( + api_key=pdfrest_api_key, + base_url=pdfrest_live_base_url, + ) as client, + pytest.raises(PdfRestApiError, match=r"(?i)(id|file)"), + ): + client.convert_to_excel( + uploaded_pdf_for_excel, + extra_body={"id": "00000000-0000-0000-0000-000000000000"}, + ) + + +@pytest.mark.asyncio +async def test_live_async_convert_to_excel_invalid_file_id( + pdfrest_api_key: str, + pdfrest_live_base_url: str, + uploaded_pdf_for_excel: PdfRestFile, +) -> None: + async with AsyncPdfRestClient( + api_key=pdfrest_api_key, + base_url=pdfrest_live_base_url, + ) as client: + with pytest.raises(PdfRestApiError, match=r"(?i)(id|file)"): + await client.convert_to_excel( + uploaded_pdf_for_excel, + extra_body={"id": "ffffffff-ffff-ffff-ffff-ffffffffffff"}, + ) diff --git a/tests/live/test_live_convert_to_markdown.py b/tests/live/test_live_convert_to_markdown.py 
new file mode 100644 index 00000000..760e1798 --- /dev/null +++ b/tests/live/test_live_convert_to_markdown.py @@ -0,0 +1,88 @@ +from __future__ import annotations + +import pytest + +from pdfrest import AsyncPdfRestClient, PdfRestApiError, PdfRestClient +from pdfrest.models import PdfRestFileBasedResponse + +from ..resources import get_test_resource_path + + +def test_live_convert_to_markdown_success( + pdfrest_api_key: str, + pdfrest_live_base_url: str, +) -> None: + resource = get_test_resource_path("report.pdf") + with PdfRestClient( + api_key=pdfrest_api_key, + base_url=pdfrest_live_base_url, + ) as client: + uploaded = client.files.create_from_paths([resource])[0] + response = client.convert_to_markdown(uploaded) + + assert isinstance(response, PdfRestFileBasedResponse) + assert response.output_files + output_file = response.output_file + assert output_file.name.endswith(".md") + assert output_file.type == "text/markdown" + assert output_file.size > 0 + assert response.warning is None + assert response.input_id == uploaded.id + + +@pytest.mark.asyncio +async def test_live_async_convert_to_markdown_success( + pdfrest_api_key: str, + pdfrest_live_base_url: str, +) -> None: + resource = get_test_resource_path("report.pdf") + async with AsyncPdfRestClient( + api_key=pdfrest_api_key, + base_url=pdfrest_live_base_url, + ) as client: + uploaded = (await client.files.create_from_paths([resource]))[0] + response = await client.convert_to_markdown(uploaded, output="async-md") + + assert isinstance(response, PdfRestFileBasedResponse) + assert response.output_files + output_file = response.output_file + assert output_file.name.startswith("async-md") + assert output_file.type == "text/markdown" + assert output_file.size > 0 + assert response.warning is None + assert response.input_id == uploaded.id + + +def test_live_convert_to_markdown_invalid_pages( + pdfrest_api_key: str, + pdfrest_live_base_url: str, +) -> None: + resource = get_test_resource_path("report.pdf") + with 
PdfRestClient( + api_key=pdfrest_api_key, + base_url=pdfrest_live_base_url, + ) as client: + uploaded = client.files.create_from_paths([resource])[0] + with pytest.raises(PdfRestApiError, match=r"(?i)page"): + client.convert_to_markdown( + uploaded, + extra_body={"pages": "last-1"}, + ) + + +@pytest.mark.asyncio +async def test_live_async_convert_to_markdown_invalid_pages( + pdfrest_api_key: str, + pdfrest_live_base_url: str, +) -> None: + resource = get_test_resource_path("report.pdf") + async with AsyncPdfRestClient( + api_key=pdfrest_api_key, + base_url=pdfrest_live_base_url, + ) as client: + uploaded = (await client.files.create_from_paths([resource]))[0] + with pytest.raises(PdfRestApiError, match=r"(?i)page"): + await client.convert_to_markdown( + uploaded, + extra_body={"pages": "last-1"}, + ) diff --git a/tests/live/test_live_convert_to_pdfa.py b/tests/live/test_live_convert_to_pdfa.py new file mode 100644 index 00000000..5d39d009 --- /dev/null +++ b/tests/live/test_live_convert_to_pdfa.py @@ -0,0 +1,147 @@ +from __future__ import annotations + +from typing import cast, get_args + +import pytest + +from pdfrest import AsyncPdfRestClient, PdfRestApiError, PdfRestClient +from pdfrest.models import PdfRestFile +from pdfrest.types import PdfAType + +from ..resources import get_test_resource_path + +PDFA_TYPES: tuple[PdfAType, ...] 
= cast(tuple[PdfAType, ...], get_args(PdfAType)) +PDFA_TYPE_PARAMS = [ + pytest.param(output_type, id=output_type) for output_type in PDFA_TYPES +] + + +@pytest.fixture(scope="module") +def uploaded_pdf_for_pdfa( + pdfrest_api_key: str, + pdfrest_live_base_url: str, +) -> PdfRestFile: + resource = get_test_resource_path("report.pdf") + with PdfRestClient( + api_key=pdfrest_api_key, + base_url=pdfrest_live_base_url, + ) as client: + return client.files.create_from_paths([resource])[0] + + +@pytest.mark.parametrize("output_type", PDFA_TYPE_PARAMS) +def test_live_convert_to_pdfa_success( + pdfrest_api_key: str, + pdfrest_live_base_url: str, + uploaded_pdf_for_pdfa: PdfRestFile, + output_type: PdfAType, +) -> None: + with PdfRestClient( + api_key=pdfrest_api_key, + base_url=pdfrest_live_base_url, + ) as client: + response = client.convert_to_pdfa( + uploaded_pdf_for_pdfa, + output_type=output_type, + output="pdfa-live", + ) + + assert response.output_files + output_file = response.output_file + assert output_file.type == "application/pdf" + assert str(response.input_id) == str(uploaded_pdf_for_pdfa.id) + assert output_file.name.startswith("pdfa-live") + + +@pytest.mark.asyncio +@pytest.mark.parametrize("output_type", PDFA_TYPE_PARAMS) +async def test_live_async_convert_to_pdfa_success( + pdfrest_api_key: str, + pdfrest_live_base_url: str, + uploaded_pdf_for_pdfa: PdfRestFile, + output_type: PdfAType, +) -> None: + async with AsyncPdfRestClient( + api_key=pdfrest_api_key, + base_url=pdfrest_live_base_url, + ) as client: + response = await client.convert_to_pdfa( + uploaded_pdf_for_pdfa, + output_type=output_type, + output="async-pdfa", + ) + + assert response.output_files + output_file = response.output_file + assert output_file.name.startswith("async-pdfa") + assert output_file.type == "application/pdf" + assert str(response.input_id) == str(uploaded_pdf_for_pdfa.id) + + +def test_live_convert_to_pdfa_with_rasterize_option( + pdfrest_api_key: str, + 
pdfrest_live_base_url: str, + uploaded_pdf_for_pdfa: PdfRestFile, +) -> None: + with PdfRestClient( + api_key=pdfrest_api_key, + base_url=pdfrest_live_base_url, + ) as client: + response = client.convert_to_pdfa( + uploaded_pdf_for_pdfa, + output_type="PDF/A-2b", + rasterize_if_errors_encountered="on", + output="pdfa-rasterize", + ) + + assert response.output_files + output_file = response.output_file + assert output_file.name.startswith("pdfa-rasterize") + assert output_file.type == "application/pdf" + assert str(response.input_id) == str(uploaded_pdf_for_pdfa.id) + + +@pytest.mark.parametrize( + "invalid_output_type", + [ + pytest.param("PDF/A-0", id="pdfa-0"), + pytest.param("PDF/A-99", id="pdfa-99"), + pytest.param("pdf/a-2b", id="lowercase"), + ], +) +def test_live_convert_to_pdfa_invalid_output_type( + pdfrest_api_key: str, + pdfrest_live_base_url: str, + uploaded_pdf_for_pdfa: PdfRestFile, + invalid_output_type: str, +) -> None: + with ( + PdfRestClient( + api_key=pdfrest_api_key, + base_url=pdfrest_live_base_url, + ) as client, + pytest.raises(PdfRestApiError, match=r"(?i)pdf.?a"), + ): + client.convert_to_pdfa( + uploaded_pdf_for_pdfa, + output_type="PDF/A-1b", + extra_body={"output_type": invalid_output_type}, + ) + + +@pytest.mark.asyncio +async def test_live_async_convert_to_pdfa_invalid_output_type( + pdfrest_api_key: str, + pdfrest_live_base_url: str, + uploaded_pdf_for_pdfa: PdfRestFile, +) -> None: + async with AsyncPdfRestClient( + api_key=pdfrest_api_key, + base_url=pdfrest_live_base_url, + ) as client: + with pytest.raises(PdfRestApiError, match=r"(?i)pdf.?a"): + await client.convert_to_pdfa( + uploaded_pdf_for_pdfa, + output_type="PDF/A-1b", + extra_body={"output_type": "PDF/A-0"}, + ) diff --git a/tests/live/test_live_convert_to_pdfx.py b/tests/live/test_live_convert_to_pdfx.py index a08088b0..df0e6695 100644 --- a/tests/live/test_live_convert_to_pdfx.py +++ b/tests/live/test_live_convert_to_pdfx.py @@ -4,7 +4,7 @@ import pytest -from pdfrest 
import PdfRestApiError, PdfRestClient +from pdfrest import AsyncPdfRestClient, PdfRestApiError, PdfRestClient from pdfrest.models import PdfRestFile from pdfrest.types import PdfXType @@ -50,6 +50,31 @@ def test_live_convert_to_pdfx_success( assert output_file.name.startswith("pdfx-live") +@pytest.mark.asyncio +@pytest.mark.parametrize("output_type", PDFX_TYPES, ids=list(PDFX_TYPES)) +async def test_live_async_convert_to_pdfx_success( + pdfrest_api_key: str, + pdfrest_live_base_url: str, + uploaded_pdf_for_pdfx: PdfRestFile, + output_type: PdfXType, +) -> None: + async with AsyncPdfRestClient( + api_key=pdfrest_api_key, + base_url=pdfrest_live_base_url, + ) as client: + response = await client.convert_to_pdfx( + uploaded_pdf_for_pdfx, + output_type=output_type, + output="async-pdfx", + ) + + assert response.output_files + output_file = response.output_file + assert output_file.name.startswith("async-pdfx") + assert output_file.type == "application/pdf" + assert str(response.input_id) == str(uploaded_pdf_for_pdfx.id) + + @pytest.mark.parametrize( "invalid_output_type", [ @@ -69,10 +94,28 @@ def test_live_convert_to_pdfx_invalid_output_type( api_key=pdfrest_api_key, base_url=pdfrest_live_base_url, ) as client, - pytest.raises(PdfRestApiError), + pytest.raises(PdfRestApiError, match=r"(?i)pdf.?x"), ): client.convert_to_pdfx( uploaded_pdf_for_pdfx, output_type="PDF/X-1a", extra_body={"output_type": invalid_output_type}, ) + + +@pytest.mark.asyncio +async def test_live_async_convert_to_pdfx_invalid_output_type( + pdfrest_api_key: str, + pdfrest_live_base_url: str, + uploaded_pdf_for_pdfx: PdfRestFile, +) -> None: + async with AsyncPdfRestClient( + api_key=pdfrest_api_key, + base_url=pdfrest_live_base_url, + ) as client: + with pytest.raises(PdfRestApiError, match=r"(?i)pdf.?x"): + await client.convert_to_pdfx( + uploaded_pdf_for_pdfx, + output_type="PDF/X-1a", + extra_body={"output_type": "PDF/X-0"}, + ) diff --git a/tests/live/test_live_convert_to_powerpoint.py 
b/tests/live/test_live_convert_to_powerpoint.py new file mode 100644 index 00000000..8a1209a2 --- /dev/null +++ b/tests/live/test_live_convert_to_powerpoint.py @@ -0,0 +1,120 @@ +from __future__ import annotations + +import pytest + +from pdfrest import AsyncPdfRestClient, PdfRestApiError, PdfRestClient +from pdfrest.models import PdfRestFile + +from ..resources import get_test_resource_path + + +@pytest.fixture(scope="module") +def uploaded_pdf_for_powerpoint( + pdfrest_api_key: str, + pdfrest_live_base_url: str, +) -> PdfRestFile: + resource = get_test_resource_path("report.pdf") + with PdfRestClient( + api_key=pdfrest_api_key, + base_url=pdfrest_live_base_url, + ) as client: + return client.files.create_from_paths([resource])[0] + + +@pytest.mark.parametrize( + "output_name", + [ + pytest.param(None, id="default-output"), + pytest.param("live-powerpoint", id="custom-output"), + ], +) +def test_live_convert_to_powerpoint_success( + pdfrest_api_key: str, + pdfrest_live_base_url: str, + uploaded_pdf_for_powerpoint: PdfRestFile, + output_name: str | None, +) -> None: + kwargs: dict[str, str] = {} + if output_name is not None: + kwargs["output"] = output_name + + with PdfRestClient( + api_key=pdfrest_api_key, + base_url=pdfrest_live_base_url, + ) as client: + response = client.convert_to_powerpoint(uploaded_pdf_for_powerpoint, **kwargs) + + assert response.output_files + output_file = response.output_file + assert ( + output_file.type + == "application/vnd.openxmlformats-officedocument.presentationml.presentation" + ) + assert output_file.size > 0 + assert response.warning is None + assert str(response.input_id) == str(uploaded_pdf_for_powerpoint.id) + if output_name is not None: + assert output_file.name.startswith(output_name) + else: + assert output_file.name.endswith(".pptx") + + +@pytest.mark.asyncio +async def test_live_async_convert_to_powerpoint_success( + pdfrest_api_key: str, + pdfrest_live_base_url: str, + uploaded_pdf_for_powerpoint: PdfRestFile, +) -> 
None: + async with AsyncPdfRestClient( + api_key=pdfrest_api_key, + base_url=pdfrest_live_base_url, + ) as client: + response = await client.convert_to_powerpoint( + uploaded_pdf_for_powerpoint, output="async" + ) + + assert response.output_files + output_file = response.output_file + assert output_file.name.startswith("async") + assert ( + output_file.type + == "application/vnd.openxmlformats-officedocument.presentationml.presentation" + ) + assert output_file.size > 0 + assert response.warning is None + assert str(response.input_id) == str(uploaded_pdf_for_powerpoint.id) + + +def test_live_convert_to_powerpoint_invalid_file_id( + pdfrest_api_key: str, + pdfrest_live_base_url: str, + uploaded_pdf_for_powerpoint: PdfRestFile, +) -> None: + with ( + PdfRestClient( + api_key=pdfrest_api_key, + base_url=pdfrest_live_base_url, + ) as client, + pytest.raises(PdfRestApiError, match=r"(?i)(id|file)"), + ): + client.convert_to_powerpoint( + uploaded_pdf_for_powerpoint, + extra_body={"id": "00000000-0000-0000-0000-000000000000"}, + ) + + +@pytest.mark.asyncio +async def test_live_async_convert_to_powerpoint_invalid_file_id( + pdfrest_api_key: str, + pdfrest_live_base_url: str, + uploaded_pdf_for_powerpoint: PdfRestFile, +) -> None: + async with AsyncPdfRestClient( + api_key=pdfrest_api_key, + base_url=pdfrest_live_base_url, + ) as client: + with pytest.raises(PdfRestApiError, match=r"(?i)(id|file)"): + await client.convert_to_powerpoint( + uploaded_pdf_for_powerpoint, + extra_body={"id": "ffffffff-ffff-ffff-ffff-ffffffffffff"}, + ) diff --git a/tests/live/test_live_convert_to_word.py b/tests/live/test_live_convert_to_word.py index c3c5822e..3ec6a334 100644 --- a/tests/live/test_live_convert_to_word.py +++ b/tests/live/test_live_convert_to_word.py @@ -2,7 +2,7 @@ import pytest -from pdfrest import PdfRestApiError, PdfRestClient +from pdfrest import AsyncPdfRestClient, PdfRestApiError, PdfRestClient from pdfrest.models import PdfRestFile from ..resources import 
get_test_resource_path @@ -57,6 +57,31 @@ def test_live_convert_to_word_success( assert output_file.name.endswith(".docx") +@pytest.mark.asyncio +async def test_live_async_convert_to_word_success( + pdfrest_api_key: str, + pdfrest_live_base_url: str, + uploaded_pdf_for_word: PdfRestFile, +) -> None: + async with AsyncPdfRestClient( + api_key=pdfrest_api_key, + base_url=pdfrest_live_base_url, + ) as client: + response = await client.convert_to_word( + uploaded_pdf_for_word, + output="async-word", + ) + + assert response.output_files + output_file = response.output_file + assert output_file.name.startswith("async-word") + assert ( + output_file.type + == "application/vnd.openxmlformats-officedocument.wordprocessingml.document" + ) + assert str(response.input_id) == str(uploaded_pdf_for_word.id) + + def test_live_convert_to_word_invalid_file_id( pdfrest_api_key: str, pdfrest_live_base_url: str, @@ -67,9 +92,26 @@ def test_live_convert_to_word_invalid_file_id( api_key=pdfrest_api_key, base_url=pdfrest_live_base_url, ) as client, - pytest.raises(PdfRestApiError), + pytest.raises(PdfRestApiError, match=r"(?i)(id|file)"), ): client.convert_to_word( uploaded_pdf_for_word, extra_body={"id": "00000000-0000-0000-0000-000000000000"}, ) + + +@pytest.mark.asyncio +async def test_live_async_convert_to_word_invalid_file_id( + pdfrest_api_key: str, + pdfrest_live_base_url: str, + uploaded_pdf_for_word: PdfRestFile, +) -> None: + async with AsyncPdfRestClient( + api_key=pdfrest_api_key, + base_url=pdfrest_live_base_url, + ) as client: + with pytest.raises(PdfRestApiError, match=r"(?i)(id|file)"): + await client.convert_to_word( + uploaded_pdf_for_word, + extra_body={"id": "ffffffff-ffff-ffff-ffff-ffffffffffff"}, + ) diff --git a/tests/live/test_live_convert_xfa_to_acroforms.py b/tests/live/test_live_convert_xfa_to_acroforms.py new file mode 100644 index 00000000..dba38304 --- /dev/null +++ b/tests/live/test_live_convert_xfa_to_acroforms.py @@ -0,0 +1,126 @@ +from __future__ import 
annotations + +import pytest + +from pdfrest import AsyncPdfRestClient, PdfRestApiError, PdfRestClient +from pdfrest.models import PdfRestFile + +from ..resources import get_test_resource_path + +WARNING_NO_XFA_FORMS = ( + "No XFA forms were detected in the input PDF. No output was produced." +) + + +@pytest.fixture(scope="module") +def uploaded_pdf_for_acroforms( + pdfrest_api_key: str, + pdfrest_live_base_url: str, +) -> PdfRestFile: + resource = get_test_resource_path("report.pdf") + with PdfRestClient( + api_key=pdfrest_api_key, + base_url=pdfrest_live_base_url, + ) as client: + return client.files.create_from_paths([resource])[0] + + +@pytest.mark.parametrize( + "output_name", + [ + pytest.param(None, id="default-output"), + pytest.param("live-acroforms", id="custom-output"), + ], +) +def test_live_convert_xfa_to_acroforms_success( + pdfrest_api_key: str, + pdfrest_live_base_url: str, + uploaded_pdf_for_acroforms: PdfRestFile, + output_name: str | None, +) -> None: + kwargs: dict[str, str] = {} + if output_name is not None: + kwargs["output"] = output_name + + with PdfRestClient( + api_key=pdfrest_api_key, + base_url=pdfrest_live_base_url, + ) as client: + response = client.convert_xfa_to_acroforms(uploaded_pdf_for_acroforms, **kwargs) + + assert str(response.input_id) == str(uploaded_pdf_for_acroforms.id) + if response.warning is not None: + assert response.warning == WARNING_NO_XFA_FORMS + assert response.output_files == [] + return + + assert response.output_files + output_file = response.output_file + assert output_file.type == "application/pdf" + assert output_file.size > 0 + if output_name is not None: + assert output_file.name.startswith(output_name) + else: + assert output_file.name.endswith(".pdf") + + +def test_live_convert_xfa_to_acroforms_invalid_file_id( + pdfrest_api_key: str, + pdfrest_live_base_url: str, + uploaded_pdf_for_acroforms: PdfRestFile, +) -> None: + with ( + PdfRestClient( + api_key=pdfrest_api_key, + base_url=pdfrest_live_base_url, 
+ ) as client, + pytest.raises(PdfRestApiError, match=r"(?i)(id|file)"), + ): + client.convert_xfa_to_acroforms( + uploaded_pdf_for_acroforms, + extra_body={"id": "00000000-0000-0000-0000-000000000000"}, + ) + + +@pytest.mark.asyncio +async def test_live_async_convert_xfa_to_acroforms_success( + pdfrest_api_key: str, + pdfrest_live_base_url: str, + uploaded_pdf_for_acroforms: PdfRestFile, +) -> None: + async with AsyncPdfRestClient( + api_key=pdfrest_api_key, + base_url=pdfrest_live_base_url, + ) as client: + response = await client.convert_xfa_to_acroforms( + uploaded_pdf_for_acroforms, output="async" + ) + + assert str(response.input_id) == str(uploaded_pdf_for_acroforms.id) + if response.warning is not None: + assert response.warning == WARNING_NO_XFA_FORMS + assert response.output_files == [] + return + + assert response.output_files + output_file = response.output_file + assert output_file.name.startswith("async") + assert output_file.type == "application/pdf" + assert output_file.size > 0 + + +@pytest.mark.asyncio +async def test_live_async_convert_xfa_to_acroforms_invalid_file_id( + pdfrest_api_key: str, + pdfrest_live_base_url: str, + uploaded_pdf_for_acroforms: PdfRestFile, +) -> None: + async with AsyncPdfRestClient( + api_key=pdfrest_api_key, + base_url=pdfrest_live_base_url, + ) as client: + with pytest.raises(PdfRestApiError, match=r"(?i)(id|file)"): + await client.convert_xfa_to_acroforms( + uploaded_pdf_for_acroforms, + extra_body={"id": "ffffffff-ffff-ffff-ffff-ffffffffffff"}, + ) diff --git a/tests/live/test_live_delete.py b/tests/live/test_live_delete.py index 75727fef..52bdf6fd 100644 --- a/tests/live/test_live_delete.py +++ b/tests/live/test_live_delete.py @@ -57,7 +57,7 @@ def test_live_delete_files_invalid_id( base_url=pdfrest_live_base_url, ) as client: uploaded = client.files.create_from_paths([resource])[0] - with pytest.raises(ValidationError): + with pytest.raises(ValidationError, match=r"(?i)ids?"): client.files.delete(uploaded, 
extra_body={"ids": token_urlsafe(16)}) @@ -72,7 +72,7 @@ async def test_live_async_delete_files_invalid_id( base_url=pdfrest_live_base_url, ) as client: uploaded = (await client.files.create_from_paths([resource]))[0] - with pytest.raises(ValidationError): + with pytest.raises(ValidationError, match=r"(?i)ids?"): await client.files.delete(uploaded, extra_body={"ids": token_urlsafe(16)}) diff --git a/tests/live/test_live_extract_images.py b/tests/live/test_live_extract_images.py new file mode 100644 index 00000000..3410622a --- /dev/null +++ b/tests/live/test_live_extract_images.py @@ -0,0 +1,95 @@ +from __future__ import annotations + +import pytest + +from pdfrest import AsyncPdfRestClient, PdfRestApiError, PdfRestClient +from pdfrest.models import PdfRestFileBasedResponse + +from ..resources import get_test_resource_path + + +def test_live_extract_images_success( + pdfrest_api_key: str, + pdfrest_live_base_url: str, +) -> None: + resource = get_test_resource_path("duckhat.pdf") + with PdfRestClient( + api_key=pdfrest_api_key, + base_url=pdfrest_live_base_url, + ) as client: + uploaded = client.files.create_from_paths([resource])[0] + response = client.extract_images(uploaded) + + assert isinstance(response, PdfRestFileBasedResponse) + output_files = response.output_files + assert output_files + assert all(file.name for file in output_files) + assert all( + file.type and (file.type.startswith("image/") or file.type == "application/zip") + for file in output_files + ) + assert all(file.size > 0 for file in output_files) + assert response.warning is None + assert response.input_id == uploaded.id + + +@pytest.mark.asyncio +async def test_live_async_extract_images_success( + pdfrest_api_key: str, + pdfrest_live_base_url: str, +) -> None: + resource = get_test_resource_path("duckhat.pdf") + async with AsyncPdfRestClient( + api_key=pdfrest_api_key, + base_url=pdfrest_live_base_url, + ) as client: + uploaded = (await client.files.create_from_paths([resource]))[0] + 
response = await client.extract_images(uploaded, output="async-images") + + assert isinstance(response, PdfRestFileBasedResponse) + output_files = response.output_files + assert output_files + assert output_files[0].name.startswith("async-images") + assert all(file.name for file in output_files) + assert all( + file.type and (file.type.startswith("image/") or file.type == "application/zip") + for file in output_files + ) + assert all(file.size > 0 for file in output_files) + assert response.warning is None + assert response.input_id == uploaded.id + + +def test_live_extract_images_invalid_pages( + pdfrest_api_key: str, + pdfrest_live_base_url: str, +) -> None: + resource = get_test_resource_path("duckhat.pdf") + with PdfRestClient( + api_key=pdfrest_api_key, + base_url=pdfrest_live_base_url, + ) as client: + uploaded = client.files.create_from_paths([resource])[0] + with pytest.raises(PdfRestApiError, match=r"(?i)page"): + client.extract_images( + uploaded, + extra_body={"pages": "last-1"}, + ) + + +@pytest.mark.asyncio +async def test_live_async_extract_images_invalid_pages( + pdfrest_api_key: str, + pdfrest_live_base_url: str, +) -> None: + resource = get_test_resource_path("duckhat.pdf") + async with AsyncPdfRestClient( + api_key=pdfrest_api_key, + base_url=pdfrest_live_base_url, + ) as client: + uploaded = (await client.files.create_from_paths([resource]))[0] + with pytest.raises(PdfRestApiError, match=r"(?i)page"): + await client.extract_images( + uploaded, + extra_body={"pages": "last-1"}, + ) diff --git a/tests/live/test_live_extract_pdf_text_to_file.py b/tests/live/test_live_extract_pdf_text_to_file.py new file mode 100644 index 00000000..d6e58652 --- /dev/null +++ b/tests/live/test_live_extract_pdf_text_to_file.py @@ -0,0 +1,102 @@ +from __future__ import annotations + +import pytest + +from pdfrest import AsyncPdfRestClient, PdfRestApiError, PdfRestClient +from pdfrest.models import PdfRestFileBasedResponse + +from ..resources import 
get_test_resource_path + + +def test_live_extract_pdf_text_to_file_success( + pdfrest_api_key: str, + pdfrest_live_base_url: str, +) -> None: + resource = get_test_resource_path("report.pdf") + with PdfRestClient( + api_key=pdfrest_api_key, + base_url=pdfrest_live_base_url, + ) as client: + uploaded = client.files.create_from_paths([resource])[0] + response = client.extract_pdf_text_to_file( + uploaded, + full_text="document", + preserve_line_breaks="on", + word_style="off", + word_coordinates="off", + ) + + assert isinstance(response, PdfRestFileBasedResponse) + assert response.output_files + output_file = response.output_file + assert output_file.name.endswith(".json") + assert output_file.type == "application/json" + assert output_file.size > 0 + assert response.warning is None + assert response.input_id == uploaded.id + + +@pytest.mark.asyncio +async def test_live_async_extract_pdf_text_to_file_success( + pdfrest_api_key: str, + pdfrest_live_base_url: str, +) -> None: + resource = get_test_resource_path("report.pdf") + async with AsyncPdfRestClient( + api_key=pdfrest_api_key, + base_url=pdfrest_live_base_url, + ) as client: + uploaded = (await client.files.create_from_paths([resource]))[0] + response = await client.extract_pdf_text_to_file( + uploaded, + full_text="document", + preserve_line_breaks="on", + word_style="off", + word_coordinates="off", + output="async-text", + ) + + assert isinstance(response, PdfRestFileBasedResponse) + assert response.output_files + output_file = response.output_file + assert output_file.name.startswith("async-text") + assert output_file.name.endswith(".json") + assert output_file.type == "application/json" + assert output_file.size > 0 + assert response.warning is None + assert response.input_id == uploaded.id + + +def test_live_extract_pdf_text_to_file_invalid_pages( + pdfrest_api_key: str, + pdfrest_live_base_url: str, +) -> None: + resource = get_test_resource_path("report.pdf") + with PdfRestClient( + 
api_key=pdfrest_api_key, + base_url=pdfrest_live_base_url, + ) as client: + uploaded = client.files.create_from_paths([resource])[0] + with pytest.raises(PdfRestApiError, match=r"(?i)page"): + client.extract_pdf_text_to_file( + uploaded, + extra_body={"pages": "last-1"}, + ) + + +@pytest.mark.asyncio +async def test_live_async_extract_pdf_text_to_file_invalid_pages( + pdfrest_api_key: str, + pdfrest_live_base_url: str, +) -> None: + resource = get_test_resource_path("report.pdf") + async with AsyncPdfRestClient( + api_key=pdfrest_api_key, + base_url=pdfrest_live_base_url, + ) as client: + uploaded = (await client.files.create_from_paths([resource]))[0] + with pytest.raises(PdfRestApiError, match=r"(?i)page"): + await client.extract_pdf_text_to_file( + uploaded, + extra_body={"pages": "last-1"}, + ) diff --git a/tests/live/test_live_flatten_annotations.py b/tests/live/test_live_flatten_annotations.py new file mode 100644 index 00000000..b97b08b0 --- /dev/null +++ b/tests/live/test_live_flatten_annotations.py @@ -0,0 +1,114 @@ +from __future__ import annotations + +import pytest + +from pdfrest import AsyncPdfRestClient, PdfRestApiError, PdfRestClient +from pdfrest.models import PdfRestFile + +from ..resources import get_test_resource_path + + +@pytest.fixture(scope="module") +def uploaded_pdf_for_annotations( + pdfrest_api_key: str, + pdfrest_live_base_url: str, +) -> PdfRestFile: + resource = get_test_resource_path("report.pdf") + with PdfRestClient( + api_key=pdfrest_api_key, + base_url=pdfrest_live_base_url, + ) as client: + return client.files.create_from_paths([resource])[0] + + +@pytest.mark.parametrize( + "output_name", + [ + pytest.param(None, id="default-output"), + pytest.param("flatten-annotations", id="custom-output"), + ], +) +def test_live_flatten_annotations_success( + pdfrest_api_key: str, + pdfrest_live_base_url: str, + uploaded_pdf_for_annotations: PdfRestFile, + output_name: str | None, +) -> None: + kwargs: dict[str, str] = {} + if output_name is 
not None: + kwargs["output"] = output_name + + with PdfRestClient( + api_key=pdfrest_api_key, + base_url=pdfrest_live_base_url, + ) as client: + response = client.flatten_annotations(uploaded_pdf_for_annotations, **kwargs) + + assert response.output_files + output_file = response.output_file + assert output_file.type == "application/pdf" + assert output_file.size > 0 + assert response.warning is None + assert str(response.input_id) == str(uploaded_pdf_for_annotations.id) + if output_name is not None: + assert output_file.name.startswith(output_name) + else: + assert output_file.name.endswith(".pdf") + + +@pytest.mark.asyncio +async def test_live_async_flatten_annotations_success( + pdfrest_api_key: str, + pdfrest_live_base_url: str, + uploaded_pdf_for_annotations: PdfRestFile, +) -> None: + async with AsyncPdfRestClient( + api_key=pdfrest_api_key, + base_url=pdfrest_live_base_url, + ) as client: + response = await client.flatten_annotations( + uploaded_pdf_for_annotations, output="async" + ) + + assert response.output_files + output_file = response.output_file + assert output_file.name.startswith("async") + assert output_file.type == "application/pdf" + assert output_file.size > 0 + assert response.warning is None + assert str(response.input_id) == str(uploaded_pdf_for_annotations.id) + + +def test_live_flatten_annotations_invalid_file_id( + pdfrest_api_key: str, + pdfrest_live_base_url: str, + uploaded_pdf_for_annotations: PdfRestFile, +) -> None: + with ( + PdfRestClient( + api_key=pdfrest_api_key, + base_url=pdfrest_live_base_url, + ) as client, + pytest.raises(PdfRestApiError, match=r"(?i)(id|file)"), + ): + client.flatten_annotations( + uploaded_pdf_for_annotations, + extra_body={"id": "00000000-0000-0000-0000-000000000000"}, + ) + + +@pytest.mark.asyncio +async def test_live_async_flatten_annotations_invalid_file_id( + pdfrest_api_key: str, + pdfrest_live_base_url: str, + uploaded_pdf_for_annotations: PdfRestFile, +) -> None: + async with AsyncPdfRestClient( 
+ api_key=pdfrest_api_key, + base_url=pdfrest_live_base_url, + ) as client: + with pytest.raises(PdfRestApiError, match=r"(?i)(id|file)"): + await client.flatten_annotations( + uploaded_pdf_for_annotations, + extra_body={"id": "ffffffff-ffff-ffff-ffff-ffffffffffff"}, + ) diff --git a/tests/live/test_live_flatten_layers.py b/tests/live/test_live_flatten_layers.py new file mode 100644 index 00000000..7343cab7 --- /dev/null +++ b/tests/live/test_live_flatten_layers.py @@ -0,0 +1,112 @@ +from __future__ import annotations + +import pytest + +from pdfrest import AsyncPdfRestClient, PdfRestApiError, PdfRestClient +from pdfrest.models import PdfRestFile + +from ..resources import get_test_resource_path + + +@pytest.fixture(scope="module") +def uploaded_pdf_for_layers( + pdfrest_api_key: str, + pdfrest_live_base_url: str, +) -> PdfRestFile: + resource = get_test_resource_path("report.pdf") + with PdfRestClient( + api_key=pdfrest_api_key, + base_url=pdfrest_live_base_url, + ) as client: + return client.files.create_from_paths([resource])[0] + + +@pytest.mark.parametrize( + "output_name", + [ + pytest.param(None, id="default-output"), + pytest.param("flatten-layers", id="custom-output"), + ], +) +def test_live_flatten_layers_success( + pdfrest_api_key: str, + pdfrest_live_base_url: str, + uploaded_pdf_for_layers: PdfRestFile, + output_name: str | None, +) -> None: + kwargs: dict[str, str] = {} + if output_name is not None: + kwargs["output"] = output_name + + with PdfRestClient( + api_key=pdfrest_api_key, + base_url=pdfrest_live_base_url, + ) as client: + response = client.flatten_layers(uploaded_pdf_for_layers, **kwargs) + + assert response.output_files + output_file = response.output_file + assert output_file.type == "application/pdf" + assert output_file.size > 0 + assert response.warning is None + assert str(response.input_id) == str(uploaded_pdf_for_layers.id) + if output_name is not None: + assert output_file.name.startswith(output_name) + else: + assert 
output_file.name.endswith(".pdf") + + +@pytest.mark.asyncio +async def test_live_async_flatten_layers_success( + pdfrest_api_key: str, + pdfrest_live_base_url: str, + uploaded_pdf_for_layers: PdfRestFile, +) -> None: + async with AsyncPdfRestClient( + api_key=pdfrest_api_key, + base_url=pdfrest_live_base_url, + ) as client: + response = await client.flatten_layers(uploaded_pdf_for_layers, output="async") + + assert response.output_files + output_file = response.output_file + assert output_file.name.startswith("async") + assert output_file.type == "application/pdf" + assert output_file.size > 0 + assert response.warning is None + assert str(response.input_id) == str(uploaded_pdf_for_layers.id) + + +def test_live_flatten_layers_invalid_file_id( + pdfrest_api_key: str, + pdfrest_live_base_url: str, + uploaded_pdf_for_layers: PdfRestFile, +) -> None: + with ( + PdfRestClient( + api_key=pdfrest_api_key, + base_url=pdfrest_live_base_url, + ) as client, + pytest.raises(PdfRestApiError, match=r"(?i)(id|file)"), + ): + client.flatten_layers( + uploaded_pdf_for_layers, + extra_body={"id": "00000000-0000-0000-0000-000000000000"}, + ) + + +@pytest.mark.asyncio +async def test_live_async_flatten_layers_invalid_file_id( + pdfrest_api_key: str, + pdfrest_live_base_url: str, + uploaded_pdf_for_layers: PdfRestFile, +) -> None: + async with AsyncPdfRestClient( + api_key=pdfrest_api_key, + base_url=pdfrest_live_base_url, + ) as client: + with pytest.raises(PdfRestApiError, match=r"(?i)(id|file)"): + await client.flatten_layers( + uploaded_pdf_for_layers, + extra_body={"id": "ffffffff-ffff-ffff-ffff-ffffffffffff"}, + ) diff --git a/tests/live/test_live_flatten_pdf_forms.py b/tests/live/test_live_flatten_pdf_forms.py index c6ad7fdb..5bff7304 100644 --- a/tests/live/test_live_flatten_pdf_forms.py +++ b/tests/live/test_live_flatten_pdf_forms.py @@ -2,7 +2,7 @@ import pytest -from pdfrest import PdfRestApiError, PdfRestClient +from pdfrest import AsyncPdfRestClient, PdfRestApiError, 
PdfRestClient from pdfrest.models import PdfRestFile from ..resources import get_test_resource_path @@ -54,6 +54,28 @@ def test_live_flatten_pdf_forms( assert output_file.name.endswith(".pdf") +@pytest.mark.asyncio +async def test_live_async_flatten_pdf_forms_success( + pdfrest_api_key: str, + pdfrest_live_base_url: str, + uploaded_pdf_with_forms: PdfRestFile, +) -> None: + async with AsyncPdfRestClient( + api_key=pdfrest_api_key, + base_url=pdfrest_live_base_url, + ) as client: + response = await client.flatten_pdf_forms( + uploaded_pdf_with_forms, + output="async-flattened", + ) + + assert response.output_files + output_file = response.output_file + assert output_file.name.startswith("async-flattened") + assert output_file.type == "application/pdf" + assert str(response.input_id) == str(uploaded_pdf_with_forms.id) + + def test_live_flatten_pdf_forms_invalid_file_id( pdfrest_api_key: str, pdfrest_live_base_url: str, @@ -64,9 +86,26 @@ def test_live_flatten_pdf_forms_invalid_file_id( api_key=pdfrest_api_key, base_url=pdfrest_live_base_url, ) as client, - pytest.raises(PdfRestApiError), + pytest.raises(PdfRestApiError, match=r"(?i)(id|file)"), ): client.flatten_pdf_forms( uploaded_pdf_with_forms, extra_body={"id": "ffffffff-ffff-ffff-ffff-ffffffffffff"}, ) + + +@pytest.mark.asyncio +async def test_live_async_flatten_pdf_forms_invalid_file_id( + pdfrest_api_key: str, + pdfrest_live_base_url: str, + uploaded_pdf_with_forms: PdfRestFile, +) -> None: + async with AsyncPdfRestClient( + api_key=pdfrest_api_key, + base_url=pdfrest_live_base_url, + ) as client: + with pytest.raises(PdfRestApiError, match=r"(?i)(id|file)"): + await client.flatten_pdf_forms( + uploaded_pdf_with_forms, + extra_body={"id": "00000000-0000-0000-0000-000000000000"}, + ) diff --git a/tests/live/test_live_flatten_transparencies.py b/tests/live/test_live_flatten_transparencies.py new file mode 100644 index 00000000..7da1eb40 --- /dev/null +++ b/tests/live/test_live_flatten_transparencies.py @@ -0,0 
+1,117 @@ +from __future__ import annotations + +import pytest + +from pdfrest import AsyncPdfRestClient, PdfRestApiError, PdfRestClient +from pdfrest.models import PdfRestFile + +from ..resources import get_test_resource_path + + +@pytest.fixture(scope="module") +def uploaded_pdf_for_transparencies( + pdfrest_api_key: str, + pdfrest_live_base_url: str, +) -> PdfRestFile: + resource = get_test_resource_path("report.pdf") + with PdfRestClient( + api_key=pdfrest_api_key, + base_url=pdfrest_live_base_url, + ) as client: + return client.files.create_from_paths([resource])[0] + + +@pytest.mark.parametrize( + ("output_name", "quality"), + [ + pytest.param(None, "medium", id="default-output"), + pytest.param("flatten-transparency", "high", id="custom-output-high"), + ], +) +def test_live_flatten_transparencies_success( + pdfrest_api_key: str, + pdfrest_live_base_url: str, + uploaded_pdf_for_transparencies: PdfRestFile, + output_name: str | None, + quality: str, +) -> None: + kwargs: dict[str, str] = {"quality": quality} + if output_name is not None: + kwargs["output"] = output_name + + with PdfRestClient( + api_key=pdfrest_api_key, + base_url=pdfrest_live_base_url, + ) as client: + response = client.flatten_transparencies( + uploaded_pdf_for_transparencies, **kwargs + ) + + assert response.output_files + output_file = response.output_file + assert output_file.type == "application/pdf" + assert output_file.size > 0 + assert response.warning is None + assert str(response.input_id) == str(uploaded_pdf_for_transparencies.id) + if output_name is not None: + assert output_file.name.startswith(output_name) + else: + assert output_file.name.endswith(".pdf") + + +@pytest.mark.asyncio +async def test_live_async_flatten_transparencies_success( + pdfrest_api_key: str, + pdfrest_live_base_url: str, + uploaded_pdf_for_transparencies: PdfRestFile, +) -> None: + async with AsyncPdfRestClient( + api_key=pdfrest_api_key, + base_url=pdfrest_live_base_url, + ) as client: + response = await 
client.flatten_transparencies( + uploaded_pdf_for_transparencies, output="async", quality="low" + ) + + assert response.output_files + output_file = response.output_file + assert output_file.name.startswith("async") + assert output_file.type == "application/pdf" + assert output_file.size > 0 + assert response.warning is None + assert str(response.input_id) == str(uploaded_pdf_for_transparencies.id) + + +def test_live_flatten_transparencies_invalid_file_id( + pdfrest_api_key: str, + pdfrest_live_base_url: str, + uploaded_pdf_for_transparencies: PdfRestFile, +) -> None: + with ( + PdfRestClient( + api_key=pdfrest_api_key, + base_url=pdfrest_live_base_url, + ) as client, + pytest.raises(PdfRestApiError, match=r"(?i)(id|file)"), + ): + client.flatten_transparencies( + uploaded_pdf_for_transparencies, + extra_body={"id": "00000000-0000-0000-0000-000000000000"}, + ) + + +@pytest.mark.asyncio +async def test_live_async_flatten_transparencies_invalid_file_id( + pdfrest_api_key: str, + pdfrest_live_base_url: str, + uploaded_pdf_for_transparencies: PdfRestFile, +) -> None: + async with AsyncPdfRestClient( + api_key=pdfrest_api_key, + base_url=pdfrest_live_base_url, + ) as client: + with pytest.raises(PdfRestApiError, match=r"(?i)(id|file)"): + await client.flatten_transparencies( + uploaded_pdf_for_transparencies, + extra_body={"id": "ffffffff-ffff-ffff-ffff-ffffffffffff"}, + ) diff --git a/tests/live/test_live_graphic_conversions.py b/tests/live/test_live_graphic_conversions.py index 2b68edb3..a78f8d0f 100644 --- a/tests/live/test_live_graphic_conversions.py +++ b/tests/live/test_live_graphic_conversions.py @@ -5,7 +5,7 @@ import pytest -from pdfrest import PdfRestApiError, PdfRestClient +from pdfrest import AsyncPdfRestClient, PdfRestApiError, PdfRestClient from pdfrest.models import PdfRestFile from pdfrest.models._internal import ( BasePdfRestGraphicPayload, @@ -121,6 +121,28 @@ def uploaded_20_page_pdf( return client.files.create_from_paths([resource])[0] 
+@pytest.mark.asyncio +async def test_live_async_convert_to_png_success( + pdfrest_api_key: str, + pdfrest_live_base_url: str, +) -> None: + resource = get_test_resource_path("report.pdf") + async with AsyncPdfRestClient( + api_key=pdfrest_api_key, + base_url=pdfrest_live_base_url, + ) as client: + uploaded = (await client.files.create_from_paths([resource]))[0] + response = await client.convert_to_png( + uploaded, + output_prefix="async-png", + resolution=150, + ) + + assert response.output_files + assert all(file_info.type == "image/png" for file_info in response.output_files) + assert str(response.input_id) == str(uploaded.id) + + @pytest.mark.parametrize( ("_endpoint_label", "spec", "color_model"), _valid_color_cases(), @@ -168,7 +190,7 @@ def test_live_graphic_invalid_color_model( uploaded = client.files.create_from_paths([resource])[0] client_method = getattr(client, spec.method_name) resolution = _resolution_bounds(payload_model)[0] - with pytest.raises(PdfRestApiError): + with pytest.raises(PdfRestApiError, match=r"(?i)color"): client_method( uploaded, resolution=resolution, @@ -213,7 +235,7 @@ def test_live_graphic_resolution_bounds( if should_raise: call_kwargs["extra_body"] = {"resolution": base_resolution + offset} - with pytest.raises(PdfRestApiError): + with pytest.raises(PdfRestApiError, match=r"(?i)resolution"): client_method(uploaded, **call_kwargs) else: response = client_method(uploaded, **call_kwargs) @@ -261,7 +283,7 @@ def test_live_graphic_invalid_smoothing( ) as client: uploaded = client.files.create_from_paths([resource])[0] client_method = getattr(client, spec.method_name) - with pytest.raises(PdfRestApiError): + with pytest.raises(PdfRestApiError, match=r"(?i)smooth"): client_method( uploaded, smoothing="none", @@ -269,6 +291,25 @@ def test_live_graphic_invalid_smoothing( ) +@pytest.mark.asyncio +async def test_live_async_graphic_invalid_smoothing( + pdfrest_api_key: str, + pdfrest_live_base_url: str, +) -> None: + resource = 
get_test_resource_path("report.pdf") + async with AsyncPdfRestClient( + api_key=pdfrest_api_key, + base_url=pdfrest_live_base_url, + ) as client: + uploaded = (await client.files.create_from_paths([resource]))[0] + with pytest.raises(PdfRestApiError, match=r"(?i)smooth"): + await client.convert_to_png( + uploaded, + smoothing="none", + extra_body={"smoothing": "super-smooth"}, + ) + + @pytest.mark.parametrize( ("page_range", "expect_success"), [ @@ -316,7 +357,7 @@ def test_live_png_page_range_variants( ) assert str(response.input_id) == str(uploaded_20_page_pdf.id) else: - with pytest.raises(PdfRestApiError): + with pytest.raises(PdfRestApiError, match=r"(?i)page"): client.convert_to_png( uploaded_20_page_pdf, output_prefix=f"live-range-{case_id}", @@ -348,7 +389,10 @@ def test_live_png_page_range_invalid_overrides( api_key=pdfrest_api_key, base_url=pdfrest_live_base_url, ) as client, - pytest.raises(PdfRestApiError), + pytest.raises( + PdfRestApiError, + match=r"There was an issue processing your file\. 
Validate all fields and try again\.", + ), ): client.convert_to_png( uploaded_20_page_pdf, diff --git a/tests/live/test_live_linearize_pdf.py b/tests/live/test_live_linearize_pdf.py new file mode 100644 index 00000000..523ea0d5 --- /dev/null +++ b/tests/live/test_live_linearize_pdf.py @@ -0,0 +1,115 @@ +from __future__ import annotations + +import pytest + +from pdfrest import AsyncPdfRestClient, PdfRestApiError, PdfRestClient +from pdfrest.models import PdfRestFile + +from ..resources import get_test_resource_path + + +@pytest.fixture(scope="module") +def uploaded_pdf_for_linearize( + pdfrest_api_key: str, + pdfrest_live_base_url: str, +) -> PdfRestFile: + resource = get_test_resource_path("report.pdf") + with PdfRestClient( + api_key=pdfrest_api_key, + base_url=pdfrest_live_base_url, + ) as client: + return client.files.create_from_paths([resource])[0] + + +@pytest.mark.parametrize( + "output_name", + [ + pytest.param(None, id="default-output"), + pytest.param("linearized-live", id="custom-output"), + ], +) +def test_live_linearize_pdf( + pdfrest_api_key: str, + pdfrest_live_base_url: str, + uploaded_pdf_for_linearize: PdfRestFile, + output_name: str | None, +) -> None: + kwargs: dict[str, str] = {} + if output_name is not None: + kwargs["output"] = output_name + + with PdfRestClient( + api_key=pdfrest_api_key, + base_url=pdfrest_live_base_url, + ) as client: + response = client.linearize_pdf(uploaded_pdf_for_linearize, **kwargs) + + assert response.output_files + output_file = response.output_file + assert output_file.type == "application/pdf" + assert output_file.size > 0 + assert response.warning is None + assert str(response.input_id) == str(uploaded_pdf_for_linearize.id) + if output_name is not None: + assert output_file.name.startswith(output_name) + else: + assert output_file.name.endswith(".pdf") + + +@pytest.mark.asyncio +async def test_live_async_linearize_pdf( + pdfrest_api_key: str, + pdfrest_live_base_url: str, + uploaded_pdf_for_linearize: 
PdfRestFile, +) -> None: + async with AsyncPdfRestClient( + api_key=pdfrest_api_key, + base_url=pdfrest_live_base_url, + ) as client: + response = await client.linearize_pdf( + uploaded_pdf_for_linearize, + output="async-linearized", + ) + + assert response.output_files + output_file = response.output_file + assert output_file.name.startswith("async-linearized") + assert output_file.type == "application/pdf" + assert output_file.size > 0 + assert response.warning is None + assert str(response.input_id) == str(uploaded_pdf_for_linearize.id) + + +def test_live_linearize_pdf_invalid_file_id( + pdfrest_api_key: str, + pdfrest_live_base_url: str, + uploaded_pdf_for_linearize: PdfRestFile, +) -> None: + with ( + PdfRestClient( + api_key=pdfrest_api_key, + base_url=pdfrest_live_base_url, + ) as client, + pytest.raises(PdfRestApiError, match=r"(?i)(id|file)"), + ): + client.linearize_pdf( + uploaded_pdf_for_linearize, + extra_body={"id": "ffffffff-ffff-ffff-ffff-ffffffffffff"}, + ) + + +@pytest.mark.asyncio +async def test_live_async_linearize_pdf_invalid_file_id( + pdfrest_api_key: str, + pdfrest_live_base_url: str, + uploaded_pdf_for_linearize: PdfRestFile, +) -> None: + async with AsyncPdfRestClient( + api_key=pdfrest_api_key, + base_url=pdfrest_live_base_url, + ) as client: + with pytest.raises(PdfRestApiError, match=r"(?i)(id|file)"): + await client.linearize_pdf( + uploaded_pdf_for_linearize, + extra_body={"id": "00000000-0000-0000-0000-000000000000"}, + ) diff --git a/tests/live/test_live_ocr_pdf.py b/tests/live/test_live_ocr_pdf.py new file mode 100644 index 00000000..5e9ede14 --- /dev/null +++ b/tests/live/test_live_ocr_pdf.py @@ -0,0 +1,87 @@ +from __future__ import annotations + +import pytest + +from pdfrest import AsyncPdfRestClient, PdfRestApiError, PdfRestClient +from pdfrest.models import PdfRestFileBasedResponse + +from ..resources import get_test_resource_path + + +def test_live_ocr_pdf_success( + pdfrest_api_key: str, + pdfrest_live_base_url: str, +) -> 
None: + resource = get_test_resource_path("report.pdf") + with PdfRestClient( + api_key=pdfrest_api_key, + base_url=pdfrest_live_base_url, + ) as client: + uploaded = client.files.create_from_paths([resource])[0] + response = client.ocr_pdf(uploaded, languages=["English", "German"]) + + assert isinstance(response, PdfRestFileBasedResponse) + assert response.output_files + output_file = response.output_file + assert output_file.name.endswith(".pdf") + assert output_file.type == "application/pdf" + assert output_file.size > 0 + assert response.warning is None + assert response.input_id == uploaded.id + + +@pytest.mark.asyncio +async def test_live_async_ocr_pdf_success( + pdfrest_api_key: str, + pdfrest_live_base_url: str, +) -> None: + resource = get_test_resource_path("report.pdf") + async with AsyncPdfRestClient( + api_key=pdfrest_api_key, + base_url=pdfrest_live_base_url, + ) as client: + uploaded = (await client.files.create_from_paths([resource]))[0] + response = await client.ocr_pdf(uploaded, output="async-ocr") + + assert isinstance(response, PdfRestFileBasedResponse) + assert response.output_files + assert response.output_file.name.startswith("async-ocr") + assert response.output_file.type == "application/pdf" + assert response.output_file.size > 0 + assert response.warning is None + assert response.input_id == uploaded.id + + +def test_live_ocr_pdf_invalid_pages( + pdfrest_api_key: str, + pdfrest_live_base_url: str, +) -> None: + resource = get_test_resource_path("report.pdf") + with PdfRestClient( + api_key=pdfrest_api_key, + base_url=pdfrest_live_base_url, + ) as client: + uploaded = client.files.create_from_paths([resource])[0] + with pytest.raises(PdfRestApiError, match=r"(?i)page"): + client.ocr_pdf( + uploaded, + extra_body={"pages": "last-1"}, + ) + + +@pytest.mark.asyncio +async def test_live_async_ocr_pdf_invalid_pages( + pdfrest_api_key: str, + pdfrest_live_base_url: str, +) -> None: + resource = get_test_resource_path("report.pdf") + async with 
AsyncPdfRestClient( + api_key=pdfrest_api_key, + base_url=pdfrest_live_base_url, + ) as client: + uploaded = (await client.files.create_from_paths([resource]))[0] + with pytest.raises(PdfRestApiError, match=r"(?i)page"): + await client.ocr_pdf( + uploaded, + extra_body={"pages": "last-1"}, + ) diff --git a/tests/live/test_live_pdf_info.py b/tests/live/test_live_pdf_info.py index 977fe87d..7ec91828 100644 --- a/tests/live/test_live_pdf_info.py +++ b/tests/live/test_live_pdf_info.py @@ -111,7 +111,7 @@ def test_live_pdf_info_invalid_query( PdfRestClient( api_key=pdfrest_api_key, base_url=pdfrest_live_base_url ) as client, - pytest.raises(PdfRestApiError), + pytest.raises(PdfRestApiError, match=r"(?i)quer"), ): client.query_pdf_info( uploaded_pdf, diff --git a/tests/live/test_live_pdf_redactions.py b/tests/live/test_live_pdf_redactions.py index 796785a1..3fda6d42 100644 --- a/tests/live/test_live_pdf_redactions.py +++ b/tests/live/test_live_pdf_redactions.py @@ -4,7 +4,7 @@ import pytest -from pdfrest import PdfRestApiError, PdfRestClient +from pdfrest import AsyncPdfRestClient, PdfRestApiError, PdfRestClient from pdfrest.models import PdfRestFile from pdfrest.types import PdfRedactionInstruction, PdfRedactionPreset @@ -135,6 +135,36 @@ def test_live_redaction_preview_and_apply_multiple( assert final_file.type == "application/pdf" +@pytest.mark.asyncio +async def test_live_async_redaction_preview_and_apply( + pdfrest_api_key: str, + pdfrest_live_base_url: str, + uploaded_pdf_for_redaction: PdfRestFile, +) -> None: + async with AsyncPdfRestClient( + api_key=pdfrest_api_key, + base_url=pdfrest_live_base_url, + ) as client: + preview = await client.preview_redactions( + uploaded_pdf_for_redaction, + redactions=[{"type": "literal", "value": "quick brown fox"}], + output="async-redaction-preview", + ) + + preview_file = preview.output_files[0] + applied = await client.apply_redactions( + preview_file, + output="async-redaction-final", + ) + + assert preview.output_files + 
assert preview_file.name.endswith("async-redaction-preview.pdf") + assert applied.output_files + final_file = applied.output_files[0] + assert final_file.name.endswith("async-redaction-final.pdf") + assert final_file.type == "application/pdf" + + @pytest.mark.parametrize( "extra_body", [ @@ -153,7 +183,13 @@ def test_live_redactions_invalid_payloads( base_url=pdfrest_live_base_url, ) as client: if "redactions" in extra_body: - with pytest.raises(PdfRestApiError): + with pytest.raises( + PdfRestApiError, + match=( + r"The JSON data provided is not properly formatted\. Please check " + r"your syntax and try again\." + ), + ): client.preview_redactions( uploaded_pdf_for_redaction, redactions=[{"type": "literal", "value": "placeholder"}], @@ -165,5 +201,23 @@ def test_live_redactions_invalid_payloads( redactions=[{"type": "literal", "value": "placeholder"}], ) preview_file = preview.output_files[0] - with pytest.raises(PdfRestApiError): + with pytest.raises(PdfRestApiError, match=r"(?i)rgb"): client.apply_redactions(preview_file, extra_body=extra_body) + + +@pytest.mark.asyncio +async def test_live_async_redactions_invalid_payloads( + pdfrest_api_key: str, + pdfrest_live_base_url: str, + uploaded_pdf_for_redaction: PdfRestFile, +) -> None: + async with AsyncPdfRestClient( + api_key=pdfrest_api_key, + base_url=pdfrest_live_base_url, + ) as client: + with pytest.raises(PdfRestApiError, match=r"(?i)rgb"): + await client.preview_redactions( + uploaded_pdf_for_redaction, + redactions=[{"type": "literal", "value": "placeholder"}], + extra_body={"rgb_color": "-1,-1,-1"}, + ) diff --git a/tests/live/test_live_pdf_split_merge.py b/tests/live/test_live_pdf_split_merge.py index 979f7b1e..5a58912c 100644 --- a/tests/live/test_live_pdf_split_merge.py +++ b/tests/live/test_live_pdf_split_merge.py @@ -198,7 +198,7 @@ def test_live_split_pdf_invalid_pages( api_key=pdfrest_api_key, base_url=pdfrest_live_base_url, ) as client, - pytest.raises(PdfRestApiError), + 
pytest.raises(PdfRestApiError, match=r"(?i)page"), ): client.split_pdf( split_source, @@ -270,7 +270,7 @@ def test_live_merge_pdfs_invalid_pages( api_key=pdfrest_api_key, base_url=pdfrest_live_base_url, ) as client, - pytest.raises(PdfRestApiError), + pytest.raises(PdfRestApiError, match=r"(?i)page"), ): client.merge_pdfs( sources, @@ -373,7 +373,7 @@ def test_live_split_pdf_page_range_variants( output_pages = client.query_pdf_info(response.output_files[0]).page_count assert output_pages == len(expected_pages) else: - with pytest.raises(PdfRestApiError): + with pytest.raises(PdfRestApiError, match=r"(?i)page"): client.split_pdf( split_source, page_groups=[selection if not requires_override else "1"], @@ -446,7 +446,7 @@ def test_live_merge_pdf_page_range_variants( output_info = client.query_pdf_info(response.output_file) assert output_info.page_count == expected_total_pages else: - with pytest.raises(PdfRestApiError): + with pytest.raises(PdfRestApiError, match=r"(?i)page"): client.merge_pdfs( sources, output_prefix=f"live-merge-range-{case_id}", diff --git a/tests/live/test_live_rasterize_pdf.py b/tests/live/test_live_rasterize_pdf.py new file mode 100644 index 00000000..df7cb260 --- /dev/null +++ b/tests/live/test_live_rasterize_pdf.py @@ -0,0 +1,114 @@ +from __future__ import annotations + +import pytest + +from pdfrest import AsyncPdfRestClient, PdfRestApiError, PdfRestClient +from pdfrest.models import PdfRestFile + +from ..resources import get_test_resource_path + + +@pytest.fixture(scope="module") +def uploaded_pdf_for_rasterize( + pdfrest_api_key: str, + pdfrest_live_base_url: str, +) -> PdfRestFile: + resource = get_test_resource_path("report.pdf") + with PdfRestClient( + api_key=pdfrest_api_key, + base_url=pdfrest_live_base_url, + ) as client: + return client.files.create_from_paths([resource])[0] + + +@pytest.mark.parametrize( + "output_name", + [ + pytest.param(None, id="default-output"), + pytest.param("rasterized-live", id="custom-output"), + ], +) 
+def test_live_rasterize_pdf_success( + pdfrest_api_key: str, + pdfrest_live_base_url: str, + uploaded_pdf_for_rasterize: PdfRestFile, + output_name: str | None, +) -> None: + kwargs: dict[str, str] = {} + if output_name is not None: + kwargs["output"] = output_name + + with PdfRestClient( + api_key=pdfrest_api_key, + base_url=pdfrest_live_base_url, + ) as client: + response = client.rasterize_pdf(uploaded_pdf_for_rasterize, **kwargs) + + assert response.output_files + output_file = response.output_file + assert output_file.type == "application/pdf" + assert output_file.size > 0 + assert response.warning is None + assert str(response.input_id) == str(uploaded_pdf_for_rasterize.id) + if output_name is not None: + assert output_file.name.startswith(output_name) + else: + assert output_file.name.endswith(".pdf") + + +@pytest.mark.asyncio +async def test_live_async_rasterize_pdf_success( + pdfrest_api_key: str, + pdfrest_live_base_url: str, + uploaded_pdf_for_rasterize: PdfRestFile, +) -> None: + async with AsyncPdfRestClient( + api_key=pdfrest_api_key, + base_url=pdfrest_live_base_url, + ) as client: + response = await client.rasterize_pdf( + uploaded_pdf_for_rasterize, output="async" + ) + + assert response.output_files + output_file = response.output_file + assert output_file.name.startswith("async") + assert output_file.type == "application/pdf" + assert output_file.size > 0 + assert response.warning is None + assert str(response.input_id) == str(uploaded_pdf_for_rasterize.id) + + +def test_live_rasterize_pdf_invalid_file_id( + pdfrest_api_key: str, + pdfrest_live_base_url: str, + uploaded_pdf_for_rasterize: PdfRestFile, +) -> None: + with ( + PdfRestClient( + api_key=pdfrest_api_key, + base_url=pdfrest_live_base_url, + ) as client, + pytest.raises(PdfRestApiError, match=r"(?i)(id|file)"), + ): + client.rasterize_pdf( + uploaded_pdf_for_rasterize, + extra_body={"id": "00000000-0000-0000-0000-000000000000"}, + ) + + +@pytest.mark.asyncio +async def 
test_live_async_rasterize_pdf_invalid_file_id( + pdfrest_api_key: str, + pdfrest_live_base_url: str, + uploaded_pdf_for_rasterize: PdfRestFile, +) -> None: + async with AsyncPdfRestClient( + api_key=pdfrest_api_key, + base_url=pdfrest_live_base_url, + ) as client: + with pytest.raises(PdfRestApiError, match=r"(?i)(id|file)"): + await client.rasterize_pdf( + uploaded_pdf_for_rasterize, + extra_body={"id": "ffffffff-ffff-ffff-ffff-ffffffffffff"}, + ) diff --git a/tests/live/test_live_summarize_pdf_text.py b/tests/live/test_live_summarize_pdf_text.py new file mode 100644 index 00000000..629c815a --- /dev/null +++ b/tests/live/test_live_summarize_pdf_text.py @@ -0,0 +1,112 @@ +from __future__ import annotations + +import pytest + +from pdfrest import AsyncPdfRestClient, PdfRestApiError, PdfRestClient +from pdfrest.models import PdfRestFileBasedResponse, SummarizePdfTextResponse + +from ..resources import get_test_resource_path + + +def test_live_summarize_text_success( + pdfrest_api_key: str, + pdfrest_live_base_url: str, +) -> None: + resource = get_test_resource_path("report.pdf") + with PdfRestClient( + api_key=pdfrest_api_key, + base_url=pdfrest_live_base_url, + ) as client: + uploaded = client.files.create_from_paths([resource])[0] + response = client.summarize_text( + uploaded, + target_word_count=40, + summary_format="overview", + ) + + assert isinstance(response, SummarizePdfTextResponse) + assert response.summary + assert response.input_id == uploaded.id + + +def test_live_summarize_text_to_file_success( + pdfrest_api_key: str, + pdfrest_live_base_url: str, +) -> None: + resource = get_test_resource_path("report.pdf") + with PdfRestClient( + api_key=pdfrest_api_key, + base_url=pdfrest_live_base_url, + ) as client: + uploaded = client.files.create_from_paths([resource])[0] + response = client.summarize_text_to_file( + uploaded, + target_word_count=40, + summary_format="overview", + ) + + assert isinstance(response, PdfRestFileBasedResponse) + assert 
response.output_files + output_file = response.output_file + assert output_file.name.endswith(".md") + assert output_file.type == "text/markdown" + assert output_file.size > 0 + assert response.warning is None + assert response.input_id == uploaded.id + + +@pytest.mark.asyncio +async def test_live_async_summarize_text_success( + pdfrest_api_key: str, + pdfrest_live_base_url: str, +) -> None: + resource = get_test_resource_path("report.pdf") + async with AsyncPdfRestClient( + api_key=pdfrest_api_key, + base_url=pdfrest_live_base_url, + ) as client: + uploaded = (await client.files.create_from_paths([resource]))[0] + response = await client.summarize_text( + uploaded, + target_word_count=30, + summary_format="overview", + ) + + assert isinstance(response, SummarizePdfTextResponse) + assert response.summary + assert response.input_id == uploaded.id + + +def test_live_summarize_text_invalid_format( + pdfrest_api_key: str, + pdfrest_live_base_url: str, +) -> None: + resource = get_test_resource_path("report.pdf") + with PdfRestClient( + api_key=pdfrest_api_key, + base_url=pdfrest_live_base_url, + ) as client: + uploaded = client.files.create_from_paths([resource])[0] + with pytest.raises(PdfRestApiError, match=r"(?i)summary"): + client.summarize_text( + uploaded, + extra_body={"summary_format": "invalid-style"}, + ) + + +@pytest.mark.asyncio +async def test_live_async_summarize_text_invalid_format( + pdfrest_api_key: str, + pdfrest_live_base_url: str, +) -> None: + resource = get_test_resource_path("report.pdf") + async with AsyncPdfRestClient( + api_key=pdfrest_api_key, + base_url=pdfrest_live_base_url, + ) as client: + uploaded = (await client.files.create_from_paths([resource]))[0] + with pytest.raises(PdfRestApiError, match=r"(?i)summary"): + await client.summarize_text( + uploaded, + extra_body={"summary_format": "invalid-style"}, + ) diff --git a/tests/live/test_live_translate_pdf_text.py b/tests/live/test_live_translate_pdf_text.py new file mode 100644 index 
00000000..00701242 --- /dev/null +++ b/tests/live/test_live_translate_pdf_text.py @@ -0,0 +1,128 @@ +from __future__ import annotations + +import pytest + +from pdfrest import AsyncPdfRestClient, PdfRestApiError, PdfRestClient +from pdfrest.models import ( + TranslatePdfTextFileResponse, + TranslatePdfTextResponse, +) + +from ..resources import get_test_resource_path + + +def test_live_translate_pdf_text_success( + pdfrest_api_key: str, + pdfrest_live_base_url: str, +) -> None: + resource = get_test_resource_path("report.pdf") + with PdfRestClient( + api_key=pdfrest_api_key, + base_url=pdfrest_live_base_url, + ) as client: + uploaded = client.files.create_from_paths([resource])[0] + response = client.translate_pdf_text( + uploaded, + output_language="fr", + output_format="plaintext", + ) + + assert isinstance(response, TranslatePdfTextResponse) + assert response.translated_text + assert response.output_language == "fr" + assert response.source_languages + assert response.input_id == uploaded.id + + +@pytest.mark.asyncio +async def test_live_async_translate_pdf_text_success( + pdfrest_api_key: str, + pdfrest_live_base_url: str, +) -> None: + resource = get_test_resource_path("report.pdf") + async with AsyncPdfRestClient( + api_key=pdfrest_api_key, + base_url=pdfrest_live_base_url, + ) as client: + uploaded = (await client.files.create_from_paths([resource]))[0] + response = await client.translate_pdf_text( + uploaded, + output_language="es", + output_format="plaintext", + ) + + assert isinstance(response, TranslatePdfTextResponse) + assert response.translated_text + assert response.output_language == "es" + assert response.input_id == uploaded.id + + +def test_live_translate_pdf_text_invalid_output_format( + pdfrest_api_key: str, + pdfrest_live_base_url: str, +) -> None: + resource = get_test_resource_path("report.pdf") + with PdfRestClient( + api_key=pdfrest_api_key, + base_url=pdfrest_live_base_url, + ) as client: + uploaded = 
client.files.create_from_paths([resource])[0] + with pytest.raises( + PdfRestApiError, + match=r"invalid-format is not a valid input for 'output_format'", + ): + client.translate_pdf_text( + uploaded, + output_language="es", + extra_body={"output_format": "invalid-format"}, + ) + + +@pytest.mark.asyncio +async def test_live_async_translate_pdf_text_invalid_output_format( + pdfrest_api_key: str, + pdfrest_live_base_url: str, +) -> None: + resource = get_test_resource_path("report.pdf") + async with AsyncPdfRestClient( + api_key=pdfrest_api_key, + base_url=pdfrest_live_base_url, + ) as client: + uploaded = (await client.files.create_from_paths([resource]))[0] + with pytest.raises( + PdfRestApiError, + match=r"invalid-format is not a valid input for 'output_format'", + ): + await client.translate_pdf_text( + uploaded, + output_language="de", + extra_body={"output_format": "invalid-format"}, + ) + + +def test_live_translate_pdf_text_file_success( + pdfrest_api_key: str, + pdfrest_live_base_url: str, +) -> None: + resource = get_test_resource_path("report.pdf") + with PdfRestClient( + api_key=pdfrest_api_key, + base_url=pdfrest_live_base_url, + ) as client: + uploaded = client.files.create_from_paths([resource])[0] + response = client.translate_pdf_text_to_file( + uploaded, + output_language="fr", + output_format="plaintext", + ) + + assert isinstance(response, TranslatePdfTextFileResponse) + assert response.output_files + output_file = response.output_file + assert output_file.name.endswith(".txt") + assert output_file.type == "text/plain" + assert output_file.size > 0 + assert response.warning is None + assert response.output_language == "fr" + assert response.source_languages + assert response.input_id == uploaded.id diff --git a/tests/resources/duckhat.pdf b/tests/resources/duckhat.pdf new file mode 100644 index 00000000..8dbaff23 Binary files /dev/null and b/tests/resources/duckhat.pdf differ diff --git a/tests/test_blank_pdf.py b/tests/test_blank_pdf.py new file 
mode 100644 index 00000000..b38ee0e0 --- /dev/null +++ b/tests/test_blank_pdf.py @@ -0,0 +1,311 @@ +from __future__ import annotations + +import json + +import httpx +import pytest +from pydantic import ValidationError + +from pdfrest import AsyncPdfRestClient, PdfRestClient +from pdfrest.models import PdfRestFileBasedResponse, PdfRestFileID +from pdfrest.models._internal import PdfBlankPayload + +from .graphics_test_helpers import ASYNC_API_KEY, VALID_API_KEY, build_file_info_payload + + +def test_blank_pdf_success(monkeypatch: pytest.MonkeyPatch) -> None: + monkeypatch.delenv("PDFREST_API_KEY", raising=False) + output_id = str(PdfRestFileID.generate()) + + payload_dump = PdfBlankPayload.model_validate( + { + "page_size": "letter", + "page_count": 2, + "page_orientation": "portrait", + "output": "blank", + } + ).model_dump(mode="json", by_alias=True, exclude_none=True, exclude_unset=True) + + seen: dict[str, int] = {"post": 0, "get": 0} + + def handler(request: httpx.Request) -> httpx.Response: + if request.method == "POST" and request.url.path == "/blank-pdf": + seen["post"] += 1 + payload = json.loads(request.content.decode("utf-8")) + assert payload == payload_dump + return httpx.Response( + 200, + json={ + "outputId": [output_id], + }, + ) + if request.method == "GET" and request.url.path == f"/resource/{output_id}": + seen["get"] += 1 + assert request.url.params["format"] == "info" + return httpx.Response( + 200, + json=build_file_info_payload( + output_id, + "blank.pdf", + "application/pdf", + ), + ) + msg = f"Unexpected request {request.method} {request.url}" + raise AssertionError(msg) + + transport = httpx.MockTransport(handler) + with PdfRestClient(api_key=VALID_API_KEY, transport=transport) as client: + response = client.blank_pdf( + page_size="letter", + page_count=2, + page_orientation="portrait", + output="blank", + ) + + assert seen == {"post": 1, "get": 1} + assert isinstance(response, PdfRestFileBasedResponse) + output_file = response.output_file 
+ assert output_file.name == "blank.pdf" + assert output_file.type == "application/pdf" + assert response.warning is None + assert str(response.input_id) == output_id + + +def test_blank_pdf_request_customization( + monkeypatch: pytest.MonkeyPatch, +) -> None: + monkeypatch.delenv("PDFREST_API_KEY", raising=False) + output_id = str(PdfRestFileID.generate()) + captured_timeout: dict[str, float | dict[str, float] | None] = {} + + def handler(request: httpx.Request) -> httpx.Response: + if request.method == "POST" and request.url.path == "/blank-pdf": + assert request.url.params["trace"] == "true" + assert request.headers["X-Debug"] == "sync" + captured_timeout["value"] = request.extensions.get("timeout") + payload = json.loads(request.content.decode("utf-8")) + assert payload["page_size"] == "custom" + assert payload["custom_height"] == 792 + assert payload["custom_width"] == 612 + assert "page_orientation" not in payload + assert payload["debug"] == "yes" + assert payload["output"] == "custom" + return httpx.Response( + 200, + json={ + "outputId": [output_id], + }, + ) + if request.method == "GET" and request.url.path == f"/resource/{output_id}": + assert request.url.params["format"] == "info" + assert request.url.params["trace"] == "true" + assert request.headers["X-Debug"] == "sync" + return httpx.Response( + 200, + json=build_file_info_payload( + output_id, + "custom.pdf", + "application/pdf", + ), + ) + msg = f"Unexpected request {request.method} {request.url}" + raise AssertionError(msg) + + transport = httpx.MockTransport(handler) + with PdfRestClient(api_key=VALID_API_KEY, transport=transport) as client: + response = client.blank_pdf( + page_size="custom", + page_count=3, + custom_height=792, + custom_width=612, + output="custom", + extra_query={"trace": "true"}, + extra_headers={"X-Debug": "sync"}, + extra_body={"debug": "yes"}, + timeout=0.29, + ) + + assert isinstance(response, PdfRestFileBasedResponse) + assert response.output_file.name == "custom.pdf" + 
timeout_value = captured_timeout["value"] + assert timeout_value is not None + if isinstance(timeout_value, dict): + assert all( + component == pytest.approx(0.29) for component in timeout_value.values() + ) + else: + assert timeout_value == pytest.approx(0.29) + + +@pytest.mark.asyncio +async def test_async_blank_pdf_success(monkeypatch: pytest.MonkeyPatch) -> None: + monkeypatch.delenv("PDFREST_API_KEY", raising=False) + output_id = str(PdfRestFileID.generate()) + + payload_dump = PdfBlankPayload.model_validate( + { + "page_size": "A4", + "page_count": 1, + "page_orientation": "landscape", + } + ).model_dump(mode="json", by_alias=True, exclude_none=True, exclude_unset=True) + + seen: dict[str, int] = {"post": 0, "get": 0} + + def handler(request: httpx.Request) -> httpx.Response: + if request.method == "POST" and request.url.path == "/blank-pdf": + seen["post"] += 1 + payload = json.loads(request.content.decode("utf-8")) + assert payload == payload_dump + return httpx.Response( + 200, + json={ + "outputId": [output_id], + }, + ) + if request.method == "GET" and request.url.path == f"/resource/{output_id}": + seen["get"] += 1 + assert request.url.params["format"] == "info" + return httpx.Response( + 200, + json=build_file_info_payload( + output_id, + "async.pdf", + "application/pdf", + ), + ) + msg = f"Unexpected request {request.method} {request.url}" + raise AssertionError(msg) + + transport = httpx.MockTransport(handler) + async with AsyncPdfRestClient(api_key=ASYNC_API_KEY, transport=transport) as client: + response = await client.blank_pdf( + page_size="A4", + page_count=1, + page_orientation="landscape", + ) + + assert seen == {"post": 1, "get": 1} + assert isinstance(response, PdfRestFileBasedResponse) + assert response.output_file.name == "async.pdf" + assert response.output_file.type == "application/pdf" + assert str(response.input_id) == output_id + + +@pytest.mark.asyncio +async def test_async_blank_pdf_request_customization( + monkeypatch: 
pytest.MonkeyPatch, +) -> None: + monkeypatch.delenv("PDFREST_API_KEY", raising=False) + output_id = str(PdfRestFileID.generate()) + captured_timeout: dict[str, float | dict[str, float] | None] = {} + + def handler(request: httpx.Request) -> httpx.Response: + if request.method == "POST" and request.url.path == "/blank-pdf": + assert request.url.params["trace"] == "async" + assert request.headers["X-Debug"] == "async" + captured_timeout["value"] = request.extensions.get("timeout") + payload = json.loads(request.content.decode("utf-8")) + assert payload["page_size"] == "custom" + assert payload["custom_height"] == 100 + assert payload["custom_width"] == 50 + assert "page_orientation" not in payload + assert payload["debug"] == "yes" + return httpx.Response( + 200, + json={ + "outputId": [output_id], + }, + ) + if request.method == "GET" and request.url.path == f"/resource/{output_id}": + assert request.url.params["format"] == "info" + assert request.url.params["trace"] == "async" + assert request.headers["X-Debug"] == "async" + return httpx.Response( + 200, + json=build_file_info_payload( + output_id, + "async-custom.pdf", + "application/pdf", + ), + ) + msg = f"Unexpected request {request.method} {request.url}" + raise AssertionError(msg) + + transport = httpx.MockTransport(handler) + async with AsyncPdfRestClient(api_key=ASYNC_API_KEY, transport=transport) as client: + response = await client.blank_pdf( + page_size="custom", + page_count=1, + custom_height=100, + custom_width=50, + extra_query={"trace": "async"}, + extra_headers={"X-Debug": "async"}, + extra_body={"debug": "yes"}, + timeout=0.52, + ) + + assert isinstance(response, PdfRestFileBasedResponse) + assert response.output_file.name == "async-custom.pdf" + timeout_value = captured_timeout["value"] + assert timeout_value is not None + if isinstance(timeout_value, dict): + assert all( + component == pytest.approx(0.52) for component in timeout_value.values() + ) + else: + assert timeout_value == 
pytest.approx(0.52) + + +def test_blank_pdf_validation(monkeypatch: pytest.MonkeyPatch) -> None: + monkeypatch.delenv("PDFREST_API_KEY", raising=False) + transport = httpx.MockTransport(lambda request: (_ for _ in ()).throw(RuntimeError)) + + with ( + PdfRestClient(api_key=VALID_API_KEY, transport=transport) as client, + pytest.raises(ValueError, match="page_orientation is required"), + ): + client.blank_pdf(page_size="letter", page_count=1) + + with ( + PdfRestClient(api_key=VALID_API_KEY, transport=transport) as client, + pytest.raises(ValueError, match="custom_height and custom_width are required"), + ): + client.blank_pdf(page_size="custom", page_count=1, custom_height=50) + + with ( + PdfRestClient(api_key=VALID_API_KEY, transport=transport) as client, + pytest.raises( + ValueError, match="custom_height and custom_width can only be provided" + ), + ): + client.blank_pdf( + page_size="A3", + page_count=1, + page_orientation="portrait", + custom_width=10, + ) + + with ( + PdfRestClient(api_key=VALID_API_KEY, transport=transport) as client, + pytest.raises(ValueError, match="page_orientation must be omitted"), + ): + client.blank_pdf( + page_size="custom", + page_count=1, + page_orientation="portrait", + custom_width=10, + custom_height=10, + ) + + with ( + PdfRestClient(api_key=VALID_API_KEY, transport=transport) as client, + pytest.raises( + ValidationError, match="Input should be less than or equal to 1000" + ), + ): + client.blank_pdf( + page_size="A4", + page_count=1001, + page_orientation="portrait", + ) diff --git a/tests/test_convert_colors.py b/tests/test_convert_colors.py new file mode 100644 index 00000000..4dec0dff --- /dev/null +++ b/tests/test_convert_colors.py @@ -0,0 +1,334 @@ +from __future__ import annotations + +import json + +import httpx +import pytest +from pydantic import ValidationError + +from pdfrest import AsyncPdfRestClient, PdfRestClient +from pdfrest.models import PdfRestFile, PdfRestFileBasedResponse, PdfRestFileID +from 
pdfrest.models._internal import PdfConvertColorsPayload + +from .graphics_test_helpers import ( + ASYNC_API_KEY, + VALID_API_KEY, + build_file_info_payload, + make_pdf_file, +) + + +def _make_icc_file() -> PdfRestFile: + return PdfRestFile.model_validate( + build_file_info_payload( + PdfRestFileID.generate(), + "profile.icc", + "application/vnd.iccprofile", + ) + ) + + +def test_convert_colors_success(monkeypatch: pytest.MonkeyPatch) -> None: + monkeypatch.delenv("PDFREST_API_KEY", raising=False) + input_file = make_pdf_file(PdfRestFileID.generate(1)) + output_id = str(PdfRestFileID.generate()) + + payload_dump = PdfConvertColorsPayload.model_validate( + {"files": [input_file], "color_profile": "srgb", "output": "converted"} + ).model_dump(mode="json", by_alias=True, exclude_none=True, exclude_unset=True) + + seen: dict[str, int] = {"post": 0, "get": 0} + + def handler(request: httpx.Request) -> httpx.Response: + if ( + request.method == "POST" + and request.url.path == "/pdf-with-converted-colors" + ): + seen["post"] += 1 + payload = json.loads(request.content.decode("utf-8")) + assert payload == payload_dump + return httpx.Response( + 200, + json={ + "inputId": [input_file.id], + "outputId": [output_id], + }, + ) + if request.method == "GET" and request.url.path == f"/resource/{output_id}": + seen["get"] += 1 + assert request.url.params["format"] == "info" + return httpx.Response( + 200, + json=build_file_info_payload( + output_id, + "converted.pdf", + "application/pdf", + ), + ) + msg = f"Unexpected request {request.method} {request.url}" + raise AssertionError(msg) + + transport = httpx.MockTransport(handler) + with PdfRestClient(api_key=VALID_API_KEY, transport=transport) as client: + response = client.convert_colors( + input_file, color_profile="srgb", output="converted" + ) + + assert seen == {"post": 1, "get": 1} + assert isinstance(response, PdfRestFileBasedResponse) + output_file = response.output_file + assert output_file.name == "converted.pdf" + assert 
output_file.type == "application/pdf" + assert response.warning is None + assert str(response.input_id) == str(input_file.id) + + +def test_convert_colors_request_customization( + monkeypatch: pytest.MonkeyPatch, +) -> None: + monkeypatch.delenv("PDFREST_API_KEY", raising=False) + input_file = make_pdf_file(PdfRestFileID.generate(1)) + profile_file = _make_icc_file() + output_id = str(PdfRestFileID.generate()) + captured_timeout: dict[str, float | dict[str, float] | None] = {} + + def handler(request: httpx.Request) -> httpx.Response: + if ( + request.method == "POST" + and request.url.path == "/pdf-with-converted-colors" + ): + assert request.url.params["trace"] == "true" + assert request.headers["X-Debug"] == "sync" + captured_timeout["value"] = request.extensions.get("timeout") + payload = json.loads(request.content.decode("utf-8")) + assert payload["debug"] == "yes" + assert payload["color_profile"] == "custom" + assert payload["profile_id"] == str(profile_file.id) + assert payload["preserve_black"] is True + return httpx.Response( + 200, + json={ + "inputId": [input_file.id], + "outputId": [output_id], + }, + ) + if request.method == "GET" and request.url.path == f"/resource/{output_id}": + assert request.url.params["format"] == "info" + assert request.url.params["trace"] == "true" + assert request.headers["X-Debug"] == "sync" + return httpx.Response( + 200, + json=build_file_info_payload( + output_id, + "custom.pdf", + "application/pdf", + ), + ) + msg = f"Unexpected request {request.method} {request.url}" + raise AssertionError(msg) + + transport = httpx.MockTransport(handler) + with PdfRestClient(api_key=VALID_API_KEY, transport=transport) as client: + response = client.convert_colors( + input_file, + color_profile="custom", + profile=profile_file, + preserve_black=True, + output="custom", + extra_query={"trace": "true"}, + extra_headers={"X-Debug": "sync"}, + extra_body={"debug": "yes"}, + timeout=0.29, + ) + + assert isinstance(response, 
PdfRestFileBasedResponse) + assert response.output_file.name == "custom.pdf" + timeout_value = captured_timeout["value"] + assert timeout_value is not None + if isinstance(timeout_value, dict): + assert all( + component == pytest.approx(0.29) for component in timeout_value.values() + ) + else: + assert timeout_value == pytest.approx(0.29) + + +@pytest.mark.asyncio +async def test_async_convert_colors_success( + monkeypatch: pytest.MonkeyPatch, +) -> None: + monkeypatch.delenv("PDFREST_API_KEY", raising=False) + input_file = make_pdf_file(PdfRestFileID.generate(2)) + output_id = str(PdfRestFileID.generate()) + + payload_dump = PdfConvertColorsPayload.model_validate( + {"files": [input_file], "color_profile": "srgb"} + ).model_dump(mode="json", by_alias=True, exclude_none=True, exclude_unset=True) + + seen: dict[str, int] = {"post": 0, "get": 0} + + def handler(request: httpx.Request) -> httpx.Response: + if ( + request.method == "POST" + and request.url.path == "/pdf-with-converted-colors" + ): + seen["post"] += 1 + payload = json.loads(request.content.decode("utf-8")) + assert payload == payload_dump + return httpx.Response( + 200, + json={ + "inputId": [input_file.id], + "outputId": [output_id], + }, + ) + if request.method == "GET" and request.url.path == f"/resource/{output_id}": + seen["get"] += 1 + assert request.url.params["format"] == "info" + return httpx.Response( + 200, + json=build_file_info_payload( + output_id, + "async.pdf", + "application/pdf", + ), + ) + msg = f"Unexpected request {request.method} {request.url}" + raise AssertionError(msg) + + transport = httpx.MockTransport(handler) + async with AsyncPdfRestClient(api_key=ASYNC_API_KEY, transport=transport) as client: + response = await client.convert_colors(input_file, color_profile="srgb") + + assert seen == {"post": 1, "get": 1} + assert isinstance(response, PdfRestFileBasedResponse) + assert response.output_file.name == "async.pdf" + assert response.output_file.type == "application/pdf" + 
assert str(response.input_id) == str(input_file.id) + + +@pytest.mark.asyncio +async def test_async_convert_colors_request_customization( + monkeypatch: pytest.MonkeyPatch, +) -> None: + monkeypatch.delenv("PDFREST_API_KEY", raising=False) + input_file = make_pdf_file(PdfRestFileID.generate(2)) + profile_file = _make_icc_file() + output_id = str(PdfRestFileID.generate()) + captured_timeout: dict[str, float | dict[str, float] | None] = {} + + def handler(request: httpx.Request) -> httpx.Response: + if ( + request.method == "POST" + and request.url.path == "/pdf-with-converted-colors" + ): + assert request.url.params["trace"] == "async" + assert request.headers["X-Debug"] == "async" + captured_timeout["value"] = request.extensions.get("timeout") + payload = json.loads(request.content.decode("utf-8")) + assert payload["debug"] == "yes" + assert payload["color_profile"] == "custom" + assert payload["profile_id"] == str(profile_file.id) + return httpx.Response( + 200, + json={ + "inputId": [input_file.id], + "outputId": [output_id], + }, + ) + if request.method == "GET" and request.url.path == f"/resource/{output_id}": + assert request.url.params["format"] == "info" + assert request.url.params["trace"] == "async" + assert request.headers["X-Debug"] == "async" + return httpx.Response( + 200, + json=build_file_info_payload( + output_id, + "async-custom.pdf", + "application/pdf", + ), + ) + msg = f"Unexpected request {request.method} {request.url}" + raise AssertionError(msg) + + transport = httpx.MockTransport(handler) + async with AsyncPdfRestClient(api_key=ASYNC_API_KEY, transport=transport) as client: + response = await client.convert_colors( + input_file, + color_profile="custom", + profile=profile_file, + extra_query={"trace": "async"}, + extra_headers={"X-Debug": "async"}, + extra_body={"debug": "yes"}, + timeout=0.52, + ) + + assert isinstance(response, PdfRestFileBasedResponse) + assert response.output_file.name == "async-custom.pdf" + timeout_value = 
captured_timeout["value"] + assert timeout_value is not None + if isinstance(timeout_value, dict): + assert all( + component == pytest.approx(0.52) for component in timeout_value.values() + ) + else: + assert timeout_value == pytest.approx(0.52) + + +def test_convert_colors_validation(monkeypatch: pytest.MonkeyPatch) -> None: + monkeypatch.delenv("PDFREST_API_KEY", raising=False) + pdf_file = make_pdf_file(PdfRestFileID.generate(1)) + png_file = PdfRestFile.model_validate( + build_file_info_payload( + PdfRestFileID.generate(), + "example.png", + "image/png", + ) + ) + wrong_profile_file = PdfRestFile.model_validate( + build_file_info_payload( + PdfRestFileID.generate(), + "profile.txt", + "text/plain", + ) + ) + transport = httpx.MockTransport(lambda request: (_ for _ in ()).throw(RuntimeError)) + + with ( + PdfRestClient(api_key=VALID_API_KEY, transport=transport) as client, + pytest.raises(ValidationError, match="Must be a PDF file"), + ): + client.convert_colors(png_file, color_profile="srgb") + + with ( + PdfRestClient(api_key=VALID_API_KEY, transport=transport) as client, + pytest.raises( + ValidationError, match="List should have at most 1 item after validation" + ), + ): + client.convert_colors( + [pdf_file, make_pdf_file(PdfRestFileID.generate())], + color_profile="srgb", + ) + + with ( + PdfRestClient(api_key=VALID_API_KEY, transport=transport) as client, + pytest.raises(ValueError, match="requires a profile"), + ): + client.convert_colors(pdf_file, color_profile="custom") + + with ( + PdfRestClient(api_key=VALID_API_KEY, transport=transport) as client, + pytest.raises(ValueError, match="only be provided when color_profile"), + ): + client.convert_colors(pdf_file, color_profile="srgb", profile=_make_icc_file()) + + with ( + PdfRestClient(api_key=VALID_API_KEY, transport=transport) as client, + pytest.raises(ValidationError, match="Profile must be an ICC file"), + ): + client.convert_colors( + pdf_file, + color_profile="custom", + 
profile=wrong_profile_file, + ) diff --git a/tests/test_convert_to_excel.py b/tests/test_convert_to_excel.py new file mode 100644 index 00000000..42346aac --- /dev/null +++ b/tests/test_convert_to_excel.py @@ -0,0 +1,275 @@ +from __future__ import annotations + +import json + +import httpx +import pytest +from pydantic import ValidationError + +from pdfrest import AsyncPdfRestClient, PdfRestClient +from pdfrest.models import PdfRestFile, PdfRestFileBasedResponse, PdfRestFileID +from pdfrest.models._internal import PdfToExcelPayload + +from .graphics_test_helpers import ( + ASYNC_API_KEY, + VALID_API_KEY, + build_file_info_payload, + make_pdf_file, +) + + +def test_convert_to_excel_success(monkeypatch: pytest.MonkeyPatch) -> None: + monkeypatch.delenv("PDFREST_API_KEY", raising=False) + input_file = make_pdf_file(PdfRestFileID.generate(1)) + output_id = str(PdfRestFileID.generate()) + + payload_dump = PdfToExcelPayload.model_validate( + {"files": [input_file], "output": "report"} + ).model_dump(mode="json", by_alias=True, exclude_none=True, exclude_unset=True) + + seen: dict[str, int] = {"post": 0, "get": 0} + + def handler(request: httpx.Request) -> httpx.Response: + if request.method == "POST" and request.url.path == "/excel": + seen["post"] += 1 + payload = json.loads(request.content.decode("utf-8")) + assert payload == payload_dump + return httpx.Response( + 200, + json={ + "inputId": [input_file.id], + "outputId": [output_id], + }, + ) + if request.method == "GET" and request.url.path == f"/resource/{output_id}": + seen["get"] += 1 + assert request.url.params["format"] == "info" + return httpx.Response( + 200, + json=build_file_info_payload( + output_id, + "report.xlsx", + "application/vnd.openxmlformats-officedocument.spreadsheetml.sheet", + ), + ) + msg = f"Unexpected request {request.method} {request.url}" + raise AssertionError(msg) + + transport = httpx.MockTransport(handler) + with PdfRestClient(api_key=VALID_API_KEY, transport=transport) as client: + 
response = client.convert_to_excel(input_file, output="report") + + assert seen == {"post": 1, "get": 1} + assert isinstance(response, PdfRestFileBasedResponse) + output_file = response.output_file + assert output_file.name == "report.xlsx" + assert ( + output_file.type + == "application/vnd.openxmlformats-officedocument.spreadsheetml.sheet" + ) + assert response.warning is None + assert str(response.input_id) == str(input_file.id) + + +def test_convert_to_excel_request_customization( + monkeypatch: pytest.MonkeyPatch, +) -> None: + monkeypatch.delenv("PDFREST_API_KEY", raising=False) + input_file = make_pdf_file(PdfRestFileID.generate(1)) + output_id = str(PdfRestFileID.generate()) + captured_timeout: dict[str, float | dict[str, float] | None] = {} + + def handler(request: httpx.Request) -> httpx.Response: + if request.method == "POST" and request.url.path == "/excel": + assert request.url.params["trace"] == "true" + assert request.headers["X-Debug"] == "sync" + captured_timeout["value"] = request.extensions.get("timeout") + payload = json.loads(request.content.decode("utf-8")) + assert payload["debug"] is True + assert payload["id"] == str(input_file.id) + assert payload["output"] == "custom" + return httpx.Response( + 200, + json={ + "inputId": [input_file.id], + "outputId": [output_id], + }, + ) + if request.method == "GET" and request.url.path == f"/resource/{output_id}": + assert request.url.params["format"] == "info" + assert request.url.params["trace"] == "true" + assert request.headers["X-Debug"] == "sync" + return httpx.Response( + 200, + json=build_file_info_payload( + output_id, + "custom.xlsx", + "application/vnd.openxmlformats-officedocument.spreadsheetml.sheet", + ), + ) + msg = f"Unexpected request {request.method} {request.url}" + raise AssertionError(msg) + + transport = httpx.MockTransport(handler) + with PdfRestClient(api_key=VALID_API_KEY, transport=transport) as client: + response = client.convert_to_excel( + input_file, + output="custom", + 
extra_query={"trace": "true"}, + extra_headers={"X-Debug": "sync"}, + extra_body={"debug": True}, + timeout=0.4, + ) + + assert isinstance(response, PdfRestFileBasedResponse) + assert response.output_file.name == "custom.xlsx" + timeout_value = captured_timeout["value"] + assert timeout_value is not None + if isinstance(timeout_value, dict): + assert all( + component == pytest.approx(0.4) for component in timeout_value.values() + ) + else: + assert timeout_value == pytest.approx(0.4) + + +@pytest.mark.asyncio +async def test_async_convert_to_excel_success( + monkeypatch: pytest.MonkeyPatch, +) -> None: + monkeypatch.delenv("PDFREST_API_KEY", raising=False) + input_file = make_pdf_file(PdfRestFileID.generate(2)) + output_id = str(PdfRestFileID.generate()) + + payload_dump = PdfToExcelPayload.model_validate({"files": [input_file]}).model_dump( + mode="json", by_alias=True, exclude_none=True, exclude_unset=True + ) + + seen: dict[str, int] = {"post": 0, "get": 0} + + def handler(request: httpx.Request) -> httpx.Response: + if request.method == "POST" and request.url.path == "/excel": + seen["post"] += 1 + payload = json.loads(request.content.decode("utf-8")) + assert payload == payload_dump + return httpx.Response( + 200, + json={ + "inputId": [input_file.id], + "outputId": [output_id], + }, + ) + if request.method == "GET" and request.url.path == f"/resource/{output_id}": + seen["get"] += 1 + assert request.url.params["format"] == "info" + return httpx.Response( + 200, + json=build_file_info_payload( + output_id, + "async.xlsx", + "application/vnd.openxmlformats-officedocument.spreadsheetml.sheet", + ), + ) + msg = f"Unexpected request {request.method} {request.url}" + raise AssertionError(msg) + + transport = httpx.MockTransport(handler) + async with AsyncPdfRestClient(api_key=ASYNC_API_KEY, transport=transport) as client: + response = await client.convert_to_excel(input_file) + + assert seen == {"post": 1, "get": 1} + assert isinstance(response, 
PdfRestFileBasedResponse) + assert response.output_file.name == "async.xlsx" + assert ( + response.output_file.type + == "application/vnd.openxmlformats-officedocument.spreadsheetml.sheet" + ) + assert str(response.input_id) == str(input_file.id) + + +@pytest.mark.asyncio +async def test_async_convert_to_excel_request_customization( + monkeypatch: pytest.MonkeyPatch, +) -> None: + monkeypatch.delenv("PDFREST_API_KEY", raising=False) + input_file = make_pdf_file(PdfRestFileID.generate(2)) + output_id = str(PdfRestFileID.generate()) + captured_timeout: dict[str, float | dict[str, float] | None] = {} + + def handler(request: httpx.Request) -> httpx.Response: + if request.method == "POST" and request.url.path == "/excel": + assert request.url.params["trace"] == "async" + assert request.headers["X-Debug"] == "async" + captured_timeout["value"] = request.extensions.get("timeout") + payload = json.loads(request.content.decode("utf-8")) + assert payload["debug"] == "yes" + assert payload["id"] == str(input_file.id) + return httpx.Response( + 200, + json={ + "inputId": [input_file.id], + "outputId": [output_id], + }, + ) + if request.method == "GET" and request.url.path == f"/resource/{output_id}": + assert request.url.params["format"] == "info" + assert request.url.params["trace"] == "async" + assert request.headers["X-Debug"] == "async" + return httpx.Response( + 200, + json=build_file_info_payload( + output_id, + "async-custom.xlsx", + "application/vnd.openxmlformats-officedocument.spreadsheetml.sheet", + ), + ) + msg = f"Unexpected request {request.method} {request.url}" + raise AssertionError(msg) + + transport = httpx.MockTransport(handler) + async with AsyncPdfRestClient(api_key=ASYNC_API_KEY, transport=transport) as client: + response = await client.convert_to_excel( + input_file, + extra_query={"trace": "async"}, + extra_headers={"X-Debug": "async"}, + extra_body={"debug": "yes"}, + timeout=0.55, + ) + + assert isinstance(response, PdfRestFileBasedResponse) + 
assert response.output_file.name == "async-custom.xlsx" + timeout_value = captured_timeout["value"] + assert timeout_value is not None + if isinstance(timeout_value, dict): + assert all( + component == pytest.approx(0.55) for component in timeout_value.values() + ) + else: + assert timeout_value == pytest.approx(0.55) + + +def test_convert_to_excel_validation(monkeypatch: pytest.MonkeyPatch) -> None: + monkeypatch.delenv("PDFREST_API_KEY", raising=False) + pdf_file = make_pdf_file(PdfRestFileID.generate(1)) + png_file = PdfRestFile.model_validate( + build_file_info_payload( + PdfRestFileID.generate(), + "example.png", + "image/png", + ) + ) + transport = httpx.MockTransport(lambda request: (_ for _ in ()).throw(RuntimeError)) + + with ( + PdfRestClient(api_key=VALID_API_KEY, transport=transport) as client, + pytest.raises(ValidationError, match="Must be a PDF file"), + ): + client.convert_to_excel(png_file) + + with ( + PdfRestClient(api_key=VALID_API_KEY, transport=transport) as client, + pytest.raises( + ValidationError, match="List should have at most 1 item after validation" + ), + ): + client.convert_to_excel([pdf_file, make_pdf_file(PdfRestFileID.generate())]) diff --git a/tests/test_convert_to_markdown.py b/tests/test_convert_to_markdown.py new file mode 100644 index 00000000..22876eb8 --- /dev/null +++ b/tests/test_convert_to_markdown.py @@ -0,0 +1,247 @@ +from __future__ import annotations + +import json + +import httpx +import pytest +from pydantic import ValidationError + +from pdfrest import AsyncPdfRestClient, PdfRestClient +from pdfrest.models import ( + PdfRestFile, + PdfRestFileBasedResponse, + PdfRestFileID, +) +from pdfrest.models._internal import ConvertToMarkdownPayload + +from .graphics_test_helpers import ASYNC_API_KEY, VALID_API_KEY, make_pdf_file + + +def _make_markdown_file(file_id: str, name: str = "markdown.md") -> PdfRestFile: + return PdfRestFile.model_validate( + { + "id": file_id, + "name": name, + "url": 
f"https://api.pdfrest.com/resource/{file_id}", + "type": "text/markdown", + "size": 64, + "modified": "2024-01-01T00:00:00Z", + "scheduledDeletionTimeUtc": None, + } + ) + + +def test_convert_to_markdown_payload_rejects_non_pdf() -> None: + file_id = str(PdfRestFileID.generate()) + text_file = PdfRestFile.model_validate( + { + "id": file_id, + "name": "notes.txt", + "url": f"https://api.pdfrest.com/resource/{file_id}", + "type": "text/plain", + "size": 64, + "modified": "2024-01-01T00:00:00Z", + "scheduledDeletionTimeUtc": None, + } + ) + with pytest.raises(ValidationError, match="Must be a PDF file"): + ConvertToMarkdownPayload.model_validate({"files": [text_file]}) + + +def test_convert_to_markdown_payload_invalid_page_range() -> None: + file_repr = make_pdf_file(PdfRestFileID.generate(1)) + with pytest.raises( + ValidationError, match="The start page must be less than or equal to the end" + ): + ConvertToMarkdownPayload.model_validate( + {"files": [file_repr], "pages": ["5-2"]} + ) + + +def test_convert_to_markdown_payload_invalid_page_break_comments() -> None: + file_repr = make_pdf_file(PdfRestFileID.generate(1)) + with pytest.raises(ValidationError, match="Input should be 'on' or 'off'"): + ConvertToMarkdownPayload.model_validate( + {"files": [file_repr], "page_break_comments": "maybe"} + ) + + +def test_convert_to_markdown_success(monkeypatch: pytest.MonkeyPatch) -> None: + monkeypatch.delenv("PDFREST_API_KEY", raising=False) + input_file = make_pdf_file(PdfRestFileID.generate(1)) + output_id = str(PdfRestFileID.generate()) + payload_dump = ConvertToMarkdownPayload.model_validate( + { + "files": [input_file], + "pages": ["1-3"], + "output": "md", + "output_type": "file", + "page_break_comments": "on", + } + ).model_dump(mode="json", by_alias=True, exclude_none=True, exclude_unset=True) + + seen: dict[str, int] = {"post": 0, "get": 0} + + def handler(request: httpx.Request) -> httpx.Response: + if request.method == "POST" and request.url.path == "/markdown": 
def test_convert_to_markdown_request_customization(
    monkeypatch: pytest.MonkeyPatch,
) -> None:
    """Check that per-call overrides reach both HTTP requests of the operation.

    The sync client is called with ``extra_query``, ``extra_headers``,
    ``extra_body`` and ``timeout``. The mock transport asserts that the POST to
    ``/markdown`` and the follow-up GET to ``/resource/{output_id}`` carry the
    extra query parameter and header, that the POST body contains the model
    fields plus the extra ``debug`` key, and records the timeout seen on each
    request so it can be compared with the requested 0.4 seconds.
    """
    monkeypatch.delenv("PDFREST_API_KEY", raising=False)
    input_file = make_pdf_file(PdfRestFileID.generate(1))
    model_fields = {
        "files": [input_file],
        "output_type": "file",
        "page_break_comments": "off",
    }
    expected_fields = ConvertToMarkdownPayload.model_validate(model_fields).model_dump(
        mode="json", by_alias=True, exclude_none=True, exclude_unset=True
    )
    output_id = str(PdfRestFileID.generate())
    resource_path = f"/resource/{output_id}"
    # Timeout extension captured per request kind ("post" / "get").
    timeouts: dict[str, float | dict[str, float] | None] = {}

    def handler(request: httpx.Request) -> httpx.Response:
        method, path = request.method, request.url.path
        if method == "POST" and path == "/markdown":
            assert request.url.params["trace"] == "true"
            assert request.headers["X-Debug"] == "sync"
            timeouts["post"] = request.extensions.get("timeout")
            body = json.loads(request.content.decode("utf-8"))
            # Only the model-derived keys are compared here; extra_body adds more.
            for field in expected_fields:
                assert body[field] == expected_fields[field]
            assert body["debug"] is True
            post_reply = {
                "inputId": [str(input_file.id)],
                "outputId": [output_id],
            }
            return httpx.Response(200, json=post_reply)
        if method == "GET" and path == resource_path:
            assert request.url.params["format"] == "info"
            assert request.url.params["trace"] == "true"
            assert request.headers["X-Debug"] == "sync"
            timeouts["get"] = request.extensions.get("timeout")
            file_info = _make_markdown_file(output_id, "debug.md").model_dump(
                mode="json", by_alias=True
            )
            return httpx.Response(200, json=file_info)
        detail = f"Unexpected request {request.method} {request.url}"
        raise AssertionError(detail)

    mock_transport = httpx.MockTransport(handler)
    with PdfRestClient(api_key=VALID_API_KEY, transport=mock_transport) as client:
        response = client.convert_to_markdown(
            input_file,
            extra_query={"trace": "true"},
            extra_headers={"X-Debug": "sync"},
            extra_body={"debug": True},
            timeout=0.4,
            page_break_comments="off",
        )

    assert isinstance(response, PdfRestFileBasedResponse)
    assert len(response.output_files) == 1

    post_timeout = timeouts["post"]
    get_timeout = timeouts["get"]
    assert post_timeout is not None
    assert get_timeout is not None
    # The timeout extension may be a single number or a dict of components;
    # normalise to a list so one assertion covers both shapes.
    for observed in (post_timeout, get_timeout):
        parts = list(observed.values()) if isinstance(observed, dict) else [observed]
        assert all(part == pytest.approx(0.4) for part in parts)
@pytest.mark.parametrize(
    "output_type",
    ["PDF/A-1b", "PDF/A-2b", "PDF/A-2u", "PDF/A-3b", "PDF/A-3u"],
    ids=["pdfa-1b", "pdfa-2b", "pdfa-2u", "pdfa-3b", "pdfa-3u"],
)
def test_convert_to_pdfa_success(
    monkeypatch: pytest.MonkeyPatch, output_type: PdfAType
) -> None:
    """Check the sync ``convert_to_pdfa`` happy path for each PDF/A output type.

    The mock transport expects exactly one POST to ``/pdfa`` whose JSON body
    equals the serialised ``PdfToPdfaPayload``, followed by exactly one GET for
    the output file's info. The response must then expose that file, the
    input id, and no warning.
    """
    monkeypatch.delenv("PDFREST_API_KEY", raising=False)
    input_file = make_pdf_file(PdfRestFileID.generate(1))
    output_id = str(PdfRestFileID.generate())
    expected_body = PdfToPdfaPayload.model_validate(
        {"files": [input_file], "output_type": output_type, "output": "archive"}
    ).model_dump(mode="json", by_alias=True, exclude_none=True, exclude_unset=True)
    resource_path = f"/resource/{output_id}"

    # Number of requests of each kind the handler has served.
    hits: dict[str, int] = {"post": 0, "get": 0}

    def handler(request: httpx.Request) -> httpx.Response:
        method, path = request.method, request.url.path
        if method == "POST" and path == "/pdfa":
            hits["post"] += 1
            body = json.loads(request.content.decode("utf-8"))
            assert body == expected_body
            post_reply = {
                "inputId": [input_file.id],
                "outputId": [output_id],
            }
            return httpx.Response(200, json=post_reply)
        if method == "GET" and path == resource_path:
            hits["get"] += 1
            assert request.url.params["format"] == "info"
            file_info = build_file_info_payload(
                output_id, "archive.pdf", "application/pdf"
            )
            return httpx.Response(200, json=file_info)
        detail = f"Unexpected request {request.method} {request.url}"
        raise AssertionError(detail)

    mock_transport = httpx.MockTransport(handler)
    with PdfRestClient(api_key=VALID_API_KEY, transport=mock_transport) as client:
        response = client.convert_to_pdfa(
            input_file,
            output_type=output_type,
            output="archive",
        )

    assert hits == {"post": 1, "get": 1}
    assert isinstance(response, PdfRestFileBasedResponse)
    produced = response.output_file
    assert produced.name == "archive.pdf"
    assert produced.type == "application/pdf"
    assert str(response.input_id) == str(input_file.id)
    assert response.warning is None
def test_convert_to_pdfa_request_customization(
    monkeypatch: pytest.MonkeyPatch,
) -> None:
    """Check that per-call overrides are applied to a sync PDF/A conversion.

    The mock transport asserts the extra query parameter and header on both
    the POST to ``/pdfa`` and the follow-up GET, checks selected keys of the
    POST body (including the ``extra_body`` key), and records the timeout seen
    on the POST so it can be compared with the requested 0.33 seconds.
    """
    monkeypatch.delenv("PDFREST_API_KEY", raising=False)
    input_file = make_pdf_file(PdfRestFileID.generate(1))
    output_id = str(PdfRestFileID.generate())
    resource_path = f"/resource/{output_id}"
    # Only the POST timeout is recorded, under the key "value".
    timeouts: dict[str, float | dict[str, float] | None] = {}

    def handler(request: httpx.Request) -> httpx.Response:
        method, path = request.method, request.url.path
        if method == "POST" and path == "/pdfa":
            assert request.url.params["trace"] == "true"
            assert request.headers["X-Debug"] == "sync"
            timeouts["value"] = request.extensions.get("timeout")
            body = json.loads(request.content.decode("utf-8"))
            # Checked key by key, in this order, rather than as a whole-body match.
            wanted = {
                "output_type": "PDF/A-3b",
                "rasterize_if_errors_encountered": "on",
                "debug": "yes",
                "id": str(input_file.id),
                "output": "custom",
            }
            for field, value in wanted.items():
                assert body[field] == value
            post_reply = {"inputId": [input_file.id], "outputId": [output_id]}
            return httpx.Response(200, json=post_reply)
        if method == "GET" and path == resource_path:
            assert request.url.params["format"] == "info"
            assert request.url.params["trace"] == "true"
            assert request.headers["X-Debug"] == "sync"
            file_info = build_file_info_payload(
                output_id, "custom.pdf", "application/pdf"
            )
            return httpx.Response(200, json=file_info)
        detail = f"Unexpected request {request.method} {request.url}"
        raise AssertionError(detail)

    mock_transport = httpx.MockTransport(handler)
    with PdfRestClient(api_key=VALID_API_KEY, transport=mock_transport) as client:
        response = client.convert_to_pdfa(
            input_file,
            output_type="PDF/A-3b",
            output="custom",
            rasterize_if_errors_encountered="on",
            extra_query={"trace": "true"},
            extra_headers={"X-Debug": "sync"},
            extra_body={"debug": "yes"},
            timeout=0.33,
        )

    assert isinstance(response, PdfRestFileBasedResponse)
    assert response.output_file.name == "custom.pdf"

    observed = timeouts["value"]
    assert observed is not None
    # The timeout extension may be a single number or a dict of components.
    parts = list(observed.values()) if isinstance(observed, dict) else [observed]
    assert all(part == pytest.approx(0.33) for part in parts)
def test_convert_to_pdfa_validation(monkeypatch: pytest.MonkeyPatch) -> None:
    """Check that invalid ``convert_to_pdfa`` arguments raise ``ValidationError``.

    Four cases are exercised: a ``None`` output type, a non-PDF input file, an
    unsupported output type string, and more than one input file. The
    transport raises ``RuntimeError`` if any request is actually sent.
    """
    monkeypatch.delenv("PDFREST_API_KEY", raising=False)
    pdf_file = make_pdf_file(PdfRestFileID.generate(1))
    png_info = build_file_info_payload(
        PdfRestFileID.generate(),
        "example.png",
        "image/png",
    )
    png_file = PdfRestFile.model_validate(png_info)

    def refuse(request: httpx.Request) -> httpx.Response:
        # Any request that gets this far surfaces as a RuntimeError.
        raise RuntimeError

    transport = httpx.MockTransport(refuse)

    allowed_types_message = (
        "Input should be 'PDF/A-1b', 'PDF/A-2b', 'PDF/A-2u', "
        "'PDF/A-3b' or 'PDF/A-3u'"
    )
    with PdfRestClient(api_key=VALID_API_KEY, transport=transport) as client:
        with pytest.raises(ValidationError, match=allowed_types_message):
            client.convert_to_pdfa(pdf_file, output_type=None)  # type: ignore[arg-type]

    with PdfRestClient(api_key=VALID_API_KEY, transport=transport) as client:
        with pytest.raises(ValidationError, match="Must be a PDF file"):
            client.convert_to_pdfa(png_file, output_type="PDF/A-2b")

    with PdfRestClient(api_key=VALID_API_KEY, transport=transport) as client:
        with pytest.raises(ValidationError, match="PDF/A-1b"):
            client.convert_to_pdfa(pdf_file, output_type="PDF/A-4")  # type: ignore[arg-type]

    with PdfRestClient(api_key=VALID_API_KEY, transport=transport) as client:
        with pytest.raises(
            ValidationError, match="List should have at most 1 item after validation"
        ):
            client.convert_to_pdfa(
                [pdf_file, make_pdf_file(PdfRestFileID.generate())],
                output_type="PDF/A-2b",
            )
def test_convert_to_powerpoint_success(monkeypatch: pytest.MonkeyPatch) -> None:
    """Check the sync ``convert_to_powerpoint`` happy path.

    The mock transport expects exactly one POST to ``/powerpoint`` whose JSON
    body equals the serialised ``PdfToPowerpointPayload``, followed by exactly
    one GET for the output file's info. The response must expose that PPTX
    file, the input id, and no warning.
    """
    monkeypatch.delenv("PDFREST_API_KEY", raising=False)
    input_file = make_pdf_file(PdfRestFileID.generate(1))
    output_id = str(PdfRestFileID.generate())
    resource_path = f"/resource/{output_id}"
    pptx_mime = (
        "application/vnd.openxmlformats-officedocument.presentationml.presentation"
    )

    expected_body = PdfToPowerpointPayload.model_validate(
        {"files": [input_file], "output": "slides"}
    ).model_dump(mode="json", by_alias=True, exclude_none=True, exclude_unset=True)

    # Number of requests of each kind the handler has served.
    hits: dict[str, int] = {"post": 0, "get": 0}

    def handler(request: httpx.Request) -> httpx.Response:
        method, path = request.method, request.url.path
        if method == "POST" and path == "/powerpoint":
            hits["post"] += 1
            body = json.loads(request.content.decode("utf-8"))
            assert body == expected_body
            post_reply = {
                "inputId": [input_file.id],
                "outputId": [output_id],
            }
            return httpx.Response(200, json=post_reply)
        if method == "GET" and path == resource_path:
            hits["get"] += 1
            assert request.url.params["format"] == "info"
            file_info = build_file_info_payload(output_id, "slides.pptx", pptx_mime)
            return httpx.Response(200, json=file_info)
        detail = f"Unexpected request {request.method} {request.url}"
        raise AssertionError(detail)

    mock_transport = httpx.MockTransport(handler)
    with PdfRestClient(api_key=VALID_API_KEY, transport=mock_transport) as client:
        response = client.convert_to_powerpoint(input_file, output="slides")

    assert hits == {"post": 1, "get": 1}
    assert isinstance(response, PdfRestFileBasedResponse)
    produced = response.output_file
    assert produced.name == "slides.pptx"
    assert produced.type == pptx_mime
    assert response.warning is None
    assert str(response.input_id) == str(input_file.id)
@pytest.mark.asyncio
async def test_async_convert_to_powerpoint_success(
    monkeypatch: pytest.MonkeyPatch,
) -> None:
    """Check the async ``convert_to_powerpoint`` happy path with default options.

    The mock transport expects exactly one POST to ``/powerpoint`` whose JSON
    body equals the serialised ``PdfToPowerpointPayload`` built from the input
    file alone, followed by exactly one GET for the output file's info.
    """
    monkeypatch.delenv("PDFREST_API_KEY", raising=False)
    input_file = make_pdf_file(PdfRestFileID.generate(2))
    output_id = str(PdfRestFileID.generate())
    resource_path = f"/resource/{output_id}"
    pptx_mime = (
        "application/vnd.openxmlformats-officedocument.presentationml.presentation"
    )

    expected_body = PdfToPowerpointPayload.model_validate(
        {"files": [input_file]}
    ).model_dump(mode="json", by_alias=True, exclude_none=True, exclude_unset=True)

    # Number of requests of each kind the handler has served.
    hits: dict[str, int] = {"post": 0, "get": 0}

    def handler(request: httpx.Request) -> httpx.Response:
        method, path = request.method, request.url.path
        if method == "POST" and path == "/powerpoint":
            hits["post"] += 1
            body = json.loads(request.content.decode("utf-8"))
            assert body == expected_body
            post_reply = {
                "inputId": [input_file.id],
                "outputId": [output_id],
            }
            return httpx.Response(200, json=post_reply)
        if method == "GET" and path == resource_path:
            hits["get"] += 1
            assert request.url.params["format"] == "info"
            file_info = build_file_info_payload(output_id, "async.pptx", pptx_mime)
            return httpx.Response(200, json=file_info)
        detail = f"Unexpected request {request.method} {request.url}"
        raise AssertionError(detail)

    mock_transport = httpx.MockTransport(handler)
    async with AsyncPdfRestClient(
        api_key=ASYNC_API_KEY, transport=mock_transport
    ) as client:
        response = await client.convert_to_powerpoint(input_file)

    assert hits == {"post": 1, "get": 1}
    assert isinstance(response, PdfRestFileBasedResponse)
    assert response.output_file.name == "async.pptx"
    assert response.output_file.type == pptx_mime
    assert str(response.input_id) == str(input_file.id)
def test_convert_to_powerpoint_validation(monkeypatch: pytest.MonkeyPatch) -> None:
    """Check that invalid ``convert_to_powerpoint`` inputs raise ``ValidationError``.

    Two cases are exercised: a non-PDF input file and more than one input
    file. The transport raises ``RuntimeError`` if any request is actually
    sent.
    """
    monkeypatch.delenv("PDFREST_API_KEY", raising=False)
    pdf_file = make_pdf_file(PdfRestFileID.generate(1))
    png_info = build_file_info_payload(
        PdfRestFileID.generate(),
        "example.png",
        "image/png",
    )
    png_file = PdfRestFile.model_validate(png_info)

    def refuse(request: httpx.Request) -> httpx.Response:
        # Any request that gets this far surfaces as a RuntimeError.
        raise RuntimeError

    transport = httpx.MockTransport(refuse)

    with PdfRestClient(api_key=VALID_API_KEY, transport=transport) as client:
        with pytest.raises(ValidationError, match="Must be a PDF file"):
            client.convert_to_powerpoint(png_file)

    with PdfRestClient(api_key=VALID_API_KEY, transport=transport) as client:
        with pytest.raises(
            ValidationError, match="List should have at most 1 item after validation"
        ):
            client.convert_to_powerpoint(
                [pdf_file, make_pdf_file(PdfRestFileID.generate())]
            )
"output": "acro"} + ).model_dump(mode="json", by_alias=True, exclude_none=True, exclude_unset=True) + + seen: dict[str, int] = {"post": 0, "get": 0} + + def handler(request: httpx.Request) -> httpx.Response: + if request.method == "POST" and request.url.path == "/pdf-with-acroforms": + seen["post"] += 1 + payload = json.loads(request.content.decode("utf-8")) + assert payload == payload_dump + return httpx.Response( + 200, + json={ + "inputId": [input_file.id], + "outputId": [output_id], + }, + ) + if request.method == "GET" and request.url.path == f"/resource/{output_id}": + seen["get"] += 1 + assert request.url.params["format"] == "info" + return httpx.Response( + 200, + json=build_file_info_payload( + output_id, + "acro.pdf", + "application/pdf", + ), + ) + msg = f"Unexpected request {request.method} {request.url}" + raise AssertionError(msg) + + transport = httpx.MockTransport(handler) + with PdfRestClient(api_key=VALID_API_KEY, transport=transport) as client: + response = client.convert_xfa_to_acroforms(input_file, output="acro") + + assert seen == {"post": 1, "get": 1} + assert isinstance(response, PdfRestFileBasedResponse) + output_file = response.output_file + assert output_file.name == "acro.pdf" + assert output_file.type == "application/pdf" + assert response.warning is None + assert str(response.input_id) == str(input_file.id) + + +def test_convert_xfa_to_acroforms_request_customization( + monkeypatch: pytest.MonkeyPatch, +) -> None: + monkeypatch.delenv("PDFREST_API_KEY", raising=False) + input_file = make_pdf_file(PdfRestFileID.generate(1)) + output_id = str(PdfRestFileID.generate()) + captured_timeout: dict[str, float | dict[str, float] | None] = {} + + def handler(request: httpx.Request) -> httpx.Response: + if request.method == "POST" and request.url.path == "/pdf-with-acroforms": + assert request.url.params["trace"] == "true" + assert request.headers["X-Debug"] == "sync" + captured_timeout["value"] = request.extensions.get("timeout") + payload = 
@pytest.mark.asyncio
async def test_async_convert_xfa_to_acroforms_success(
    monkeypatch: pytest.MonkeyPatch,
) -> None:
    """Check the async ``convert_xfa_to_acroforms`` happy path with default options.

    The mock transport expects exactly one POST to ``/pdf-with-acroforms``
    whose JSON body equals the serialised ``PdfXfaToAcroformsPayload`` built
    from the input file alone, followed by exactly one GET for the output
    file's info.
    """
    monkeypatch.delenv("PDFREST_API_KEY", raising=False)
    input_file = make_pdf_file(PdfRestFileID.generate(2))
    output_id = str(PdfRestFileID.generate())
    resource_path = f"/resource/{output_id}"

    expected_body = PdfXfaToAcroformsPayload.model_validate(
        {"files": [input_file]}
    ).model_dump(mode="json", by_alias=True, exclude_none=True, exclude_unset=True)

    # Number of requests of each kind the handler has served.
    hits: dict[str, int] = {"post": 0, "get": 0}

    def handler(request: httpx.Request) -> httpx.Response:
        method, path = request.method, request.url.path
        if method == "POST" and path == "/pdf-with-acroforms":
            hits["post"] += 1
            body = json.loads(request.content.decode("utf-8"))
            assert body == expected_body
            post_reply = {
                "inputId": [input_file.id],
                "outputId": [output_id],
            }
            return httpx.Response(200, json=post_reply)
        if method == "GET" and path == resource_path:
            hits["get"] += 1
            assert request.url.params["format"] == "info"
            file_info = build_file_info_payload(
                output_id,
                "async.pdf",
                "application/pdf",
            )
            return httpx.Response(200, json=file_info)
        detail = f"Unexpected request {request.method} {request.url}"
        raise AssertionError(detail)

    mock_transport = httpx.MockTransport(handler)
    async with AsyncPdfRestClient(
        api_key=ASYNC_API_KEY, transport=mock_transport
    ) as client:
        response = await client.convert_xfa_to_acroforms(input_file)

    assert hits == {"post": 1, "get": 1}
    assert isinstance(response, PdfRestFileBasedResponse)
    produced = response.output_file
    assert produced.name == "async.pdf"
    assert produced.type == "application/pdf"
    assert str(response.input_id) == str(input_file.id)
"GET" and request.url.path == f"/resource/{output_id}": + assert request.url.params["format"] == "info" + assert request.url.params["trace"] == "async" + assert request.headers["X-Debug"] == "async" + return httpx.Response( + 200, + json=build_file_info_payload( + output_id, + "async-custom.pdf", + "application/pdf", + ), + ) + msg = f"Unexpected request {request.method} {request.url}" + raise AssertionError(msg) + + transport = httpx.MockTransport(handler) + async with AsyncPdfRestClient(api_key=ASYNC_API_KEY, transport=transport) as client: + response = await client.convert_xfa_to_acroforms( + input_file, + extra_query={"trace": "async"}, + extra_headers={"X-Debug": "async"}, + extra_body={"debug": "yes"}, + timeout=0.52, + ) + + assert isinstance(response, PdfRestFileBasedResponse) + assert response.output_file.name == "async-custom.pdf" + timeout_value = captured_timeout["value"] + assert timeout_value is not None + if isinstance(timeout_value, dict): + assert all( + component == pytest.approx(0.52) for component in timeout_value.values() + ) + else: + assert timeout_value == pytest.approx(0.52) + + +def test_convert_xfa_to_acroforms_validation(monkeypatch: pytest.MonkeyPatch) -> None: + monkeypatch.delenv("PDFREST_API_KEY", raising=False) + pdf_file = make_pdf_file(PdfRestFileID.generate(1)) + png_file = PdfRestFile.model_validate( + build_file_info_payload( + PdfRestFileID.generate(), + "example.png", + "image/png", + ) + ) + transport = httpx.MockTransport(lambda request: (_ for _ in ()).throw(RuntimeError)) + + with ( + PdfRestClient(api_key=VALID_API_KEY, transport=transport) as client, + pytest.raises(ValidationError, match="Must be a PDF file"), + ): + client.convert_xfa_to_acroforms(png_file) + + with ( + PdfRestClient(api_key=VALID_API_KEY, transport=transport) as client, + pytest.raises( + ValidationError, match="List should have at most 1 item after validation" + ), + ): + client.convert_xfa_to_acroforms( + [pdf_file, 
def _make_png_file(file_id: str, name: str) -> PdfRestFile:
    """Build a ``PdfRestFile`` describing a 10-byte PNG with the given id and name."""
    file_info = {
        "id": file_id,
        "name": name,
        "url": f"https://api.pdfrest.com/resource/{file_id}",
        "type": "image/png",
        "size": 10,
        "modified": "2024-01-01T00:00:00Z",
        "scheduledDeletionTimeUtc": None,
    }
    return PdfRestFile.model_validate(file_info)


def test_extract_images_payload_rejects_non_pdf() -> None:
    """Check that ``ExtractImagesPayload`` rejects a ``text/plain`` input file."""
    file_id = str(PdfRestFileID.generate())
    text_info = {
        "id": file_id,
        "name": "notes.txt",
        "url": f"https://api.pdfrest.com/resource/{file_id}",
        "type": "text/plain",
        "size": 64,
        "modified": "2024-01-01T00:00:00Z",
        "scheduledDeletionTimeUtc": None,
    }
    text_file = PdfRestFile.model_validate(text_info)
    candidate = {"files": [text_file]}
    with pytest.raises(ValidationError, match="Must be a PDF file"):
        ExtractImagesPayload.model_validate(candidate)


def test_extract_images_payload_invalid_page_range() -> None:
    """Check that a descending page range (``"5-2"``) fails payload validation."""
    file_repr = make_pdf_file(PdfRestFileID.generate(1))
    candidate = {"files": [file_repr], "pages": ["5-2"]}
    expected_error = "The start page must be less than or equal to the end"
    with pytest.raises(ValidationError, match=expected_error):
        ExtractImagesPayload.model_validate(candidate)
= str(PdfRestFileID.generate()) + + payload_dump = ExtractImagesPayload.model_validate( + {"files": [input_file], "pages": ["1-3"], "output": "images"} + ).model_dump(mode="json", by_alias=True, exclude_none=True, exclude_unset=True) + + seen: dict[str, int] = {"post": 0, "get": 0} + + def handler(request: httpx.Request) -> httpx.Response: + if request.method == "POST" and request.url.path == "/extracted-images": + seen["post"] += 1 + payload = json.loads(request.content.decode("utf-8")) + assert payload == payload_dump + return httpx.Response( + 200, + json={ + "inputId": [str(input_file.id)], + "outputId": [output_id_1, output_id_2], + }, + ) + if request.method == "GET" and request.url.path in { + f"/resource/{output_id_1}", + f"/resource/{output_id_2}", + }: + seen["get"] += 1 + return httpx.Response( + 200, + json=_make_png_file( + output_id_1 + if request.url.path.endswith(output_id_1) + else output_id_2, + "image.png", + ).model_dump(mode="json", by_alias=True), + ) + msg = f"Unexpected request {request.method} {request.url}" + raise AssertionError(msg) + + transport = httpx.MockTransport(handler) + with PdfRestClient(api_key=VALID_API_KEY, transport=transport) as client: + response = client.extract_images(input_file, pages=["1-3"], output="images") + + assert seen == {"post": 1, "get": 2} + assert isinstance(response, PdfRestFileBasedResponse) + assert len(response.output_files) == 2 + assert response.input_id == input_file.id + + +def test_extract_images_request_customization( + monkeypatch: pytest.MonkeyPatch, +) -> None: + monkeypatch.delenv("PDFREST_API_KEY", raising=False) + input_file = make_pdf_file(PdfRestFileID.generate(1)) + output_id = str(PdfRestFileID.generate()) + payload_dump = ExtractImagesPayload.model_validate( + {"files": [input_file], "pages": ["1-last"]} + ).model_dump(mode="json", by_alias=True, exclude_none=True, exclude_unset=True) + captured_timeout: dict[str, float | dict[str, float] | None] = {} + + def handler(request: 
httpx.Request) -> httpx.Response: + if request.method == "POST" and request.url.path == "/extracted-images": + assert request.url.params["trace"] == "true" + assert request.headers["X-Debug"] == "sync" + captured_timeout["value"] = request.extensions.get("timeout") + payload = json.loads(request.content.decode("utf-8")) + assert payload == payload_dump | {"debug": True} + return httpx.Response( + 200, + json={ + "inputId": str(input_file.id), + "outputId": [output_id], + }, + ) + if request.method == "GET" and request.url.path == f"/resource/{output_id}": + assert request.url.params["format"] == "info" + assert request.url.params["trace"] == "true" + assert request.headers["X-Debug"] == "sync" + return httpx.Response( + 200, + json=_make_png_file(output_id, "debug.png").model_dump( + mode="json", by_alias=True + ), + ) + msg = f"Unexpected request {request.method} {request.url}" + raise AssertionError(msg) + + transport = httpx.MockTransport(handler) + with PdfRestClient(api_key=VALID_API_KEY, transport=transport) as client: + response = client.extract_images( + input_file, + pages=["1-last"], + extra_query={"trace": "true"}, + extra_headers={"X-Debug": "sync"}, + extra_body={"debug": True}, + timeout=0.3, + ) + + assert isinstance(response, PdfRestFileBasedResponse) + assert len(response.output_files) == 1 + timeout_value = captured_timeout["value"] + assert timeout_value is not None + if isinstance(timeout_value, dict): + assert all( + component == pytest.approx(0.3) for component in timeout_value.values() + ) + else: + assert timeout_value == pytest.approx(0.3) + + +@pytest.mark.asyncio +async def test_async_extract_images_success( + monkeypatch: pytest.MonkeyPatch, +) -> None: + monkeypatch.delenv("PDFREST_API_KEY", raising=False) + input_file = make_pdf_file(PdfRestFileID.generate(2)) + output_id = str(PdfRestFileID.generate()) + + payload_dump = ExtractImagesPayload.model_validate( + {"files": [input_file]} + ).model_dump(mode="json", by_alias=True, 
exclude_none=True, exclude_unset=True) + + seen: dict[str, int] = {"post": 0, "get": 0} + + def handler(request: httpx.Request) -> httpx.Response: + if request.method == "POST" and request.url.path == "/extracted-images": + seen["post"] += 1 + payload = json.loads(request.content.decode("utf-8")) + assert payload == payload_dump + return httpx.Response( + 200, + json={ + "inputId": [str(input_file.id)], + "outputId": [output_id], + }, + ) + if request.method == "GET" and request.url.path == f"/resource/{output_id}": + seen["get"] += 1 + return httpx.Response( + 200, + json=_make_png_file(output_id, "async.png").model_dump( + mode="json", by_alias=True + ), + ) + msg = f"Unexpected request {request.method} {request.url}" + raise AssertionError(msg) + + transport = httpx.MockTransport(handler) + async with AsyncPdfRestClient(api_key=ASYNC_API_KEY, transport=transport) as client: + response = await client.extract_images(input_file) + + assert seen == {"post": 1, "get": 1} + assert isinstance(response, PdfRestFileBasedResponse) + assert len(response.output_files) == 1 + assert response.input_id == input_file.id diff --git a/tests/test_extract_pdf_text_to_file.py b/tests/test_extract_pdf_text_to_file.py new file mode 100644 index 00000000..a2ad457c --- /dev/null +++ b/tests/test_extract_pdf_text_to_file.py @@ -0,0 +1,240 @@ +from __future__ import annotations + +import json + +import httpx +import pytest +from pydantic import ValidationError + +from pdfrest import AsyncPdfRestClient, PdfRestClient +from pdfrest.models import PdfRestFile, PdfRestFileBasedResponse, PdfRestFileID +from pdfrest.models._internal import ExtractTextPayload + +from .graphics_test_helpers import ASYNC_API_KEY, VALID_API_KEY, make_pdf_file + + +def _make_text_file(file_id: str, name: str = "extracted.txt") -> PdfRestFile: + return PdfRestFile.model_validate( + { + "id": file_id, + "name": name, + "url": f"https://api.pdfrest.com/resource/{file_id}", + "type": "text/plain", + "size": 64, + 
"modified": "2024-01-01T00:00:00Z", + "scheduledDeletionTimeUtc": None, + } + ) + + +def test_extract_pdf_text_payload_rejects_non_pdf() -> None: + file_id = str(PdfRestFileID.generate()) + text_file = PdfRestFile.model_validate( + { + "id": file_id, + "name": "notes.txt", + "url": f"https://api.pdfrest.com/resource/{file_id}", + "type": "text/plain", + "size": 64, + "modified": "2024-01-01T00:00:00Z", + "scheduledDeletionTimeUtc": None, + } + ) + with pytest.raises(ValidationError, match="Must be a PDF file"): + ExtractTextPayload.model_validate({"files": [text_file]}) + + +def test_extract_pdf_text_payload_invalid_page_range() -> None: + file_repr = make_pdf_file(PdfRestFileID.generate(1)) + with pytest.raises( + ValidationError, match="The start page must be less than or equal to the end" + ): + ExtractTextPayload.model_validate({"files": [file_repr], "pages": ["5-2"]}) + + +def test_extract_pdf_text_to_file_success(monkeypatch: pytest.MonkeyPatch) -> None: + monkeypatch.delenv("PDFREST_API_KEY", raising=False) + input_file = make_pdf_file(PdfRestFileID.generate(1)) + output_id = str(PdfRestFileID.generate()) + payload_dump = ExtractTextPayload.model_validate( + { + "files": [input_file], + "pages": ["1-3"], + "output": "text", + "full_text": "document", + "preserve_line_breaks": "off", + "word_style": "off", + "word_coordinates": "off", + "output_type": "file", + } + ).model_dump(mode="json", by_alias=True, exclude_none=True, exclude_unset=True) + + seen: dict[str, int] = {"post": 0, "get": 0} + + def handler(request: httpx.Request) -> httpx.Response: + if request.method == "POST" and request.url.path == "/extracted-text": + seen["post"] += 1 + payload = json.loads(request.content.decode("utf-8")) + assert payload == payload_dump + return httpx.Response( + 200, + json={ + "inputId": [str(input_file.id)], + "outputId": [output_id], + }, + ) + if request.method == "GET" and request.url.path == f"/resource/{output_id}": + seen["get"] += 1 + assert 
request.url.params["format"] == "info" + return httpx.Response( + 200, + json=_make_text_file(output_id).model_dump(mode="json", by_alias=True), + ) + msg = f"Unexpected request {request.method} {request.url}" + raise AssertionError(msg) + + transport = httpx.MockTransport(handler) + with PdfRestClient(api_key=VALID_API_KEY, transport=transport) as client: + response = client.extract_pdf_text_to_file( + input_file, + pages=["1-3"], + output="text", + ) + + assert seen == {"post": 1, "get": 1} + assert isinstance(response, PdfRestFileBasedResponse) + assert response.input_id == input_file.id + assert len(response.output_files) == 1 + assert response.output_file.id == output_id + + +def test_extract_pdf_text_request_customization( + monkeypatch: pytest.MonkeyPatch, +) -> None: + monkeypatch.delenv("PDFREST_API_KEY", raising=False) + input_file = make_pdf_file(PdfRestFileID.generate(1)) + output_id = str(PdfRestFileID.generate()) + payload_dump = ExtractTextPayload.model_validate( + { + "files": [input_file], + "output": "file-output", + "full_text": "document", + "preserve_line_breaks": "off", + "word_style": "off", + "word_coordinates": "off", + "output_type": "file", + } + ).model_dump(mode="json", by_alias=True, exclude_none=True, exclude_unset=True) + captured_timeout: dict[str, float | dict[str, float] | None] = {} + + def handler(request: httpx.Request) -> httpx.Response: + if request.method == "POST" and request.url.path == "/extracted-text": + assert request.url.params["trace"] == "true" + assert request.headers["X-Debug"] == "sync" + captured_timeout["post"] = request.extensions.get("timeout") + payload = json.loads(request.content.decode("utf-8")) + assert payload == payload_dump | {"debug": True} + return httpx.Response( + 200, + json={ + "inputId": [str(input_file.id)], + "outputId": [output_id], + }, + ) + if request.method == "GET" and request.url.path == f"/resource/{output_id}": + assert request.url.params["format"] == "info" + assert 
request.url.params["trace"] == "true" + assert request.headers["X-Debug"] == "sync" + captured_timeout["get"] = request.extensions.get("timeout") + return httpx.Response( + 200, + json=_make_text_file(output_id, "debug.txt").model_dump( + mode="json", by_alias=True + ), + ) + msg = f"Unexpected request {request.method} {request.url}" + raise AssertionError(msg) + + transport = httpx.MockTransport(handler) + with PdfRestClient(api_key=VALID_API_KEY, transport=transport) as client: + response = client.extract_pdf_text_to_file( + input_file, + output="file-output", + extra_query={"trace": "true"}, + extra_headers={"X-Debug": "sync"}, + extra_body={"debug": True}, + timeout=0.35, + ) + + assert isinstance(response, PdfRestFileBasedResponse) + assert len(response.output_files) == 1 + post_timeout = captured_timeout["post"] + get_timeout = captured_timeout["get"] + assert post_timeout is not None + assert get_timeout is not None + if isinstance(post_timeout, dict): + assert all( + component == pytest.approx(0.35) for component in post_timeout.values() + ) + else: + assert post_timeout == pytest.approx(0.35) + if isinstance(get_timeout, dict): + assert all( + component == pytest.approx(0.35) for component in get_timeout.values() + ) + else: + assert get_timeout == pytest.approx(0.35) + + +@pytest.mark.asyncio +async def test_async_extract_pdf_text_to_file_success( + monkeypatch: pytest.MonkeyPatch, +) -> None: + monkeypatch.delenv("PDFREST_API_KEY", raising=False) + input_file = make_pdf_file(PdfRestFileID.generate(2)) + output_id = str(PdfRestFileID.generate()) + payload_dump = ExtractTextPayload.model_validate( + { + "files": [input_file], + "full_text": "document", + "preserve_line_breaks": "off", + "word_style": "off", + "word_coordinates": "off", + "output_type": "file", + } + ).model_dump(mode="json", by_alias=True, exclude_none=True, exclude_unset=True) + + seen: dict[str, int] = {"post": 0, "get": 0} + + def handler(request: httpx.Request) -> httpx.Response: + if 
request.method == "POST" and request.url.path == "/extracted-text": + seen["post"] += 1 + payload = json.loads(request.content.decode("utf-8")) + assert payload == payload_dump + return httpx.Response( + 200, + json={ + "inputId": [str(input_file.id)], + "outputId": [output_id], + }, + ) + if request.method == "GET" and request.url.path == f"/resource/{output_id}": + seen["get"] += 1 + assert request.url.params["format"] == "info" + return httpx.Response( + 200, + json=_make_text_file(output_id, "async.txt").model_dump( + mode="json", by_alias=True + ), + ) + msg = f"Unexpected request {request.method} {request.url}" + raise AssertionError(msg) + + transport = httpx.MockTransport(handler) + async with AsyncPdfRestClient(api_key=ASYNC_API_KEY, transport=transport) as client: + response = await client.extract_pdf_text_to_file(input_file) + + assert seen == {"post": 1, "get": 1} + assert isinstance(response, PdfRestFileBasedResponse) + assert len(response.output_files) == 1 + assert response.input_id == input_file.id diff --git a/tests/test_flatten_annotations.py b/tests/test_flatten_annotations.py new file mode 100644 index 00000000..d5407a3d --- /dev/null +++ b/tests/test_flatten_annotations.py @@ -0,0 +1,281 @@ +from __future__ import annotations + +import json + +import httpx +import pytest +from pydantic import ValidationError + +from pdfrest import AsyncPdfRestClient, PdfRestClient +from pdfrest.models import PdfRestFile, PdfRestFileBasedResponse, PdfRestFileID +from pdfrest.models._internal import PdfFlattenAnnotationsPayload + +from .graphics_test_helpers import ( + ASYNC_API_KEY, + VALID_API_KEY, + build_file_info_payload, + make_pdf_file, +) + + +def test_flatten_annotations_success(monkeypatch: pytest.MonkeyPatch) -> None: + monkeypatch.delenv("PDFREST_API_KEY", raising=False) + input_file = make_pdf_file(PdfRestFileID.generate(1)) + output_id = str(PdfRestFileID.generate()) + + payload_dump = PdfFlattenAnnotationsPayload.model_validate( + {"files": 
[input_file], "output": "flattened"} + ).model_dump(mode="json", by_alias=True, exclude_none=True, exclude_unset=True) + + seen: dict[str, int] = {"post": 0, "get": 0} + + def handler(request: httpx.Request) -> httpx.Response: + if ( + request.method == "POST" + and request.url.path == "/flattened-annotations-pdf" + ): + seen["post"] += 1 + payload = json.loads(request.content.decode("utf-8")) + assert payload == payload_dump + return httpx.Response( + 200, + json={ + "inputId": [input_file.id], + "outputId": [output_id], + }, + ) + if request.method == "GET" and request.url.path == f"/resource/{output_id}": + seen["get"] += 1 + assert request.url.params["format"] == "info" + return httpx.Response( + 200, + json=build_file_info_payload( + output_id, + "flattened.pdf", + "application/pdf", + ), + ) + msg = f"Unexpected request {request.method} {request.url}" + raise AssertionError(msg) + + transport = httpx.MockTransport(handler) + with PdfRestClient(api_key=VALID_API_KEY, transport=transport) as client: + response = client.flatten_annotations(input_file, output="flattened") + + assert seen == {"post": 1, "get": 1} + assert isinstance(response, PdfRestFileBasedResponse) + output_file = response.output_file + assert output_file.name == "flattened.pdf" + assert output_file.type == "application/pdf" + assert response.warning is None + assert str(response.input_id) == str(input_file.id) + + +def test_flatten_annotations_request_customization( + monkeypatch: pytest.MonkeyPatch, +) -> None: + monkeypatch.delenv("PDFREST_API_KEY", raising=False) + input_file = make_pdf_file(PdfRestFileID.generate(1)) + output_id = str(PdfRestFileID.generate()) + captured_timeout: dict[str, float | dict[str, float] | None] = {} + + def handler(request: httpx.Request) -> httpx.Response: + if ( + request.method == "POST" + and request.url.path == "/flattened-annotations-pdf" + ): + assert request.url.params["trace"] == "true" + assert request.headers["X-Debug"] == "sync" + 
captured_timeout["value"] = request.extensions.get("timeout") + payload = json.loads(request.content.decode("utf-8")) + assert payload["debug"] == "yes" + assert payload["id"] == str(input_file.id) + assert payload["output"] == "custom" + return httpx.Response( + 200, + json={ + "inputId": [input_file.id], + "outputId": [output_id], + }, + ) + if request.method == "GET" and request.url.path == f"/resource/{output_id}": + assert request.url.params["format"] == "info" + assert request.url.params["trace"] == "true" + assert request.headers["X-Debug"] == "sync" + return httpx.Response( + 200, + json=build_file_info_payload( + output_id, + "custom.pdf", + "application/pdf", + ), + ) + msg = f"Unexpected request {request.method} {request.url}" + raise AssertionError(msg) + + transport = httpx.MockTransport(handler) + with PdfRestClient(api_key=VALID_API_KEY, transport=transport) as client: + response = client.flatten_annotations( + input_file, + output="custom", + extra_query={"trace": "true"}, + extra_headers={"X-Debug": "sync"}, + extra_body={"debug": "yes"}, + timeout=0.29, + ) + + assert isinstance(response, PdfRestFileBasedResponse) + assert response.output_file.name == "custom.pdf" + timeout_value = captured_timeout["value"] + assert timeout_value is not None + if isinstance(timeout_value, dict): + assert all( + component == pytest.approx(0.29) for component in timeout_value.values() + ) + else: + assert timeout_value == pytest.approx(0.29) + + +@pytest.mark.asyncio +async def test_async_flatten_annotations_success( + monkeypatch: pytest.MonkeyPatch, +) -> None: + monkeypatch.delenv("PDFREST_API_KEY", raising=False) + input_file = make_pdf_file(PdfRestFileID.generate(2)) + output_id = str(PdfRestFileID.generate()) + + payload_dump = PdfFlattenAnnotationsPayload.model_validate( + {"files": [input_file]} + ).model_dump(mode="json", by_alias=True, exclude_none=True, exclude_unset=True) + + seen: dict[str, int] = {"post": 0, "get": 0} + + def handler(request: 
httpx.Request) -> httpx.Response: + if ( + request.method == "POST" + and request.url.path == "/flattened-annotations-pdf" + ): + seen["post"] += 1 + payload = json.loads(request.content.decode("utf-8")) + assert payload == payload_dump + return httpx.Response( + 200, + json={ + "inputId": [input_file.id], + "outputId": [output_id], + }, + ) + if request.method == "GET" and request.url.path == f"/resource/{output_id}": + seen["get"] += 1 + assert request.url.params["format"] == "info" + return httpx.Response( + 200, + json=build_file_info_payload( + output_id, + "async.pdf", + "application/pdf", + ), + ) + msg = f"Unexpected request {request.method} {request.url}" + raise AssertionError(msg) + + transport = httpx.MockTransport(handler) + async with AsyncPdfRestClient(api_key=ASYNC_API_KEY, transport=transport) as client: + response = await client.flatten_annotations(input_file) + + assert seen == {"post": 1, "get": 1} + assert isinstance(response, PdfRestFileBasedResponse) + assert response.output_file.name == "async.pdf" + assert response.output_file.type == "application/pdf" + assert str(response.input_id) == str(input_file.id) + + +@pytest.mark.asyncio +async def test_async_flatten_annotations_request_customization( + monkeypatch: pytest.MonkeyPatch, +) -> None: + monkeypatch.delenv("PDFREST_API_KEY", raising=False) + input_file = make_pdf_file(PdfRestFileID.generate(2)) + output_id = str(PdfRestFileID.generate()) + captured_timeout: dict[str, float | dict[str, float] | None] = {} + + def handler(request: httpx.Request) -> httpx.Response: + if ( + request.method == "POST" + and request.url.path == "/flattened-annotations-pdf" + ): + assert request.url.params["trace"] == "async" + assert request.headers["X-Debug"] == "async" + captured_timeout["value"] = request.extensions.get("timeout") + payload = json.loads(request.content.decode("utf-8")) + assert payload["debug"] == "yes" + assert payload["id"] == str(input_file.id) + return httpx.Response( + 200, + json={ + 
"inputId": [input_file.id], + "outputId": [output_id], + }, + ) + if request.method == "GET" and request.url.path == f"/resource/{output_id}": + assert request.url.params["format"] == "info" + assert request.url.params["trace"] == "async" + assert request.headers["X-Debug"] == "async" + return httpx.Response( + 200, + json=build_file_info_payload( + output_id, + "async-custom.pdf", + "application/pdf", + ), + ) + msg = f"Unexpected request {request.method} {request.url}" + raise AssertionError(msg) + + transport = httpx.MockTransport(handler) + async with AsyncPdfRestClient(api_key=ASYNC_API_KEY, transport=transport) as client: + response = await client.flatten_annotations( + input_file, + extra_query={"trace": "async"}, + extra_headers={"X-Debug": "async"}, + extra_body={"debug": "yes"}, + timeout=0.52, + ) + + assert isinstance(response, PdfRestFileBasedResponse) + assert response.output_file.name == "async-custom.pdf" + timeout_value = captured_timeout["value"] + assert timeout_value is not None + if isinstance(timeout_value, dict): + assert all( + component == pytest.approx(0.52) for component in timeout_value.values() + ) + else: + assert timeout_value == pytest.approx(0.52) + + +def test_flatten_annotations_validation(monkeypatch: pytest.MonkeyPatch) -> None: + monkeypatch.delenv("PDFREST_API_KEY", raising=False) + pdf_file = make_pdf_file(PdfRestFileID.generate(1)) + png_file = PdfRestFile.model_validate( + build_file_info_payload( + PdfRestFileID.generate(), + "example.png", + "image/png", + ) + ) + transport = httpx.MockTransport(lambda request: (_ for _ in ()).throw(RuntimeError)) + + with ( + PdfRestClient(api_key=VALID_API_KEY, transport=transport) as client, + pytest.raises(ValidationError, match="Must be a PDF file"), + ): + client.flatten_annotations(png_file) + + with ( + PdfRestClient(api_key=VALID_API_KEY, transport=transport) as client, + pytest.raises( + ValidationError, match="List should have at most 1 item after validation" + ), + ): + 
client.flatten_annotations([pdf_file, make_pdf_file(PdfRestFileID.generate())]) diff --git a/tests/test_flatten_layers.py b/tests/test_flatten_layers.py new file mode 100644 index 00000000..963a4482 --- /dev/null +++ b/tests/test_flatten_layers.py @@ -0,0 +1,269 @@ +from __future__ import annotations + +import json + +import httpx +import pytest +from pydantic import ValidationError + +from pdfrest import AsyncPdfRestClient, PdfRestClient +from pdfrest.models import PdfRestFile, PdfRestFileBasedResponse, PdfRestFileID +from pdfrest.models._internal import PdfFlattenLayersPayload + +from .graphics_test_helpers import ( + ASYNC_API_KEY, + VALID_API_KEY, + build_file_info_payload, + make_pdf_file, +) + + +def test_flatten_layers_success(monkeypatch: pytest.MonkeyPatch) -> None: + monkeypatch.delenv("PDFREST_API_KEY", raising=False) + input_file = make_pdf_file(PdfRestFileID.generate(1)) + output_id = str(PdfRestFileID.generate()) + + payload_dump = PdfFlattenLayersPayload.model_validate( + {"files": [input_file], "output": "flattened-layers"} + ).model_dump(mode="json", by_alias=True, exclude_none=True, exclude_unset=True) + + seen: dict[str, int] = {"post": 0, "get": 0} + + def handler(request: httpx.Request) -> httpx.Response: + if request.method == "POST" and request.url.path == "/flattened-layers-pdf": + seen["post"] += 1 + payload = json.loads(request.content.decode("utf-8")) + assert payload == payload_dump + return httpx.Response( + 200, + json={ + "inputId": [input_file.id], + "outputId": [output_id], + }, + ) + if request.method == "GET" and request.url.path == f"/resource/{output_id}": + seen["get"] += 1 + assert request.url.params["format"] == "info" + return httpx.Response( + 200, + json=build_file_info_payload( + output_id, + "flattened-layers.pdf", + "application/pdf", + ), + ) + msg = f"Unexpected request {request.method} {request.url}" + raise AssertionError(msg) + + transport = httpx.MockTransport(handler) + with PdfRestClient(api_key=VALID_API_KEY, 
transport=transport) as client: + response = client.flatten_layers(input_file, output="flattened-layers") + + assert seen == {"post": 1, "get": 1} + assert isinstance(response, PdfRestFileBasedResponse) + output_file = response.output_file + assert output_file.name == "flattened-layers.pdf" + assert output_file.type == "application/pdf" + assert response.warning is None + assert str(response.input_id) == str(input_file.id) + + +def test_flatten_layers_request_customization( + monkeypatch: pytest.MonkeyPatch, +) -> None: + monkeypatch.delenv("PDFREST_API_KEY", raising=False) + input_file = make_pdf_file(PdfRestFileID.generate(1)) + output_id = str(PdfRestFileID.generate()) + captured_timeout: dict[str, float | dict[str, float] | None] = {} + + def handler(request: httpx.Request) -> httpx.Response: + if request.method == "POST" and request.url.path == "/flattened-layers-pdf": + assert request.url.params["trace"] == "true" + assert request.headers["X-Debug"] == "sync" + captured_timeout["value"] = request.extensions.get("timeout") + payload = json.loads(request.content.decode("utf-8")) + assert payload["debug"] == "yes" + assert payload["id"] == str(input_file.id) + assert payload["output"] == "custom" + return httpx.Response( + 200, + json={ + "inputId": [input_file.id], + "outputId": [output_id], + }, + ) + if request.method == "GET" and request.url.path == f"/resource/{output_id}": + assert request.url.params["format"] == "info" + assert request.url.params["trace"] == "true" + assert request.headers["X-Debug"] == "sync" + return httpx.Response( + 200, + json=build_file_info_payload( + output_id, + "custom.pdf", + "application/pdf", + ), + ) + msg = f"Unexpected request {request.method} {request.url}" + raise AssertionError(msg) + + transport = httpx.MockTransport(handler) + with PdfRestClient(api_key=VALID_API_KEY, transport=transport) as client: + response = client.flatten_layers( + input_file, + output="custom", + extra_query={"trace": "true"}, + 
extra_headers={"X-Debug": "sync"}, + extra_body={"debug": "yes"}, + timeout=0.29, + ) + + assert isinstance(response, PdfRestFileBasedResponse) + assert response.output_file.name == "custom.pdf" + timeout_value = captured_timeout["value"] + assert timeout_value is not None + if isinstance(timeout_value, dict): + assert all( + component == pytest.approx(0.29) for component in timeout_value.values() + ) + else: + assert timeout_value == pytest.approx(0.29) + + +@pytest.mark.asyncio +async def test_async_flatten_layers_success( + monkeypatch: pytest.MonkeyPatch, +) -> None: + monkeypatch.delenv("PDFREST_API_KEY", raising=False) + input_file = make_pdf_file(PdfRestFileID.generate(2)) + output_id = str(PdfRestFileID.generate()) + + payload_dump = PdfFlattenLayersPayload.model_validate( + {"files": [input_file]} + ).model_dump(mode="json", by_alias=True, exclude_none=True, exclude_unset=True) + + seen: dict[str, int] = {"post": 0, "get": 0} + + def handler(request: httpx.Request) -> httpx.Response: + if request.method == "POST" and request.url.path == "/flattened-layers-pdf": + seen["post"] += 1 + payload = json.loads(request.content.decode("utf-8")) + assert payload == payload_dump + return httpx.Response( + 200, + json={ + "inputId": [input_file.id], + "outputId": [output_id], + }, + ) + if request.method == "GET" and request.url.path == f"/resource/{output_id}": + seen["get"] += 1 + assert request.url.params["format"] == "info" + return httpx.Response( + 200, + json=build_file_info_payload( + output_id, + "async.pdf", + "application/pdf", + ), + ) + msg = f"Unexpected request {request.method} {request.url}" + raise AssertionError(msg) + + transport = httpx.MockTransport(handler) + async with AsyncPdfRestClient(api_key=ASYNC_API_KEY, transport=transport) as client: + response = await client.flatten_layers(input_file) + + assert seen == {"post": 1, "get": 1} + assert isinstance(response, PdfRestFileBasedResponse) + assert response.output_file.name == "async.pdf" + 
assert response.output_file.type == "application/pdf" + assert str(response.input_id) == str(input_file.id) + + +@pytest.mark.asyncio +async def test_async_flatten_layers_request_customization( + monkeypatch: pytest.MonkeyPatch, +) -> None: + monkeypatch.delenv("PDFREST_API_KEY", raising=False) + input_file = make_pdf_file(PdfRestFileID.generate(2)) + output_id = str(PdfRestFileID.generate()) + captured_timeout: dict[str, float | dict[str, float] | None] = {} + + def handler(request: httpx.Request) -> httpx.Response: + if request.method == "POST" and request.url.path == "/flattened-layers-pdf": + assert request.url.params["trace"] == "async" + assert request.headers["X-Debug"] == "async" + captured_timeout["value"] = request.extensions.get("timeout") + payload = json.loads(request.content.decode("utf-8")) + assert payload["debug"] == "yes" + assert payload["id"] == str(input_file.id) + return httpx.Response( + 200, + json={ + "inputId": [input_file.id], + "outputId": [output_id], + }, + ) + if request.method == "GET" and request.url.path == f"/resource/{output_id}": + assert request.url.params["format"] == "info" + assert request.url.params["trace"] == "async" + assert request.headers["X-Debug"] == "async" + return httpx.Response( + 200, + json=build_file_info_payload( + output_id, + "async-custom.pdf", + "application/pdf", + ), + ) + msg = f"Unexpected request {request.method} {request.url}" + raise AssertionError(msg) + + transport = httpx.MockTransport(handler) + async with AsyncPdfRestClient(api_key=ASYNC_API_KEY, transport=transport) as client: + response = await client.flatten_layers( + input_file, + extra_query={"trace": "async"}, + extra_headers={"X-Debug": "async"}, + extra_body={"debug": "yes"}, + timeout=0.52, + ) + + assert isinstance(response, PdfRestFileBasedResponse) + assert response.output_file.name == "async-custom.pdf" + timeout_value = captured_timeout["value"] + assert timeout_value is not None + if isinstance(timeout_value, dict): + assert 
all( + component == pytest.approx(0.52) for component in timeout_value.values() + ) + else: + assert timeout_value == pytest.approx(0.52) + + +def test_flatten_layers_validation(monkeypatch: pytest.MonkeyPatch) -> None: + monkeypatch.delenv("PDFREST_API_KEY", raising=False) + pdf_file = make_pdf_file(PdfRestFileID.generate(1)) + png_file = PdfRestFile.model_validate( + build_file_info_payload( + PdfRestFileID.generate(), + "example.png", + "image/png", + ) + ) + transport = httpx.MockTransport(lambda request: (_ for _ in ()).throw(RuntimeError)) + + with ( + PdfRestClient(api_key=VALID_API_KEY, transport=transport) as client, + pytest.raises(ValidationError, match="Must be a PDF file"), + ): + client.flatten_layers(png_file) + + with ( + PdfRestClient(api_key=VALID_API_KEY, transport=transport) as client, + pytest.raises( + ValidationError, match="List should have at most 1 item after validation" + ), + ): + client.flatten_layers([pdf_file, make_pdf_file(PdfRestFileID.generate())]) diff --git a/tests/test_flatten_transparencies.py b/tests/test_flatten_transparencies.py new file mode 100644 index 00000000..0035fd70 --- /dev/null +++ b/tests/test_flatten_transparencies.py @@ -0,0 +1,297 @@ +from __future__ import annotations + +import json + +import httpx +import pytest +from pydantic import ValidationError + +from pdfrest import AsyncPdfRestClient, PdfRestClient +from pdfrest.models import PdfRestFile, PdfRestFileBasedResponse, PdfRestFileID +from pdfrest.models._internal import PdfFlattenTransparenciesPayload + +from .graphics_test_helpers import ( + ASYNC_API_KEY, + VALID_API_KEY, + build_file_info_payload, + make_pdf_file, +) + + +def test_flatten_transparencies_success(monkeypatch: pytest.MonkeyPatch) -> None: + monkeypatch.delenv("PDFREST_API_KEY", raising=False) + input_file = make_pdf_file(PdfRestFileID.generate(1)) + output_id = str(PdfRestFileID.generate()) + + payload_dump = PdfFlattenTransparenciesPayload.model_validate( + {"files": [input_file], 
"output": "flattened", "quality": "high"} + ).model_dump(mode="json", by_alias=True, exclude_none=True, exclude_unset=True) + + seen: dict[str, int] = {"post": 0, "get": 0} + + def handler(request: httpx.Request) -> httpx.Response: + if ( + request.method == "POST" + and request.url.path == "/flattened-transparencies-pdf" + ): + seen["post"] += 1 + payload = json.loads(request.content.decode("utf-8")) + assert payload == payload_dump + return httpx.Response( + 200, + json={ + "inputId": [input_file.id], + "outputId": [output_id], + }, + ) + if request.method == "GET" and request.url.path == f"/resource/{output_id}": + seen["get"] += 1 + assert request.url.params["format"] == "info" + return httpx.Response( + 200, + json=build_file_info_payload( + output_id, + "flattened.pdf", + "application/pdf", + ), + ) + msg = f"Unexpected request {request.method} {request.url}" + raise AssertionError(msg) + + transport = httpx.MockTransport(handler) + with PdfRestClient(api_key=VALID_API_KEY, transport=transport) as client: + response = client.flatten_transparencies( + input_file, output="flattened", quality="high" + ) + + assert seen == {"post": 1, "get": 1} + assert isinstance(response, PdfRestFileBasedResponse) + output_file = response.output_file + assert output_file.name == "flattened.pdf" + assert output_file.type == "application/pdf" + assert response.warning is None + assert str(response.input_id) == str(input_file.id) + + +def test_flatten_transparencies_request_customization( + monkeypatch: pytest.MonkeyPatch, +) -> None: + monkeypatch.delenv("PDFREST_API_KEY", raising=False) + input_file = make_pdf_file(PdfRestFileID.generate(1)) + output_id = str(PdfRestFileID.generate()) + captured_timeout: dict[str, float | dict[str, float] | None] = {} + + def handler(request: httpx.Request) -> httpx.Response: + if ( + request.method == "POST" + and request.url.path == "/flattened-transparencies-pdf" + ): + assert request.url.params["trace"] == "true" + assert 
request.headers["X-Debug"] == "sync" + captured_timeout["value"] = request.extensions.get("timeout") + payload = json.loads(request.content.decode("utf-8")) + assert payload["debug"] == "yes" + assert payload["id"] == str(input_file.id) + assert payload["output"] == "custom" + assert payload["quality"] == "low" + return httpx.Response( + 200, + json={ + "inputId": [input_file.id], + "outputId": [output_id], + }, + ) + if request.method == "GET" and request.url.path == f"/resource/{output_id}": + assert request.url.params["format"] == "info" + assert request.url.params["trace"] == "true" + assert request.headers["X-Debug"] == "sync" + return httpx.Response( + 200, + json=build_file_info_payload( + output_id, + "custom.pdf", + "application/pdf", + ), + ) + msg = f"Unexpected request {request.method} {request.url}" + raise AssertionError(msg) + + transport = httpx.MockTransport(handler) + with PdfRestClient(api_key=VALID_API_KEY, transport=transport) as client: + response = client.flatten_transparencies( + input_file, + output="custom", + quality="low", + extra_query={"trace": "true"}, + extra_headers={"X-Debug": "sync"}, + extra_body={"debug": "yes"}, + timeout=0.29, + ) + + assert isinstance(response, PdfRestFileBasedResponse) + assert response.output_file.name == "custom.pdf" + timeout_value = captured_timeout["value"] + assert timeout_value is not None + if isinstance(timeout_value, dict): + assert all( + component == pytest.approx(0.29) for component in timeout_value.values() + ) + else: + assert timeout_value == pytest.approx(0.29) + + +@pytest.mark.asyncio +async def test_async_flatten_transparencies_success( + monkeypatch: pytest.MonkeyPatch, +) -> None: + monkeypatch.delenv("PDFREST_API_KEY", raising=False) + input_file = make_pdf_file(PdfRestFileID.generate(2)) + output_id = str(PdfRestFileID.generate()) + + payload_dump = PdfFlattenTransparenciesPayload.model_validate( + {"files": [input_file], "quality": "medium"} + ).model_dump(mode="json", by_alias=True, 
exclude_none=True, exclude_unset=True) + + seen: dict[str, int] = {"post": 0, "get": 0} + + def handler(request: httpx.Request) -> httpx.Response: + if ( + request.method == "POST" + and request.url.path == "/flattened-transparencies-pdf" + ): + seen["post"] += 1 + payload = json.loads(request.content.decode("utf-8")) + assert payload == payload_dump + return httpx.Response( + 200, + json={ + "inputId": [input_file.id], + "outputId": [output_id], + }, + ) + if request.method == "GET" and request.url.path == f"/resource/{output_id}": + seen["get"] += 1 + assert request.url.params["format"] == "info" + return httpx.Response( + 200, + json=build_file_info_payload( + output_id, + "async.pdf", + "application/pdf", + ), + ) + msg = f"Unexpected request {request.method} {request.url}" + raise AssertionError(msg) + + transport = httpx.MockTransport(handler) + async with AsyncPdfRestClient(api_key=ASYNC_API_KEY, transport=transport) as client: + response = await client.flatten_transparencies(input_file) + + assert seen == {"post": 1, "get": 1} + assert isinstance(response, PdfRestFileBasedResponse) + assert response.output_file.name == "async.pdf" + assert response.output_file.type == "application/pdf" + assert str(response.input_id) == str(input_file.id) + + +@pytest.mark.asyncio +async def test_async_flatten_transparencies_request_customization( + monkeypatch: pytest.MonkeyPatch, +) -> None: + monkeypatch.delenv("PDFREST_API_KEY", raising=False) + input_file = make_pdf_file(PdfRestFileID.generate(2)) + output_id = str(PdfRestFileID.generate()) + captured_timeout: dict[str, float | dict[str, float] | None] = {} + + def handler(request: httpx.Request) -> httpx.Response: + if ( + request.method == "POST" + and request.url.path == "/flattened-transparencies-pdf" + ): + assert request.url.params["trace"] == "async" + assert request.headers["X-Debug"] == "async" + captured_timeout["value"] = request.extensions.get("timeout") + payload = 
json.loads(request.content.decode("utf-8")) + assert payload["debug"] == "yes" + assert payload["id"] == str(input_file.id) + assert payload["quality"] == "high" + return httpx.Response( + 200, + json={ + "inputId": [input_file.id], + "outputId": [output_id], + }, + ) + if request.method == "GET" and request.url.path == f"/resource/{output_id}": + assert request.url.params["format"] == "info" + assert request.url.params["trace"] == "async" + assert request.headers["X-Debug"] == "async" + return httpx.Response( + 200, + json=build_file_info_payload( + output_id, + "async-custom.pdf", + "application/pdf", + ), + ) + msg = f"Unexpected request {request.method} {request.url}" + raise AssertionError(msg) + + transport = httpx.MockTransport(handler) + async with AsyncPdfRestClient(api_key=ASYNC_API_KEY, transport=transport) as client: + response = await client.flatten_transparencies( + input_file, + quality="high", + extra_query={"trace": "async"}, + extra_headers={"X-Debug": "async"}, + extra_body={"debug": "yes"}, + timeout=0.52, + ) + + assert isinstance(response, PdfRestFileBasedResponse) + assert response.output_file.name == "async-custom.pdf" + timeout_value = captured_timeout["value"] + assert timeout_value is not None + if isinstance(timeout_value, dict): + assert all( + component == pytest.approx(0.52) for component in timeout_value.values() + ) + else: + assert timeout_value == pytest.approx(0.52) + + +def test_flatten_transparencies_validation(monkeypatch: pytest.MonkeyPatch) -> None: + monkeypatch.delenv("PDFREST_API_KEY", raising=False) + pdf_file = make_pdf_file(PdfRestFileID.generate(1)) + png_file = PdfRestFile.model_validate( + build_file_info_payload( + PdfRestFileID.generate(), + "example.png", + "image/png", + ) + ) + transport = httpx.MockTransport(lambda request: (_ for _ in ()).throw(RuntimeError)) + + with ( + PdfRestClient(api_key=VALID_API_KEY, transport=transport) as client, + pytest.raises(ValidationError, match="Must be a PDF file"), + ): + 
client.flatten_transparencies(png_file) + + with ( + PdfRestClient(api_key=VALID_API_KEY, transport=transport) as client, + pytest.raises( + ValidationError, match="List should have at most 1 item after validation" + ), + ): + client.flatten_transparencies( + [pdf_file, make_pdf_file(PdfRestFileID.generate())] + ) + + with ( + PdfRestClient(api_key=VALID_API_KEY, transport=transport) as client, + pytest.raises( + ValidationError, match="Input should be 'low', 'medium' or 'high'" + ), + ): + client.flatten_transparencies(pdf_file, quality="ultra") # type: ignore[arg-type] diff --git a/tests/test_linearize_pdf.py b/tests/test_linearize_pdf.py new file mode 100644 index 00000000..6b212437 --- /dev/null +++ b/tests/test_linearize_pdf.py @@ -0,0 +1,282 @@ +from __future__ import annotations + +import json + +import httpx +import pytest +from pydantic import ValidationError + +from pdfrest import AsyncPdfRestClient, PdfRestClient +from pdfrest.models import PdfRestFile, PdfRestFileBasedResponse, PdfRestFileID +from pdfrest.models._internal import PdfLinearizePayload + +from .graphics_test_helpers import ( + ASYNC_API_KEY, + VALID_API_KEY, + build_file_info_payload, + make_pdf_file, +) + + +def test_linearize_pdf_success(monkeypatch: pytest.MonkeyPatch) -> None: + monkeypatch.delenv("PDFREST_API_KEY", raising=False) + input_file = make_pdf_file(PdfRestFileID.generate(1)) + output_id = str(PdfRestFileID.generate()) + + payload_dump = PdfLinearizePayload.model_validate( + {"files": [input_file]} + ).model_dump(mode="json", by_alias=True, exclude_none=True, exclude_unset=True) + + seen: dict[str, int] = {"post": 0, "get": 0} + + def handler(request: httpx.Request) -> httpx.Response: + if request.method == "POST" and request.url.path == "/linearized-pdf": + seen["post"] += 1 + payload = json.loads(request.content.decode("utf-8")) + assert payload == payload_dump + return httpx.Response( + 200, + json={ + "inputId": [input_file.id], + "outputId": [output_id], + }, + ) + if 
request.method == "GET" and request.url.path == f"/resource/{output_id}": + seen["get"] += 1 + assert request.url.params["format"] == "info" + return httpx.Response( + 200, + json=build_file_info_payload( + output_id, + "linearized.pdf", + "application/pdf", + ), + ) + msg = f"Unexpected request {request.method} {request.url}" + raise AssertionError(msg) + + transport = httpx.MockTransport(handler) + with PdfRestClient(api_key=VALID_API_KEY, transport=transport) as client: + response = client.linearize_pdf(input_file) + + assert seen == {"post": 1, "get": 1} + assert isinstance(response, PdfRestFileBasedResponse) + assert response.output_file.name == "linearized.pdf" + assert response.output_file.type == "application/pdf" + assert str(response.input_id) == str(input_file.id) + assert response.warning is None + + +def test_linearize_pdf_request_customization( + monkeypatch: pytest.MonkeyPatch, +) -> None: + monkeypatch.delenv("PDFREST_API_KEY", raising=False) + input_file = make_pdf_file(PdfRestFileID.generate(1)) + output_id = str(PdfRestFileID.generate()) + captured_timeout: dict[str, float | dict[str, float] | None] = {} + + def handler(request: httpx.Request) -> httpx.Response: + if request.method == "POST" and request.url.path == "/linearized-pdf": + assert request.url.params["trace"] == "true" + assert request.headers["X-Debug"] == "sync" + captured_timeout["value"] = request.extensions.get("timeout") + payload = json.loads(request.content.decode("utf-8")) + assert payload["debug"] == "yes" + assert payload["id"] == str(input_file.id) + assert payload["output"] == "linearized" + return httpx.Response( + 200, + json={ + "inputId": [input_file.id], + "outputId": [output_id], + }, + ) + if request.method == "GET" and request.url.path == f"/resource/{output_id}": + assert request.url.params["format"] == "info" + assert request.url.params["trace"] == "true" + assert request.headers["X-Debug"] == "sync" + return httpx.Response( + 200, + json=build_file_info_payload( 
+ output_id, + "linearized-out.pdf", + "application/pdf", + ), + ) + msg = f"Unexpected request {request.method} {request.url}" + raise AssertionError(msg) + + transport = httpx.MockTransport(handler) + with PdfRestClient(api_key=VALID_API_KEY, transport=transport) as client: + response = client.linearize_pdf( + input_file, + output="linearized", + extra_query={"trace": "true"}, + extra_headers={"X-Debug": "sync"}, + extra_body={"debug": "yes"}, + timeout=0.61, + ) + + assert isinstance(response, PdfRestFileBasedResponse) + assert response.output_file.name == "linearized-out.pdf" + timeout_value = captured_timeout["value"] + assert timeout_value is not None + if isinstance(timeout_value, dict): + assert all( + component == pytest.approx(0.61) for component in timeout_value.values() + ) + else: + assert timeout_value == pytest.approx(0.61) + + +@pytest.mark.asyncio +async def test_async_linearize_pdf_success(monkeypatch: pytest.MonkeyPatch) -> None: + monkeypatch.delenv("PDFREST_API_KEY", raising=False) + input_file = make_pdf_file(PdfRestFileID.generate(2)) + output_id = str(PdfRestFileID.generate()) + + payload_dump = PdfLinearizePayload.model_validate( + {"files": [input_file]} + ).model_dump(mode="json", by_alias=True, exclude_none=True, exclude_unset=True) + + seen: dict[str, int] = {"post": 0, "get": 0} + + def handler(request: httpx.Request) -> httpx.Response: + if request.method == "POST" and request.url.path == "/linearized-pdf": + seen["post"] += 1 + payload = json.loads(request.content.decode("utf-8")) + assert payload == payload_dump + return httpx.Response( + 200, + json={ + "inputId": [input_file.id], + "outputId": [output_id], + }, + ) + if request.method == "GET" and request.url.path == f"/resource/{output_id}": + seen["get"] += 1 + assert request.url.params["format"] == "info" + return httpx.Response( + 200, + json=build_file_info_payload( + output_id, + "async-linearized.pdf", + "application/pdf", + ), + ) + msg = f"Unexpected request 
{request.method} {request.url}" + raise AssertionError(msg) + + transport = httpx.MockTransport(handler) + async with AsyncPdfRestClient(api_key=ASYNC_API_KEY, transport=transport) as client: + response = await client.linearize_pdf(input_file) + + assert seen == {"post": 1, "get": 1} + assert isinstance(response, PdfRestFileBasedResponse) + assert response.output_file.name == "async-linearized.pdf" + assert response.output_file.type == "application/pdf" + assert str(response.input_id) == str(input_file.id) + + +@pytest.mark.asyncio +async def test_async_linearize_pdf_request_customization( + monkeypatch: pytest.MonkeyPatch, +) -> None: + monkeypatch.delenv("PDFREST_API_KEY", raising=False) + input_file = make_pdf_file(PdfRestFileID.generate(2)) + output_id = str(PdfRestFileID.generate()) + captured_timeout: dict[str, float | dict[str, float] | None] = {} + + def handler(request: httpx.Request) -> httpx.Response: + if request.method == "POST" and request.url.path == "/linearized-pdf": + assert request.url.params["trace"] == "async" + assert request.headers["X-Debug"] == "async" + captured_timeout["value"] = request.extensions.get("timeout") + payload = json.loads(request.content.decode("utf-8")) + assert payload["flags"] == ["a", "b"] + assert payload["id"] == str(input_file.id) + return httpx.Response( + 200, + json={ + "inputId": [input_file.id], + "outputId": [output_id], + }, + ) + if request.method == "GET" and request.url.path == f"/resource/{output_id}": + assert request.url.params["format"] == "info" + assert request.url.params["trace"] == "async" + assert request.headers["X-Debug"] == "async" + return httpx.Response( + 200, + json=build_file_info_payload( + output_id, + "async-linearized-custom.pdf", + "application/pdf", + ), + ) + msg = f"Unexpected request {request.method} {request.url}" + raise AssertionError(msg) + + transport = httpx.MockTransport(handler) + async with AsyncPdfRestClient(api_key=ASYNC_API_KEY, transport=transport) as client: + response 
= await client.linearize_pdf( + input_file, + extra_query={"trace": "async"}, + extra_headers={"X-Debug": "async"}, + extra_body={"flags": ["a", "b"]}, + timeout=0.83, + ) + + assert isinstance(response, PdfRestFileBasedResponse) + assert response.output_file.name == "async-linearized-custom.pdf" + timeout_value = captured_timeout["value"] + assert timeout_value is not None + if isinstance(timeout_value, dict): + assert all( + component == pytest.approx(0.83) for component in timeout_value.values() + ) + else: + assert timeout_value == pytest.approx(0.83) + + +@pytest.mark.parametrize( + ("files", "match"), + [ + pytest.param( + "png", + "Must be a PDF file", + id="non-pdf-file", + ), + pytest.param( + "multiple", + "List should have at most 1 item after validation", + id="multiple-files", + ), + ], +) +def test_linearize_pdf_validation( + monkeypatch: pytest.MonkeyPatch, + files: str, + match: str, +) -> None: + monkeypatch.delenv("PDFREST_API_KEY", raising=False) + pdf_file = make_pdf_file(PdfRestFileID.generate(1)) + png_file = PdfRestFile.model_validate( + build_file_info_payload( + PdfRestFileID.generate(), + "example.png", + "image/png", + ) + ) + transport = httpx.MockTransport(lambda request: (_ for _ in ()).throw(RuntimeError)) + files_argument = ( + png_file + if files == "png" + else [pdf_file, make_pdf_file(PdfRestFileID.generate())] + ) + + with ( + PdfRestClient(api_key=VALID_API_KEY, transport=transport) as client, + pytest.raises(ValidationError, match=match), + ): + client.linearize_pdf(files_argument) diff --git a/tests/test_ocr_pdf.py b/tests/test_ocr_pdf.py new file mode 100644 index 00000000..625f92f4 --- /dev/null +++ b/tests/test_ocr_pdf.py @@ -0,0 +1,217 @@ +from __future__ import annotations + +import json + +import httpx +import pytest +from pydantic import ValidationError + +from pdfrest import AsyncPdfRestClient, PdfRestClient +from pdfrest.models import PdfRestFile, PdfRestFileBasedResponse, PdfRestFileID +from pdfrest.models._internal 
import OcrPdfPayload + +from .graphics_test_helpers import ASYNC_API_KEY, VALID_API_KEY, make_pdf_file + + +def test_ocr_payload_rejects_non_pdf() -> None: + file_id = str(PdfRestFileID.generate()) + text_file = PdfRestFile.model_validate( + { + "id": file_id, + "name": "notes.txt", + "url": f"https://api.pdfrest.com/resource/{file_id}", + "type": "text/plain", + "size": 64, + "modified": "2024-01-01T00:00:00Z", + "scheduledDeletionTimeUtc": None, + } + ) + with pytest.raises(ValidationError, match="Must be a PDF file"): + OcrPdfPayload.model_validate({"files": [text_file]}) + + +def test_ocr_payload_invalid_page_range() -> None: + file_repr = make_pdf_file(PdfRestFileID.generate(1)) + with pytest.raises( + ValidationError, match="The start page must be less than or equal to the end" + ): + OcrPdfPayload.model_validate({"files": [file_repr], "pages": ["5-2"]}) + + +def test_ocr_payload_languages() -> None: + file_repr = make_pdf_file(PdfRestFileID.generate(1)) + payload = OcrPdfPayload.model_validate( + {"files": [file_repr], "languages": ["English", "German"]} + ) + assert payload.languages == ["English", "German"] + assert ( + payload.model_dump( + mode="json", by_alias=True, exclude_none=True, exclude_unset=True + )["languages"] + == "English,German" + ) + + +def test_ocr_payload_invalid_language() -> None: + file_repr = make_pdf_file(PdfRestFileID.generate(1)) + with pytest.raises(ValidationError, match="ChineseSimplified"): + OcrPdfPayload.model_validate({"files": [file_repr], "languages": ["Klingon"]}) + + +def test_ocr_pdf_success(monkeypatch: pytest.MonkeyPatch) -> None: + monkeypatch.delenv("PDFREST_API_KEY", raising=False) + input_file = make_pdf_file(PdfRestFileID.generate(1)) + payload_dump = OcrPdfPayload.model_validate( + { + "files": [input_file], + "pages": ["1-3"], + "output": "ocr", + "languages": ["English"], + } + ).model_dump(mode="json", by_alias=True, exclude_none=True, exclude_unset=True) + output_id = str(PdfRestFileID.generate()) + + seen: 
dict[str, int] = {"post": 0, "get": 0} + + def handler(request: httpx.Request) -> httpx.Response: + if request.method == "POST" and request.url.path == "/pdf-with-ocr-text": + seen["post"] += 1 + payload = json.loads(request.content.decode("utf-8")) + assert payload == payload_dump + return httpx.Response( + 200, + json={ + "inputId": str(input_file.id), + "outputId": output_id, + }, + ) + if request.method == "GET" and request.url.path == f"/resource/{output_id}": + seen["get"] += 1 + return httpx.Response( + 200, + json=make_pdf_file(output_id, "ocr.pdf").model_dump( + mode="json", by_alias=True + ), + ) + msg = f"Unexpected request {request.method} {request.url}" + raise AssertionError(msg) + + transport = httpx.MockTransport(handler) + with PdfRestClient(api_key=VALID_API_KEY, transport=transport) as client: + response = client.ocr_pdf( + input_file, + pages=["1-3"], + output="ocr", + ) + + assert seen == {"post": 1, "get": 1} + assert isinstance(response, PdfRestFileBasedResponse) + assert response.output_file.id == output_id + assert response.output_file.name == "ocr.pdf" + assert response.input_id == input_file.id + + +def test_ocr_pdf_request_customization( + monkeypatch: pytest.MonkeyPatch, +) -> None: + monkeypatch.delenv("PDFREST_API_KEY", raising=False) + input_file = make_pdf_file(PdfRestFileID.generate(1)) + payload_dump = OcrPdfPayload.model_validate( + {"files": [input_file], "languages": ["English"]} + ).model_dump(mode="json", by_alias=True, exclude_none=True, exclude_unset=True) + output_id = str(PdfRestFileID.generate()) + captured_timeout: dict[str, float | dict[str, float] | None] = {} + + def handler(request: httpx.Request) -> httpx.Response: + if request.method == "POST" and request.url.path == "/pdf-with-ocr-text": + assert request.url.params["trace"] == "true" + assert request.headers["X-Debug"] == "sync" + captured_timeout["value"] = request.extensions.get("timeout") + payload = json.loads(request.content.decode("utf-8")) + assert payload 
== payload_dump | {"debug": True} + return httpx.Response( + 200, + json={ + "outputId": output_id, + "inputId": str(input_file.id), + }, + ) + if request.method == "GET" and request.url.path == f"/resource/{output_id}": + assert request.url.params["format"] == "info" + assert request.url.params["trace"] == "true" + assert request.headers["X-Debug"] == "sync" + return httpx.Response( + 200, + json=make_pdf_file(output_id, "custom-ocr.pdf").model_dump( + mode="json", by_alias=True + ), + ) + msg = f"Unexpected request {request.method} {request.url}" + raise AssertionError(msg) + + transport = httpx.MockTransport(handler) + with PdfRestClient(api_key=VALID_API_KEY, transport=transport) as client: + response = client.ocr_pdf( + input_file, + extra_query={"trace": "true"}, + extra_headers={"X-Debug": "sync"}, + extra_body={"debug": True}, + timeout=0.4, + ) + + assert isinstance(response, PdfRestFileBasedResponse) + assert response.output_file.id == output_id + timeout_value = captured_timeout["value"] + assert timeout_value is not None + if isinstance(timeout_value, dict): + assert all( + component == pytest.approx(0.4) for component in timeout_value.values() + ) + else: + assert timeout_value == pytest.approx(0.4) + + +@pytest.mark.asyncio +async def test_async_ocr_pdf_success( + monkeypatch: pytest.MonkeyPatch, +) -> None: + monkeypatch.delenv("PDFREST_API_KEY", raising=False) + input_file = make_pdf_file(PdfRestFileID.generate(2)) + payload_dump = OcrPdfPayload.model_validate( + {"files": [input_file], "languages": ["English"]} + ).model_dump(mode="json", by_alias=True, exclude_none=True, exclude_unset=True) + output_id = str(PdfRestFileID.generate()) + + seen: dict[str, int] = {"post": 0, "get": 0} + + def handler(request: httpx.Request) -> httpx.Response: + if request.method == "POST" and request.url.path == "/pdf-with-ocr-text": + seen["post"] += 1 + payload = json.loads(request.content.decode("utf-8")) + assert payload == payload_dump + return httpx.Response( + 
200, + json={ + "outputId": output_id, + "inputId": str(input_file.id), + }, + ) + if request.method == "GET" and request.url.path == f"/resource/{output_id}": + seen["get"] += 1 + return httpx.Response( + 200, + json=make_pdf_file(output_id, "async-ocr.pdf").model_dump( + mode="json", by_alias=True + ), + ) + msg = f"Unexpected request {request.method} {request.url}" + raise AssertionError(msg) + + transport = httpx.MockTransport(handler) + async with AsyncPdfRestClient(api_key=ASYNC_API_KEY, transport=transport) as client: + response = await client.ocr_pdf(input_file) + + assert seen == {"post": 1, "get": 1} + assert isinstance(response, PdfRestFileBasedResponse) + assert response.output_file.id == output_id + assert response.input_id == input_file.id diff --git a/tests/test_rasterize_pdf.py b/tests/test_rasterize_pdf.py new file mode 100644 index 00000000..707ab223 --- /dev/null +++ b/tests/test_rasterize_pdf.py @@ -0,0 +1,265 @@ +from __future__ import annotations + +import json + +import httpx +import pytest +from pydantic import ValidationError + +from pdfrest import AsyncPdfRestClient, PdfRestClient +from pdfrest.models import PdfRestFile, PdfRestFileBasedResponse, PdfRestFileID +from pdfrest.models._internal import PdfRasterizePayload + +from .graphics_test_helpers import ( + ASYNC_API_KEY, + VALID_API_KEY, + build_file_info_payload, + make_pdf_file, +) + + +def test_rasterize_pdf_success(monkeypatch: pytest.MonkeyPatch) -> None: + monkeypatch.delenv("PDFREST_API_KEY", raising=False) + input_file = make_pdf_file(PdfRestFileID.generate(1)) + output_id = str(PdfRestFileID.generate()) + + payload_dump = PdfRasterizePayload.model_validate( + {"files": [input_file], "output": "rasterized"} + ).model_dump(mode="json", by_alias=True, exclude_none=True, exclude_unset=True) + + seen: dict[str, int] = {"post": 0, "get": 0} + + def handler(request: httpx.Request) -> httpx.Response: + if request.method == "POST" and request.url.path == "/rasterized-pdf": + seen["post"] 
+= 1 + payload = json.loads(request.content.decode("utf-8")) + assert payload == payload_dump + return httpx.Response( + 200, + json={ + "inputId": [input_file.id], + "outputId": [output_id], + }, + ) + if request.method == "GET" and request.url.path == f"/resource/{output_id}": + seen["get"] += 1 + assert request.url.params["format"] == "info" + return httpx.Response( + 200, + json=build_file_info_payload( + output_id, + "rasterized.pdf", + "application/pdf", + ), + ) + msg = f"Unexpected request {request.method} {request.url}" + raise AssertionError(msg) + + transport = httpx.MockTransport(handler) + with PdfRestClient(api_key=VALID_API_KEY, transport=transport) as client: + response = client.rasterize_pdf(input_file, output="rasterized") + + assert seen == {"post": 1, "get": 1} + assert isinstance(response, PdfRestFileBasedResponse) + output_file = response.output_file + assert output_file.name == "rasterized.pdf" + assert output_file.type == "application/pdf" + assert response.warning is None + assert str(response.input_id) == str(input_file.id) + + +def test_rasterize_pdf_request_customization(monkeypatch: pytest.MonkeyPatch) -> None: + monkeypatch.delenv("PDFREST_API_KEY", raising=False) + input_file = make_pdf_file(PdfRestFileID.generate(1)) + output_id = str(PdfRestFileID.generate()) + captured_timeout: dict[str, float | dict[str, float] | None] = {} + + def handler(request: httpx.Request) -> httpx.Response: + if request.method == "POST" and request.url.path == "/rasterized-pdf": + assert request.url.params["trace"] == "true" + assert request.headers["X-Debug"] == "sync" + captured_timeout["value"] = request.extensions.get("timeout") + payload = json.loads(request.content.decode("utf-8")) + assert payload["debug"] == "yes" + assert payload["id"] == str(input_file.id) + assert payload["output"] == "custom" + return httpx.Response( + 200, + json={ + "inputId": [input_file.id], + "outputId": [output_id], + }, + ) + if request.method == "GET" and 
request.url.path == f"/resource/{output_id}": + assert request.url.params["format"] == "info" + assert request.url.params["trace"] == "true" + assert request.headers["X-Debug"] == "sync" + return httpx.Response( + 200, + json=build_file_info_payload( + output_id, + "custom.pdf", + "application/pdf", + ), + ) + msg = f"Unexpected request {request.method} {request.url}" + raise AssertionError(msg) + + transport = httpx.MockTransport(handler) + with PdfRestClient(api_key=VALID_API_KEY, transport=transport) as client: + response = client.rasterize_pdf( + input_file, + output="custom", + extra_query={"trace": "true"}, + extra_headers={"X-Debug": "sync"}, + extra_body={"debug": "yes"}, + timeout=0.31, + ) + + assert isinstance(response, PdfRestFileBasedResponse) + assert response.output_file.name == "custom.pdf" + timeout_value = captured_timeout["value"] + assert timeout_value is not None + if isinstance(timeout_value, dict): + assert all( + component == pytest.approx(0.31) for component in timeout_value.values() + ) + else: + assert timeout_value == pytest.approx(0.31) + + +@pytest.mark.asyncio +async def test_async_rasterize_pdf_success(monkeypatch: pytest.MonkeyPatch) -> None: + monkeypatch.delenv("PDFREST_API_KEY", raising=False) + input_file = make_pdf_file(PdfRestFileID.generate(2)) + output_id = str(PdfRestFileID.generate()) + + payload_dump = PdfRasterizePayload.model_validate( + {"files": [input_file]} + ).model_dump(mode="json", by_alias=True, exclude_none=True, exclude_unset=True) + + seen: dict[str, int] = {"post": 0, "get": 0} + + def handler(request: httpx.Request) -> httpx.Response: + if request.method == "POST" and request.url.path == "/rasterized-pdf": + seen["post"] += 1 + payload = json.loads(request.content.decode("utf-8")) + assert payload == payload_dump + return httpx.Response( + 200, + json={ + "inputId": [input_file.id], + "outputId": [output_id], + }, + ) + if request.method == "GET" and request.url.path == f"/resource/{output_id}": + 
seen["get"] += 1 + assert request.url.params["format"] == "info" + return httpx.Response( + 200, + json=build_file_info_payload( + output_id, + "async.pdf", + "application/pdf", + ), + ) + msg = f"Unexpected request {request.method} {request.url}" + raise AssertionError(msg) + + transport = httpx.MockTransport(handler) + async with AsyncPdfRestClient(api_key=ASYNC_API_KEY, transport=transport) as client: + response = await client.rasterize_pdf(input_file) + + assert seen == {"post": 1, "get": 1} + assert isinstance(response, PdfRestFileBasedResponse) + assert response.output_file.name == "async.pdf" + assert response.output_file.type == "application/pdf" + assert str(response.input_id) == str(input_file.id) + + +@pytest.mark.asyncio +async def test_async_rasterize_pdf_request_customization( + monkeypatch: pytest.MonkeyPatch, +) -> None: + monkeypatch.delenv("PDFREST_API_KEY", raising=False) + input_file = make_pdf_file(PdfRestFileID.generate(2)) + output_id = str(PdfRestFileID.generate()) + captured_timeout: dict[str, float | dict[str, float] | None] = {} + + def handler(request: httpx.Request) -> httpx.Response: + if request.method == "POST" and request.url.path == "/rasterized-pdf": + assert request.url.params["trace"] == "async" + assert request.headers["X-Debug"] == "async" + captured_timeout["value"] = request.extensions.get("timeout") + payload = json.loads(request.content.decode("utf-8")) + assert payload["debug"] == "yes" + assert payload["id"] == str(input_file.id) + return httpx.Response( + 200, + json={ + "inputId": [input_file.id], + "outputId": [output_id], + }, + ) + if request.method == "GET" and request.url.path == f"/resource/{output_id}": + assert request.url.params["format"] == "info" + assert request.url.params["trace"] == "async" + assert request.headers["X-Debug"] == "async" + return httpx.Response( + 200, + json=build_file_info_payload( + output_id, + "async-custom.pdf", + "application/pdf", + ), + ) + msg = f"Unexpected request 
{request.method} {request.url}" + raise AssertionError(msg) + + transport = httpx.MockTransport(handler) + async with AsyncPdfRestClient(api_key=ASYNC_API_KEY, transport=transport) as client: + response = await client.rasterize_pdf( + input_file, + extra_query={"trace": "async"}, + extra_headers={"X-Debug": "async"}, + extra_body={"debug": "yes"}, + timeout=0.52, + ) + + assert isinstance(response, PdfRestFileBasedResponse) + assert response.output_file.name == "async-custom.pdf" + timeout_value = captured_timeout["value"] + assert timeout_value is not None + if isinstance(timeout_value, dict): + assert all( + component == pytest.approx(0.52) for component in timeout_value.values() + ) + else: + assert timeout_value == pytest.approx(0.52) + + +def test_rasterize_pdf_validation(monkeypatch: pytest.MonkeyPatch) -> None: + monkeypatch.delenv("PDFREST_API_KEY", raising=False) + pdf_file = make_pdf_file(PdfRestFileID.generate(1)) + png_file = PdfRestFile.model_validate( + build_file_info_payload( + PdfRestFileID.generate(), + "example.png", + "image/png", + ) + ) + transport = httpx.MockTransport(lambda request: (_ for _ in ()).throw(RuntimeError)) + + with ( + PdfRestClient(api_key=VALID_API_KEY, transport=transport) as client, + pytest.raises(ValidationError, match="Must be a PDF file"), + ): + client.rasterize_pdf(png_file) + + with ( + PdfRestClient(api_key=VALID_API_KEY, transport=transport) as client, + pytest.raises( + ValidationError, match="List should have at most 1 item after validation" + ), + ): + client.rasterize_pdf([pdf_file, make_pdf_file(PdfRestFileID.generate())]) diff --git a/tests/test_summarize_pdf_text.py b/tests/test_summarize_pdf_text.py new file mode 100644 index 00000000..4263c488 --- /dev/null +++ b/tests/test_summarize_pdf_text.py @@ -0,0 +1,330 @@ +from __future__ import annotations + +import json + +import httpx +import pytest +from pydantic import ValidationError + +from pdfrest import AsyncPdfRestClient, PdfRestClient +from 
pdfrest.models import ( + PdfRestFile, + PdfRestFileBasedResponse, + PdfRestFileID, + SummarizePdfTextResponse, +) +from pdfrest.models._internal import SummarizePdfTextPayload + +from .graphics_test_helpers import ( + ASYNC_API_KEY, + VALID_API_KEY, + build_file_info_payload, + make_pdf_file, +) + + +def _make_text_file(file_id: str) -> PdfRestFile: + return PdfRestFile.model_validate( + { + "id": file_id, + "name": "notes.txt", + "url": f"https://api.pdfrest.com/resource/{file_id}", + "type": "text/plain", + "size": 64, + "modified": "2024-01-01T00:00:00Z", + "scheduledDeletionTimeUtc": None, + } + ) + + +def test_summarize_payload_rejects_invalid_mime() -> None: + file_id = str(PdfRestFileID.generate()) + image_file = PdfRestFile.model_validate( + { + "id": file_id, + "name": "image.png", + "url": f"https://api.pdfrest.com/resource/{file_id}", + "type": "image/png", + "size": 10, + "modified": "2024-01-01T00:00:00Z", + "scheduledDeletionTimeUtc": None, + } + ) + + with pytest.raises( + ValidationError, match="Must be a PDF, Markdown, or plain text file" + ): + SummarizePdfTextPayload.model_validate({"files": [image_file]}) + + +def test_summarize_payload_invalid_page_range() -> None: + file_repr = make_pdf_file(PdfRestFileID.generate(1)) + + with pytest.raises( + ValidationError, match="The start page must be less than or equal to the end" + ): + SummarizePdfTextPayload.model_validate({"files": [file_repr], "pages": ["5-2"]}) + + +def test_summarize_text_json_success(monkeypatch: pytest.MonkeyPatch) -> None: + monkeypatch.delenv("PDFREST_API_KEY", raising=False) + input_file = _make_text_file(str(PdfRestFileID.generate(1))) + payload_dump = SummarizePdfTextPayload.model_validate( + { + "files": [input_file], + "target_word_count": 120, + "summary_format": "bullet_points", + "pages": ["1-3"], + "output_format": "plaintext", + "output_type": "json", + "output": "summary", + } + ).model_dump(mode="json", by_alias=True, exclude_none=True, exclude_unset=True) + + 
seen: dict[str, int] = {"post": 0} + + def handler(request: httpx.Request) -> httpx.Response: + if request.method == "POST" and request.url.path == "/summarized-pdf-text": + seen["post"] += 1 + payload = json.loads(request.content.decode("utf-8")) + assert payload == payload_dump + return httpx.Response( + 200, + json={ + "summary": "Key points...", + "inputId": str(input_file.id), + }, + ) + msg = f"Unexpected request {request.method} {request.url}" + raise AssertionError(msg) + + transport = httpx.MockTransport(handler) + with PdfRestClient(api_key=VALID_API_KEY, transport=transport) as client: + response = client.summarize_text( + input_file, + target_word_count=120, + summary_format="bullet_points", + pages=["1-3"], + output_format="plaintext", + output="summary", + ) + + assert seen == {"post": 1} + assert isinstance(response, SummarizePdfTextResponse) + assert response.summary == "Key points..." + assert response.input_id == input_file.id + + +def test_summarize_text_to_file_success( + monkeypatch: pytest.MonkeyPatch, +) -> None: + monkeypatch.delenv("PDFREST_API_KEY", raising=False) + input_file = _make_text_file(str(PdfRestFileID.generate(1))) + payload_dump = SummarizePdfTextPayload.model_validate( + { + "files": [input_file], + "target_word_count": 200, + "summary_format": "bullet_points", + "pages": ["2-last"], + "output_format": "plaintext", + "output_type": "file", + "output": "summary", + } + ).model_dump(mode="json", by_alias=True, exclude_none=True, exclude_unset=True) + output_id = str(PdfRestFileID.generate()) + + seen: dict[str, int] = {"post": 0, "get": 0} + + def handler(request: httpx.Request) -> httpx.Response: + if request.method == "POST" and request.url.path == "/summarized-pdf-text": + seen["post"] += 1 + payload = json.loads(request.content.decode("utf-8")) + assert payload == payload_dump + return httpx.Response( + 200, + json={ + "outputId": output_id, + "inputId": str(input_file.id), + }, + ) + if request.method == "GET" and 
request.url.path == f"/resource/{output_id}": + seen["get"] += 1 + return httpx.Response( + 200, + json=build_file_info_payload(output_id, "summary.txt", "text/plain"), + ) + msg = f"Unexpected request {request.method} {request.url}" + raise AssertionError(msg) + + transport = httpx.MockTransport(handler) + with PdfRestClient(api_key=VALID_API_KEY, transport=transport) as client: + response = client.summarize_text_to_file( + input_file, + target_word_count=200, + summary_format="bullet_points", + pages=["2-last"], + output_format="plaintext", + output="summary", + ) + + assert seen == {"post": 1, "get": 1} + assert isinstance(response, PdfRestFileBasedResponse) + assert response.output_file.id == output_id + assert response.output_file.name == "summary.txt" + assert response.input_id == input_file.id + + +def test_summarize_text_to_file_request_customization( + monkeypatch: pytest.MonkeyPatch, +) -> None: + monkeypatch.delenv("PDFREST_API_KEY", raising=False) + input_file = make_pdf_file(PdfRestFileID.generate(1)) + payload_dump = SummarizePdfTextPayload.model_validate( + { + "files": [input_file], + "output_type": "file", + "output_format": "markdown", + "summary_format": "overview", + } + ).model_dump(mode="json", by_alias=True, exclude_none=True, exclude_unset=True) + output_id = str(PdfRestFileID.generate()) + + captured_timeout: dict[str, float | dict[str, float] | None] = {} + seen: dict[str, int] = {"post": 0, "get": 0} + + def handler(request: httpx.Request) -> httpx.Response: + if request.method == "POST" and request.url.path == "/summarized-pdf-text": + seen["post"] += 1 + assert request.url.params["trace"] == "true" + assert request.headers["X-Debug"] == "sync" + captured_timeout["value"] = request.extensions.get("timeout") + payload = json.loads(request.content.decode("utf-8")) + for key, value in payload_dump.items(): + assert payload[key] == value + assert payload["debug"] is True + return httpx.Response( + 200, + json={ + "outputId": output_id, + 
"inputId": str(input_file.id), + }, + ) + if request.method == "GET" and request.url.path == f"/resource/{output_id}": + seen["get"] += 1 + assert request.url.params["format"] == "info" + assert request.url.params["trace"] == "true" + assert request.headers["X-Debug"] == "sync" + return httpx.Response( + 200, + json=build_file_info_payload(output_id, "summary.txt", "text/plain"), + ) + msg = f"Unexpected request {request.method} {request.url}" + raise AssertionError(msg) + + transport = httpx.MockTransport(handler) + with PdfRestClient(api_key=VALID_API_KEY, transport=transport) as client: + response = client.summarize_text_to_file( + input_file, + extra_query={"trace": "true"}, + extra_headers={"X-Debug": "sync"}, + extra_body={"debug": True}, + timeout=0.25, + ) + + assert seen == {"post": 1, "get": 1} + assert isinstance(response, PdfRestFileBasedResponse) + assert response.output_file.id == output_id + assert response.output_file.name == "summary.txt" + timeout_value = captured_timeout["value"] + assert timeout_value is not None + if isinstance(timeout_value, dict): + assert all( + component == pytest.approx(0.25) for component in timeout_value.values() + ) + else: + assert timeout_value == pytest.approx(0.25) + + +@pytest.mark.asyncio +async def test_async_summarize_text_success( + monkeypatch: pytest.MonkeyPatch, +) -> None: + monkeypatch.delenv("PDFREST_API_KEY", raising=False) + input_file = make_pdf_file(PdfRestFileID.generate(2)) + payload_dump = SummarizePdfTextPayload.model_validate( + {"files": [input_file], "output_type": "json"} + ).model_dump(mode="json", by_alias=True, exclude_none=True, exclude_unset=True) + + seen: dict[str, int] = {"post": 0} + + def handler(request: httpx.Request) -> httpx.Response: + if request.method == "POST" and request.url.path == "/summarized-pdf-text": + seen["post"] += 1 + payload = json.loads(request.content.decode("utf-8")) + for key, value in payload_dump.items(): + assert payload[key] == value + return 
httpx.Response( + 200, + json={ + "summary": "Async summary", + "inputId": str(input_file.id), + }, + ) + msg = f"Unexpected request {request.method} {request.url}" + raise AssertionError(msg) + + transport = httpx.MockTransport(handler) + async with AsyncPdfRestClient(api_key=ASYNC_API_KEY, transport=transport) as client: + response = await client.summarize_text(input_file) + + assert seen == {"post": 1} + assert isinstance(response, SummarizePdfTextResponse) + assert response.summary == "Async summary" + assert response.input_id == input_file.id + + +@pytest.mark.asyncio +async def test_async_summarize_text_to_file_success( + monkeypatch: pytest.MonkeyPatch, +) -> None: + monkeypatch.delenv("PDFREST_API_KEY", raising=False) + input_file = make_pdf_file(PdfRestFileID.generate(2)) + payload_dump = SummarizePdfTextPayload.model_validate( + {"files": [input_file], "output_type": "file"} + ).model_dump(mode="json", by_alias=True, exclude_none=True, exclude_unset=True) + output_id = str(PdfRestFileID.generate()) + + seen: dict[str, int] = {"post": 0, "get": 0} + + def handler(request: httpx.Request) -> httpx.Response: + if request.method == "POST" and request.url.path == "/summarized-pdf-text": + seen["post"] += 1 + payload = json.loads(request.content.decode("utf-8")) + for key, value in payload_dump.items(): + assert payload[key] == value + return httpx.Response( + 200, + json={ + "outputId": output_id, + "inputId": str(input_file.id), + }, + ) + if request.method == "GET" and request.url.path == f"/resource/{output_id}": + seen["get"] += 1 + return httpx.Response( + 200, + json=build_file_info_payload( + output_id, "async-summary.txt", "text/plain" + ), + ) + msg = f"Unexpected request {request.method} {request.url}" + raise AssertionError(msg) + + transport = httpx.MockTransport(handler) + async with AsyncPdfRestClient(api_key=ASYNC_API_KEY, transport=transport) as client: + response = await client.summarize_text_to_file(input_file) + + assert seen == {"post": 1, 
"get": 1} + assert isinstance(response, PdfRestFileBasedResponse) + assert response.output_file.id == output_id + assert response.input_id == input_file.id diff --git a/tests/test_translate_pdf_text.py b/tests/test_translate_pdf_text.py new file mode 100644 index 00000000..fc7eadcf --- /dev/null +++ b/tests/test_translate_pdf_text.py @@ -0,0 +1,229 @@ +from __future__ import annotations + +import json + +import httpx +import pytest +from pydantic import ValidationError + +from pdfrest import AsyncPdfRestClient, PdfRestClient +from pdfrest.models import ( + PdfRestFile, + PdfRestFileID, + TranslatePdfTextFileResponse, + TranslatePdfTextResponse, +) +from pdfrest.models._internal import TranslatePdfTextPayload + +from .graphics_test_helpers import ASYNC_API_KEY, VALID_API_KEY, make_pdf_file + + +def _make_markdown_file(file_id: str) -> PdfRestFile: + return PdfRestFile.model_validate( + { + "id": file_id, + "name": "notes.md", + "url": f"https://api.pdfrest.com/resource/{file_id}", + "type": "text/markdown", + "size": 64, + "modified": "2024-01-01T00:00:00Z", + "scheduledDeletionTimeUtc": None, + } + ) + + +def test_translate_payload_rejects_invalid_mime() -> None: + file_id = str(PdfRestFileID.generate()) + image_file = PdfRestFile.model_validate( + { + "id": file_id, + "name": "image.png", + "url": f"https://api.pdfrest.com/resource/{file_id}", + "type": "image/png", + "size": 10, + "modified": "2024-01-01T00:00:00Z", + "scheduledDeletionTimeUtc": None, + } + ) + + with pytest.raises( + ValidationError, match="Must be a PDF, Markdown, or plain text file" + ): + TranslatePdfTextPayload.model_validate( + {"files": [image_file], "output_language": "fr"} + ) + + +def test_translate_payload_requires_target_language() -> None: + file_repr = make_pdf_file(PdfRestFileID.generate(1)) + with pytest.raises(ValidationError): + TranslatePdfTextPayload.model_validate({"files": [file_repr]}) + + +def test_translate_pdf_text_json_success(monkeypatch: pytest.MonkeyPatch) -> None: + 
monkeypatch.delenv("PDFREST_API_KEY", raising=False) + input_file = _make_markdown_file(str(PdfRestFileID.generate(1))) + payload_dump = TranslatePdfTextPayload.model_validate( + { + "files": [input_file], + "output_language": "fr", + "pages": ["1-2"], + "output_format": "plaintext", + "output_type": "json", + "output": "translation", + } + ).model_dump(mode="json", by_alias=True, exclude_none=True, exclude_unset=True) + + seen: dict[str, int] = {"post": 0} + + def handler(request: httpx.Request) -> httpx.Response: + if request.method == "POST" and request.url.path == "/translated-pdf-text": + seen["post"] += 1 + payload = json.loads(request.content.decode("utf-8")) + assert payload == payload_dump + return httpx.Response( + 200, + json={ + "translated_text": "Bonjour", + "inputId": str(input_file.id), + "source_languages": ["en"], + "output_language": "fr", + }, + ) + msg = f"Unexpected request {request.method} {request.url}" + raise AssertionError(msg) + + transport = httpx.MockTransport(handler) + with PdfRestClient(api_key=VALID_API_KEY, transport=transport) as client: + response = client.translate_pdf_text( + input_file, + output_language="fr", + pages=["1-2"], + output_format="plaintext", + output="translation", + ) + + assert seen == {"post": 1} + assert isinstance(response, TranslatePdfTextResponse) + assert response.translated_text == "Bonjour" + assert response.source_languages == ["en"] + assert response.output_language == "fr" + assert response.input_id == input_file.id + + +def test_translate_pdf_text_request_customization( + monkeypatch: pytest.MonkeyPatch, +) -> None: + monkeypatch.delenv("PDFREST_API_KEY", raising=False) + input_file = make_pdf_file(PdfRestFileID.generate(1)) + payload_dump = TranslatePdfTextPayload.model_validate( + { + "files": [input_file], + "output_language": "es", + "output_type": "file", + } + ).model_dump(mode="json", by_alias=True, exclude_none=True, exclude_unset=True) + output_id = str(PdfRestFileID.generate()) + + 
captured_timeout: dict[str, float | dict[str, float] | None] = {} + + def handler(request: httpx.Request) -> httpx.Response: + if request.method == "POST" and request.url.path == "/translated-pdf-text": + assert request.url.params["trace"] == "true" + assert request.headers["X-Debug"] == "sync" + captured_timeout["value"] = request.extensions.get("timeout") + payload = json.loads(request.content.decode("utf-8")) + for key, value in payload_dump.items(): + assert payload[key] == value + assert payload["debug"] is True + return httpx.Response( + 200, + json={ + "outputUrl": f"https://api.pdfrest.com/resource/{output_id}?format=file", + "outputId": output_id, + "inputId": str(input_file.id), + "source_languages": ["en"], + "output_language": "es", + }, + ) + if request.method == "GET" and request.url.path == f"/resource/{output_id}": + assert request.url.params["format"] == "info" + assert request.url.params["trace"] == "true" + assert request.headers["X-Debug"] == "sync" + return httpx.Response( + 200, + json=_make_markdown_file(output_id).model_dump( + mode="json", by_alias=True + ), + ) + msg = f"Unexpected request {request.method} {request.url}" + raise AssertionError(msg) + + transport = httpx.MockTransport(handler) + with PdfRestClient(api_key=VALID_API_KEY, transport=transport) as client: + response = client.translate_pdf_text_to_file( + input_file, + output_language="es", + extra_query={"trace": "true"}, + extra_headers={"X-Debug": "sync"}, + extra_body={"debug": True}, + timeout=0.3, + ) + + assert isinstance(response, TranslatePdfTextFileResponse) + assert response.output_file.id == output_id + assert response.output_language == "es" + assert response.source_languages == ["en"] + timeout_value = captured_timeout["value"] + assert timeout_value is not None + if isinstance(timeout_value, dict): + assert all( + component == pytest.approx(0.3) for component in timeout_value.values() + ) + else: + assert timeout_value == pytest.approx(0.3) + + 
@pytest.mark.asyncio
async def test_async_translate_pdf_text_success(
    monkeypatch: pytest.MonkeyPatch,
) -> None:
    """Check the async ``translate_pdf_text`` call against a mocked JSON reply.

    A mock transport stands in for the service. It accepts only a POST to
    ``/translated-pdf-text``, verifies that every field of the expected
    payload dump appears in the posted body with the same value (extra
    fields in the body are not rejected), and answers with an inline
    translation. The test then asserts that exactly one POST was made and
    that the parsed response exposes the translated text, the source
    languages, the output language and the input id.
    """
    # Remove any ambient key; presumably so that only the explicit ``api_key``
    # argument below is in play -- confirm against the client constructor.
    monkeypatch.delenv("PDFREST_API_KEY", raising=False)
    source_pdf = make_pdf_file(PdfRestFileID.generate(2))
    validated_payload = TranslatePdfTextPayload.model_validate(
        {"files": [source_pdf], "output_language": "de", "output_type": "json"}
    )
    expected_body = validated_payload.model_dump(
        mode="json", by_alias=True, exclude_none=True, exclude_unset=True
    )

    post_counter: dict[str, int] = {"post": 0}

    def fake_endpoint(request: httpx.Request) -> httpx.Response:
        is_translate_post = (
            request.method == "POST" and request.url.path == "/translated-pdf-text"
        )
        if not is_translate_post:
            # Any other traffic means the client did something unexpected.
            msg = f"Unexpected request {request.method} {request.url}"
            raise AssertionError(msg)

        post_counter["post"] += 1
        sent_body = json.loads(request.content.decode("utf-8"))
        # Subset comparison: each expected field must match the posted body.
        for field_name, expected_value in expected_body.items():
            assert sent_body[field_name] == expected_value
        reply_json = {
            "translated_text": "Hallo",
            "inputId": str(source_pdf.id),
            "source_languages": ["en"],
            "output_language": "de",
        }
        return httpx.Response(200, json=reply_json)

    mock_transport = httpx.MockTransport(fake_endpoint)
    async with AsyncPdfRestClient(
        api_key=ASYNC_API_KEY, transport=mock_transport
    ) as client:
        response = await client.translate_pdf_text(source_pdf, output_language="de")

    assert post_counter == {"post": 1}
    assert isinstance(response, TranslatePdfTextResponse)
    assert response.translated_text == "Hallo"
    assert response.source_languages == ["en"]
    assert response.output_language == "de"
    assert response.input_id == source_pdf.id